In [1]:
def prepare_data():
    """Download the iris CSV, drop incomplete rows and save it to data/final_df.csv.

    Prints the per-column count of missing values found in the downloaded
    frame, removes every row that contains a missing value, and writes the
    cleaned frame (without the index) to ``data/final_df.csv``. The ``data``
    directory is created if it does not exist yet.

    Returns:
        None. The result is the CSV file written to disk.
    """
    import os
    import pandas as pd

    df = pd.read_csv("https://raw.githubusercontent.com/TripathiAshutosh/dataset/main/iris.csv")
    print(df.isnull().sum())

    # dropna() returns a new frame; without the assignment the rows with
    # missing values would still be written to disk.
    df = df.dropna()

    # to_csv does not create parent directories, so make sure the target exists.
    os.makedirs('data', exist_ok=True)
    df.to_csv('data/final_df.csv', index=False)
    print("\n ---- data csv is saved to PV location /data/final_df.csv ----")

In [2]:
def train_test_split():
    """Split data/final_df.csv into train/test features and labels.

    Reads ``data/final_df.csv``, separates the ``class`` column from the
    remaining columns, splits both with ``test_size=0.3`` and
    ``random_state=47``, writes the four parts to ``data/X_train.csv``,
    ``data/X_test.csv``, ``data/y_train.csv`` and ``data/y_test.csv``
    (index omitted), and finally prints each part.
    """
    import pandas as pd
    # Aliased so the sklearn helper is not confused with this function's own name.
    from sklearn.model_selection import train_test_split as sk_split

    print("---- Inside train_test_split component ----")
    dataset = pd.read_csv('data/final_df.csv')
    label_col = 'class'

    features = dataset.drop(columns=label_col)
    labels = dataset[label_col]

    # sklearn returns the parts in the order X_train, X_test, y_train, y_test.
    part_names = ('X_train', 'X_test', 'y_train', 'y_test')
    parts = dict(zip(part_names, sk_split(features, labels, test_size=0.3, random_state=47)))

    # Persist every part before anything is printed.
    for part_name, part in parts.items():
        part.to_csv(f'data/{part_name}.csv', index=False)

    for part_name, part in parts.items():
        print(f"\n---- {part_name} ----")
        print("\n")
        print(part)
    

In [3]:
def training_basic_classifier():
    """Fit a logistic regression on the saved training split and pickle it.

    Reads ``data/X_train.csv`` and ``data/y_train.csv``, fits a
    ``LogisticRegression(max_iter=500)`` on them and writes the fitted
    estimator to ``data/model.pkl`` with ``pickle``.

    Returns:
        None. The result is the pickled model written to disk.
    """
    import pickle
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    print("---- Inside training_basic_classifier component ----")

    X_train = pd.read_csv('data/X_train.csv')
    # read_csv yields a one-column DataFrame; flatten it to a 1-D array so the
    # estimator receives labels of shape (n_samples,) instead of a column
    # vector, which triggers a DataConversionWarning during fit.
    y_train = pd.read_csv('data/y_train.csv').values.ravel()

    classifier = LogisticRegression(max_iter=500)
    classifier.fit(X_train, y_train)

    with open('data/model.pkl', 'wb') as f:
        pickle.dump(classifier, f)

    print("\n logistic regression classifier is trained on iris data and saved to PV location /data/model.pkl ----")

In [4]:
def predict_on_test_data():
    """Predict classes for the saved test features and store them as .npy.

    Unpickles the model from ``data/model.pkl``, calls its ``predict`` on
    the frame read from ``data/X_test.csv``, saves the result to
    ``data/y_pred.npy`` and prints it.
    """
    import pickle
    import numpy as np
    import pandas as pd

    print("---- Inside predict_on_test_data component ----")

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load model files produced by this pipeline.
    with open('data/model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    test_features = pd.read_csv('data/X_test.csv')
    predictions = model.predict(test_features)
    np.save('data/y_pred.npy', predictions)

    print("\n---- Predicted classes ----")
    print("\n")
    print(predictions)

In [5]:
def predict_prob_on_test_data():
    """Predict class probabilities for the saved test features and store them.

    Unpickles the model from ``data/model.pkl``, calls its ``predict_proba``
    on the frame read from ``data/X_test.csv``, saves the result to
    ``data/y_pred_prob.npy`` and prints it.
    """
    import pickle
    import numpy as np
    import pandas as pd

    print("---- Inside predict_prob_on_test_data component ----")

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load model files produced by this pipeline.
    with open('data/model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    test_features = pd.read_csv('data/X_test.csv')
    probabilities = model.predict_proba(test_features)
    np.save('data/y_pred_prob.npy', probabilities)

    print("\n---- Predicted Probabilities ----")
    print("\n")
    print(probabilities)

In [6]:
def get_metrics():
    """Score the saved predictions against the saved test labels and print them.

    Loads ``data/y_test.csv``, ``data/y_pred.npy`` and
    ``data/y_pred_prob.npy``, computes accuracy, micro-averaged precision
    and recall, and log loss, then prints the classification report, the raw
    predictions, the raw probabilities and the metrics rounded to two
    decimals.
    """
    import numpy as np
    import pandas as pd
    from sklearn import metrics

    print("---- Inside get_metrics component ----")

    actual = pd.read_csv('data/y_test.csv')
    y_pred = np.load('data/y_pred.npy', allow_pickle=True)
    y_pred_prob = np.load('data/y_pred_prob.npy', allow_pickle=True)

    # Insertion order doubles as the order in which the metrics are reported.
    scores = {
        'accuracy': metrics.accuracy_score(actual, y_pred),
        'precision': metrics.precision_score(actual, y_pred, average='micro'),
        'recall': metrics.recall_score(actual, y_pred, average='micro'),
        'entropy': metrics.log_loss(actual, y_pred_prob),
    }

    print(metrics.classification_report(actual, y_pred))
    print(y_pred)
    print(y_pred_prob)
    print("\n Model Metrics:", {label: round(value, 2) for label, value in scores.items()})

In [7]:
# Run the pipeline end to end. Each stage reads the files written by the
# previous ones under data/, so the order below must be preserved.
for pipeline_step in (
    prepare_data,
    train_test_split,
    training_basic_classifier,
    predict_on_test_data,
    predict_prob_on_test_data,
    get_metrics,
):
    pipeline_step()

sepal-length    0
sepal-width     0
petal-length    0
petal-width     0
class           0
dtype: int64

 ---- data csv is saved to PV location /data/final_df.csv ----
---- Inside train_test_split component ----

---- X_train ----


     sepal-length  sepal-width  petal-length  petal-width
22            4.6          3.6           1.0          0.2
125           7.2          3.2           6.0          1.8
30            4.8          3.1           1.6          0.2
103           6.3          2.9           5.6          1.8
146           6.3          2.5           5.0          1.9
..            ...          ...           ...          ...
72            6.3          2.5           4.9          1.5
8             4.4          2.9           1.4          0.2
71            6.1          2.8           4.0          1.3
134           6.1          2.6           5.6          1.4
135           7.7          3.0           6.1          2.3

[105 rows x 4 columns]

---- X_test ----


     sepal-length  sepal-wid

  y = column_or_1d(y, warn=True)
