# Train diabetes classification model

This notebook reads a CSV file and trains a model to predict diabetes in patients. The data is already preprocessed and requires no feature engineering.

The evaluation methods were used during experimentation to decide whether the model was accurate enough. Going forward, the preference is to use MLflow's autologging feature, which makes the model easier to deploy later on.

## Read data from local file



In [1]:
import pandas as pd

# Load the preprocessed dataset; the path is relative to the working directory.
print("Reading data...")
data_file = 'diabetes.csv'
df = pd.read_csv(data_file)

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Reading data...


Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0


## Split data

In [2]:
print("Splitting data...")

# Columns used as model inputs; 'Diabetic' is the label to predict.
feature_columns = [
    'Pregnancies',
    'PlasmaGlucose',
    'DiastolicBloodPressure',
    'TricepsThickness',
    'SerumInsulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
]
X = df[feature_columns].values
y = df['Diabetic'].values

Splitting data...


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

## Train model

In [None]:
from sklearn.linear_model import LogisticRegression

print("Training model...")

# C is the inverse of the regularization strength, so a rate of 0.1 gives C = 10.
reg_rate = 0.1
model = LogisticRegression(
    C=1 / reg_rate,
    solver="liblinear",
).fit(X_train, y_train)

## Evaluate model

In [None]:
import numpy as np

# Predict labels for the held-out rows and measure the share that match.
y_hat = model.predict(X_test)
is_correct = y_hat == y_test
acc = np.average(is_correct)

print('Accuracy:', acc)

In [None]:
from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; column 1 is scored against y_test.
y_scores = model.predict_proba(X_test)
positive_scores = y_scores[:, 1]
auc = roc_auc_score(y_test, positive_scores)

print('AUC: ' + str(auc))

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Compute the ROC points from the column-1 class scores.
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])

fig, ax = plt.subplots(figsize=(6, 4))
# Dashed diagonal: the reference line where TPR equals FPR.
ax.plot([0, 1], [0, 1], 'k--')
# Trade-off between FPR and TPR achieved by our model.
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')

In [4]:
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score


def main(args):
    """Run the full pipeline: load the data, split it, train, then evaluate.

    Args:
        args: Object exposing ``training_data``, ``reg_rate`` and ``max_iter``
            attributes (presumably an ``argparse.Namespace``; no parser is
            defined in this notebook, so confirm against the caller).
    """
    # Load the CSV and carve out the train/test partitions.
    diabetes_df = get_data(args.training_data)
    features_train, features_test, labels_train, labels_test = split_data(diabetes_df)

    # Fit the classifier with the requested hyperparameters.
    classifier = train_model(args.reg_rate, args.max_iter, features_train, labels_train)

    # Print the evaluation metrics for the held-out partition.
    eval_model(classifier, features_test, labels_test)


def get_data(path):
    """Load the CSV at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    print("Reading data...")
    return pd.read_csv(path)

def split_data(df):
    """Split the diabetes DataFrame into train and test feature/label arrays.

    Args:
        df: DataFrame containing the eight feature columns listed below and
            the ``Diabetic`` label column.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split`` with 30% of the rows held out for testing.
    """
    print("Splitting data...")
    feature_columns = [
        'Pregnancies',
        'PlasmaGlucose',
        'DiastolicBloodPressure',
        'TricepsThickness',
        'SerumInsulin',
        'BMI',
        'DiabetesPedigree',
        'Age',
    ]
    features = df[feature_columns].values
    labels = df['Diabetic'].values

    # The fixed random_state keeps the partition identical between runs.
    return tuple(train_test_split(features, labels, test_size=0.30, random_state=0))

def train_model(reg_rate, max_iteration, X_train, y_train):
    """Fit a liblinear logistic-regression classifier on the training data.

    Args:
        reg_rate: Regularization rate; its reciprocal is passed as ``C``, so
            it must be non-zero.
        max_iteration: Forwarded to ``LogisticRegression`` as ``max_iter``.
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    print("Training model...")
    classifier = LogisticRegression(
        C=1 / reg_rate,
        solver="liblinear",
        max_iter=max_iteration,
    )
    classifier.fit(X_train, y_train)
    return classifier


def getAccuracyScore(Y_actual, Y_prediction):
    """Return the classification accuracy formatted as a percentage string.

    Args:
        Y_actual: Ground-truth labels (e.g. ``y_test``).
        Y_prediction: Labels predicted by the model.

    Returns:
        str: The accuracy multiplied by 100, rendered with four decimal
        places and a trailing ``%`` (e.g. ``'76.6667%'``).
    """
    accuracy_pct = accuracy_score(Y_actual, Y_prediction) * 100
    return f"{accuracy_pct:.4f}%"

def eval_model(model, X_test, y_test):
    """Print accuracy, AUC, F1, recall and precision for a fitted binary classifier.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Held-out feature matrix.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        None. All metrics are written to standard output.
    """
    def _safe_ratio(numerator, denominator):
        # A metric with a zero denominator is undefined (e.g. precision when
        # the model predicts no positives). Report 0.0 instead of dividing,
        # which would emit a NumPy RuntimeWarning and print "nan".
        return numerator / denominator if denominator else 0.0

    y_hat = model.predict(X_test)
    print('Accuracy:', getAccuracyScore(y_test, y_hat))

    # Column 1 of predict_proba holds the scores for the second class, which
    # is the positive class when the labels are 0/1.
    y_scores = model.predict_proba(X_test)
    auc = roc_auc_score(y_test, y_scores[:, 1])
    print('AUC: ' + str(auc))

    # For binary labels the flattened confusion matrix is ordered tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
    f1 = _safe_ratio(tp, tp + (fn + fp) / 2)
    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)

    print(f"F1 Score: {f1:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Precision: {precision:.4f}")



In [6]:
# Load the dataset through the helper defined above.
df = get_data('diabetes.csv')

# Hold out part of the rows for evaluation.
X_train, X_test, y_train, y_test = split_data(df)

# Fit with a regularization rate of 10 and an iteration cap of 1000.
model = train_model(
    reg_rate=10,
    max_iteration=1000,
    X_train=X_train,
    y_train=y_train,
)

# Print the evaluation metrics for the held-out rows.
eval_model(model, X_test, y_test)

Reading data...
Splitting data...
Training model...
Accuracy: 76.6667%
AUC: 0.8407836210756754
F1 Score: 0.6187
Recall: 0.5624
Precision: 0.6877
