In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
import pickle

In [2]:
# loading the diabetes dataset
dataset = pd.read_csv('diabetes.csv')

In [3]:
print(dataset.shape)

(768, 9)


In [4]:
dataset.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [5]:
# separating the data and labels
X = dataset.drop(columns = 'Outcome', axis=1)
y = dataset['Outcome']

In [6]:
print(X.shape)
print(y.shape)

(768, 8)
(768,)


In [7]:
# Handle imbalanced data
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [9]:
# Define classifiers
classifiers = RandomForestClassifier()
# Training and evaluating each classifier
model = make_pipeline(StandardScaler(), classifiers)

In [10]:
# Cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f"cross_val_score: {cv_scores}")
mean_cv_accuracy = np.mean(cv_scores)

cross_val_score: [0.83125 0.8625  0.85625 0.85    0.83125]


In [11]:
# Fitting the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

In [12]:
# Calculating evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

In [13]:
# Confusion Matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

In [14]:
# Print the evaluation metrics
print(f"Mean CV Accuracy: {mean_cv_accuracy * 100:.2f}%")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-score: {f1 * 100:.2f}%")
print(f"Sensitivity: {sensitivity * 100:.2f}%")
print(f"Specificity: {specificity * 100:.2f}%")
print(f"AUC: {auc:.4f}")

Mean CV Accuracy: 84.62%
Test Accuracy: 83.50%
Precision: 80.91%
Recall: 88.12%
F1-score: 84.36%
Sensitivity: 88.12%
Specificity: 78.79%
AUC: 0.9134


In [15]:
filename = 'diabetes_model_RandomForestClassifier.sav'
pickle.dump(model, open(filename, 'wb'))

In [16]:
# Non diabetic patient sample data
input_data1 = (5,	121, 72, 23, 112, 26.2, 0.245, 30)
input_data2 = (6,96, 0, 0, 0, 23.7, 0.19, 28)
input_data3 = (0, 102, 78, 40, 90, 34.5, 0.238, 24)

In [17]:
# diabetic patient sample data
input_data4 = (1, 167, 74, 17, 144, 23.4, 0.447, 33)
input_data5 = (11, 136, 84, 35, 130, 28.3, 0.26, 42)
input_data6 = (2, 155, 52, 27, 540, 38.7, 0.24, 25)

In [19]:
sample_data_negative = [input_data1, input_data2, input_data3]
sample_data_positive = [input_data4, input_data5, input_data6]

for sample_data in sample_data_negative:
  # changing the input_data to numpy array
  input_data_as_numpy_array = np.asarray(sample_data)

  # reshape the array as we are predicting for one instance
  input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

  prediction = model.predict(input_data_reshaped)
  print(prediction)

  if (prediction[0] == 0):
    print('The person is not diabetic')
  else:
    print('The person is diabetic')

[0]
The person is not diabetic
[0]
The person is not diabetic
[0]
The person is not diabetic




In [21]:
# loading the saved model
loaded_model = pickle.load(open('diabetes_model_RandomForestClassifier.sav', 'rb'))

In [23]:
sample_data_negative = [input_data1, input_data2, input_data3]
sample_data_positive = [input_data4, input_data5, input_data6]

for sample_data in sample_data_negative:
  # changing the input_data to numpy array
  input_data_as_numpy_array = np.asarray(sample_data)

  # reshape the array as we are predicting for one instance
  input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

  prediction = loaded_model.predict(input_data_reshaped)
  print(prediction)

  if (prediction[0] == 0):
    print('The person is not diabetic')

  else:
    print('The person is diabetic')

[0]
The person is not diabetic
[0]
The person is not diabetic
[0]
The person is not diabetic


