In [None]:
# 1. Load the dataset into a DataFrame
import pandas as pd

# Path exactly as used by this notebook (looks like a Colab "/content"
# upload -- adjust it when running elsewhere).
csv_path = '/content/diabetes.csv'
Data = pd.read_csv(csv_path)

In [None]:
# 2. Remove incomplete records.
# A row is kept only if every column listed below is present. Missing values
# are dropped rather than imputed (per the original note: the data is
# sensitive, so it is not filled with the mean or mode).
required_columns = [
    'Age', 'Gender', 'BMI', 'SBP', 'DBP', 'FPG', 'Chol', 'Tri', 'HDL', 'LDL',
    'ALT', 'BUN', 'CCR', 'FFPG', 'Smoking', 'Drinking', 'Family_histroy',
    'Diabetes',
]
# dropna's default how='any' discards a row as soon as one listed column is
# missing, which is the same set of rows a column-by-column pass removes.
Data.dropna(subset=required_columns, inplace=True)

In [None]:
# 3. Convert the necessary columns to numeric dtypes
numeric_columns = [
    'Age', 'Gender', 'BMI', 'SBP', 'DBP', 'FPG', 'Chol', 'Tri', 'HDL', 'LDL',
    'ALT', 'BUN', 'CCR', 'FFPG', 'Smoking', 'Drinking', 'Family_histroy',
    'Diabetes',
]
# errors='coerce' turns every entry that cannot be parsed as a number into NaN
# instead of raising.
Data[numeric_columns] = Data[numeric_columns].apply(pd.to_numeric, errors='coerce')
# Coercion can therefore create NaNs *after* the missing-value cleanup has
# already run. Drop those rows here so no NaN reaches the statistics, the
# plots or the classifiers further down.
Data.dropna(subset=numeric_columns, inplace=True)


In [None]:
# 4. Remove exact duplicate records, keeping the first occurrence of each
Data.drop_duplicates(keep='first', inplace=True)


In [None]:
# 5. Dataset overview: Data.info() prints the index range, each column's
# non-null count and dtype, and the frame's memory usage.
Data.info()


In [None]:
# 6. Descriptive statistics. As the last expression of the cell, the
# describe() summary table is what the notebook renders as the cell output.
Data.describe()

In [None]:
# 7. Class balance: number of rows for each value of the 'Diabetes' target
class_counts = Data.groupby('Diabetes').size()
print(class_counts)

In [None]:
# 8. Visualise the number of samples in each class with a count plot
import seaborn as sns
import matplotlib.pyplot as plt

# countplot returns the Axes it drew on, so the title is set on that Axes
# directly rather than through pyplot's "current axes" state.
count_ax = sns.countplot(data=Data, x='Diabetes')
count_ax.set_title('Number of samples for each class')
plt.show()

In [None]:
# 9. Pairwise correlation matrix of the columns, drawn as an annotated heatmap
from matplotlib import pyplot

corr_matrix = Data.corr()

# Large canvas so the per-cell annotations stay legible.
fig, ax = pyplot.subplots(figsize=(30, 20))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Matrix')
pyplot.show()

In [None]:
# 10. Feature matrix: the predictor columns (everything except the target)
feature_columns = [
    'Age', 'Gender', 'BMI', 'SBP', 'DBP', 'FPG', 'Chol', 'Tri', 'HDL', 'LDL',
    'ALT', 'BUN', 'CCR', 'FFPG', 'Smoking', 'Drinking', 'Family_histroy',
]
X = Data[feature_columns]
print("----------------- Feature Matrix -----------------")
print(X.head())

In [None]:
# 11. Target: the 'Diabetes' column the classifiers are trained to predict
target_column = 'Diabetes'
y = Data[target_column]
print("----------------- Target Matrix -----------------")
print(y.head())

In [None]:
# 12. Hold-out split: 28% of the rows are reserved for testing, and
# random_state pins the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.28,
    random_state=16,
)

# Report the size of each partition.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: ", split.shape)

In [None]:
# 13. Define the classifiers (library defaults unless noted)
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
model_svm = svm.SVC()
model_nb = GaussianNB()
model_knn = KNeighborsClassifier()
model_lr = LogisticRegression()
# Tree construction is randomized unless random_state is fixed. Seeding it
# (same seed as the train/test split) makes every fit of this model build the
# same tree, so the reported numbers are reproducible across runs and refits.
model_dt = DecisionTreeClassifier(random_state=16)


In [None]:
# 14. Fit every classifier on the training split
for clf in (model_svm, model_nb, model_knn, model_lr):
    clf.fit(X_train, y_train)
# The decision tree is fitted last as a bare expression, so it remains the
# cell's final expression and the notebook still displays its repr.
model_dt.fit(X_train, y_train)

In [None]:
# 15. Test the classifiers: predict on the held-out split and record accuracy
from sklearn import metrics

# Display name -> fitted estimator. Insertion order fixes the order of the
# `score` dict and of the tuple unpacking below.
fitted_models = {
    "SVM": model_svm,
    "Naive Bayes": model_nb,
    "KNN": model_knn,
    "Logistic Regression": model_lr,
    "Decision Tree": model_dt,
}

test_predictions = {
    label: clf.predict(X_test) for label, clf in fitted_models.items()
}

# Builtin round() is used instead of the result's .round() method: it works
# whether accuracy_score hands back a NumPy scalar or a plain Python float
# (a plain float has no .round() attribute and would raise AttributeError).
score = {
    label: round(metrics.accuracy_score(y_test, predicted), 5)
    for label, predicted in test_predictions.items()
}

# Keep the individual names that later cells read.
y_pred_svm, y_pred_nb, y_pred_knn, y_pred_lr, y_pred_dt = test_predictions.values()
score_svm, score_nb, score_knn, score_lr, score_dt = score.values()

In [None]:
# 16. Classifier accuracy on the test split.
# NOTE(review): this cell recomputes the same predictions and scores as
# step 15; it is kept so every name it assigns stays defined here.
# Builtin round() replaces the result's .round() method so the rounding works
# for both NumPy scalars and plain Python floats (which have no .round()).
y_pred_svm = model_svm.predict(X_test)
score_svm = round(metrics.accuracy_score(y_test, y_pred_svm), 5)
score["SVM"] = score_svm

y_pred_nb = model_nb.predict(X_test)
score_nb = round(metrics.accuracy_score(y_test, y_pred_nb), 5)
score["Naive Bayes"] = score_nb

y_pred_knn = model_knn.predict(X_test)
score_knn = round(metrics.accuracy_score(y_test, y_pred_knn), 5)
score["KNN"] = score_knn

y_pred_lr = model_lr.predict(X_test)
score_lr = round(metrics.accuracy_score(y_test, y_pred_lr), 5)
score["Logistic Regression"] = score_lr

y_pred_dt = model_dt.predict(X_test)
score_dt = round(metrics.accuracy_score(y_test, y_pred_dt), 5)
score["Decision Tree"] = score_dt

In [None]:
# 17. Final accuracy results, one row per model, expressed as percentages
model_names = ['SVM', 'Naive Bayes', 'KNN', 'Logistic Regression', 'Decision Tree']
accuracy_percent = [
    fraction * 100
    for fraction in (score_svm, score_nb, score_knn, score_lr, score_dt)
]
data = {
    "Models": model_names,
    "Accuracy(%)": accuracy_percent,
}
score_df = pd.DataFrame(data)
print(score_df)

In [None]:
# 18. Best-accuracy algorithm
# The running best is tracked directly in Bkey/Bvalue. No variable named
# `max` is created, so the builtin max() is not shadowed for the rest of the
# notebook session.
Bkey = ''
Bvalue = 0
for key, value in score.items():
    # Strict '>' keeps the first model encountered when scores tie.
    if value > Bvalue:
        Bkey = key
        Bvalue = value
print("Best Accuracy Algorithm For Model is",Bkey," =",Bvalue*100," %")


In [None]:
# 19. Confusion matrix for SVM
from sklearn.metrics import confusion_matrix

# The classifier was already fitted in step 14; predicting with that fit
# avoids a second, redundant training pass on the same data.
predictions_svm = model_svm.predict(X_test)

conf_matrix = confusion_matrix(y_test, predictions_svm)
# For a two-class target the 2x2 matrix flattens row by row to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Negatives (TN): {tn}")
print("Total Instance: ", tp+tn+fp+fn)


In [None]:
# 20. Confusion matrix for Naive Bayes
# Uses the model as fitted in step 14 -- no redundant refit on the same data.
predictions_nb = model_nb.predict(X_test)

conf_matrix = confusion_matrix(y_test, predictions_nb)
# For a two-class target the 2x2 matrix flattens row by row to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Negatives (TN): {tn}")
print("Total Instance: ", tp+tn+fp+fn)

In [None]:
# 21. Confusion matrix for KNN
# Uses the model as fitted in step 14 -- no redundant refit on the same data.
predictions_knn = model_knn.predict(X_test)

conf_matrix = confusion_matrix(y_test, predictions_knn)
# For a two-class target the 2x2 matrix flattens row by row to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Negatives (TN): {tn}")
print("Total Instance: ", tp+tn+fp+fn)

In [None]:
# 22. Confusion matrix for Decision Tree
# Deliberately NOT refitted here: tree construction is randomized unless
# random_state is fixed, so a refit could build a different tree and produce
# a matrix that disagrees with the accuracy/precision/recall computed from
# the step-14 fit. Predicting with the already-fitted model keeps them aligned.
predictions_dt = model_dt.predict(X_test)

conf_matrix = confusion_matrix(y_test, predictions_dt)
# For a two-class target the 2x2 matrix flattens row by row to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Negatives (TN): {tn}")
print("Total Instance: ", tp+tn+fp+fn)

In [None]:
# 23. Confusion matrix for Logistic Regression
# Uses the model as fitted in step 14 -- no redundant refit on the same data.
predictions_lr = model_lr.predict(X_test)

conf_matrix = confusion_matrix(y_test, predictions_lr)
# For a two-class target the 2x2 matrix flattens row by row to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Negatives (TN): {tn}")
print("Total Instance: ", tp+tn+fp+fn)

In [None]:
# 24. Accuracy, precision, recall and F-measure for every classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt

# Applied in this order to each model's test-set predictions.
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

accuracy_svm, precision_svm, recall_svm, f_measure_svm = [
    metric(y_test, y_pred_svm) for metric in metric_functions
]
accuracy_nb, precision_nb, recall_nb, f_measure_nb = [
    metric(y_test, y_pred_nb) for metric in metric_functions
]
accuracy_knn, precision_knn, recall_knn, f_measure_knn = [
    metric(y_test, y_pred_knn) for metric in metric_functions
]
accuracy_lr, precision_lr, recall_lr, f_measure_lr = [
    metric(y_test, y_pred_lr) for metric in metric_functions
]
accuracy_dt, precision_dt, recall_dt, f_measure_dt = [
    metric(y_test, y_pred_dt) for metric in metric_functions
]

# One (tag, accuracy, precision, recall, f-measure) row per model, in the
# order the report is printed.
metric_report = (
    ("SVM", accuracy_svm, precision_svm, recall_svm, f_measure_svm),
    ("NB", accuracy_nb, precision_nb, recall_nb, f_measure_nb),
    ("KNN", accuracy_knn, precision_knn, recall_knn, f_measure_knn),
    ("LR", accuracy_lr, precision_lr, recall_lr, f_measure_lr),
    ("DT", accuracy_dt, precision_dt, recall_dt, f_measure_dt),
)
for model_tag, acc_value, prec_value, rec_value, f_value in metric_report:
    print(f"------{model_tag}--------")
    print(f"Accuracy: {acc_value:.4f}")
    print(f"Precision: {prec_value:.4f}")
    print(f"Recall: {rec_value:.4f}")
    print(f"F-Measure: {f_value:.4f}")

In [None]:
# 25. Calculate ROC curves and their AUC
# roc_curve needs a continuous score per sample so it can sweep the decision
# threshold. Feeding it hard 0/1 class predictions yields a single-threshold
# "curve" whose area is not the model's real ROC AUC. Each model therefore
# contributes its decision value or positive-class probability.
#
# SVC() is built without probability=True, so it exposes decision_function
# rather than predict_proba.
svm_scores = model_svm.decision_function(X_test)
fpr1, tpr1, _ = roc_curve(y_test, svm_scores)  # thresholds are not needed
roc_auc1 = auc(fpr1, tpr1)

# predict_proba column 1 is the probability of classes_[1].
# NOTE(review): assumes the positive label is the larger of the two classes
# (e.g. 1 in a 0/1 target), which matches roc_curve's default pos_label.
fpr2, tpr2, _ = roc_curve(y_test, model_nb.predict_proba(X_test)[:, 1])
roc_auc2 = auc(fpr2, tpr2)
fpr3, tpr3, _ = roc_curve(y_test, model_knn.predict_proba(X_test)[:, 1])
roc_auc3 = auc(fpr3, tpr3)
fpr4, tpr4, _ = roc_curve(y_test, model_dt.predict_proba(X_test)[:, 1])
roc_auc4 = auc(fpr4, tpr4)
fpr5, tpr5, _ = roc_curve(y_test, model_lr.predict_proba(X_test)[:, 1])
roc_auc5 = auc(fpr5, tpr5)

In [None]:
# 26. ROC curve for the SVM classifier
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr1, tpr1, color='darkorange', lw=2,
            label='ROC curve (area = %0.2f)' % roc_auc1)
# Dashed diagonal: the reference line of a random classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# 27. ROC curve for the Naive Bayes classifier
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr2, tpr2, color='darkorange', lw=2,
            label='ROC curve (area = %0.2f)' % roc_auc2)
# Dashed diagonal: the reference line of a random classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# 28. ROC curve for the KNN classifier
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr3, tpr3, color='darkorange', lw=2,
            label='ROC curve (area = %0.2f)' % roc_auc3)
# Dashed diagonal: the reference line of a random classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# 29. ROC curve for the Decision Tree classifier
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr4, tpr4, color='darkorange', lw=2,
            label='ROC curve (area = %0.2f)' % roc_auc4)
# Dashed diagonal: the reference line of a random classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# 30. ROC curve for the Logistic Regression classifier
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr5, tpr5, color='darkorange', lw=2,
            label='ROC curve (area = %0.2f)' % roc_auc5)
# Dashed diagonal: the reference line of a random classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# 31. Bar charts comparing the classifiers on each metric.
# The values are the metrics computed in step 24, not numbers pasted in from
# an earlier run, so the charts always reflect the current data, split and
# models. Every list follows the order of 'Algorithm'.
data = {'Algorithm': ['NB', 'SVM', 'DT', 'KNN', 'LR'],
        'Precision': [precision_nb, precision_svm, precision_dt, precision_knn, precision_lr],
        'Recall': [recall_nb, recall_svm, recall_dt, recall_knn, recall_lr],
        'F-Measure': [f_measure_nb, f_measure_svm, f_measure_dt, f_measure_knn, f_measure_lr],
        'Accuracy%': [accuracy_nb, accuracy_svm, accuracy_dt, accuracy_knn, accuracy_lr]}

# One chart per metric; the metric name doubles as the chart title.
for chart_index, metric_name in enumerate(['Precision', 'Recall', 'F-Measure', 'Accuracy%']):
    if chart_index:
        print()  # blank output line between consecutive charts
    plt.bar(data['Algorithm'], data[metric_name])
    plt.xlabel('Classifiers')
    plt.ylabel('Range')
    plt.title(metric_name)
    plt.show()