In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
categorical_val = []
continuous_val = []

for column in df.columns:
    if len(df[column].unique()) <= 10:
        categorical_val.append(column)
    else:
        continuous_val.append(column)

In [7]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [8]:
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

for column in feature_columns:
    print(f"{column} ==> Missing Value : {len(df.loc[df[column] == 0])}")

Pregnancies ==> Missing Value : 111
Glucose ==> Missing Value : 5
BloodPressure ==> Missing Value : 35
SkinThickness ==> Missing Value : 227
Insulin ==> Missing Value : 374
BMI ==> Missing Value : 11
DiabetesPedigreeFunction ==> Missing Value : 0
Age ==> Missing Value : 0


In [9]:
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy='mean', copy=False)

df[feature_columns] = fill_values.fit_transform(df[feature_columns])

for column in feature_columns:
    print(f"{column} ==> Missing Value : {len(df.loc[df[column] == 0])}")

Pregnancies ==> Missing Value : 0
Glucose ==> Missing Value : 0
BloodPressure ==> Missing Value : 0
SkinThickness ==> Missing Value : 0
Insulin ==> Missing Value : 0
BMI ==> Missing Value : 0
DiabetesPedigreeFunction ==> Missing Value : 0
Age ==> Missing Value : 0


In [10]:
from sklearn.model_selection import train_test_split

X = df[feature_columns]
y = df.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def evaluate(model, X_test, y_test):
    y_test_pred = model.predict(X_test)
    # y_train_pred = model.predict(X_train)

    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
    print(f"Accuracy Score:\n{accuracy_score(y_test, y_test_pred)}")
    clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict=True))
    print(f"Classification Report:\n{clf_report}")

In [24]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
bagging_clf = BaggingClassifier(estimator=tree, n_estimators=1500)
bagging_clf.fit(X_train, y_train)

evaluate(bagging_clf, X_test, y_test)

Confusion Matrix:
[[118  33]
 [ 25  55]]
Accuracy Score:
0.7489177489177489
Classification Report:
                    0          1  accuracy   macro avg  weighted avg
precision    0.825175   0.625000  0.748918    0.725087      0.755850
recall       0.781457   0.687500  0.748918    0.734478      0.748918
f1-score     0.802721   0.654762  0.748918    0.728741      0.751480
support    151.000000  80.000000  0.748918  231.000000    231.000000


In [25]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=1500)
rf_clf.fit(X_train, y_train)

evaluate(rf_clf, X_test, y_test)

Confusion Matrix:
[[123  28]
 [ 27  53]]
Accuracy Score:
0.7619047619047619
Classification Report:
                    0          1  accuracy   macro avg  weighted avg
precision    0.820000   0.654321  0.761905    0.737160      0.762622
recall       0.814570   0.662500  0.761905    0.738535      0.761905
f1-score     0.817276   0.658385  0.761905    0.737830      0.762249
support    151.000000  80.000000  0.761905  231.000000    231.000000


In [26]:
from sklearn.ensemble import AdaBoostClassifier

boost_clf = AdaBoostClassifier(n_estimators=30)
boost_clf.fit(X_train, y_train)

evaluate(boost_clf, X_test, y_test)

Confusion Matrix:
[[123  28]
 [ 27  53]]
Accuracy Score:
0.7619047619047619
Classification Report:
                    0          1  accuracy   macro avg  weighted avg
precision    0.820000   0.654321  0.761905    0.737160      0.762622
recall       0.814570   0.662500  0.761905    0.738535      0.761905
f1-score     0.817276   0.658385  0.761905    0.737830      0.762249
support    151.000000  80.000000  0.761905  231.000000    231.000000




In [28]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

estimator = []

log_reg = LogisticRegression(solver='liblinear')
estimator.append(('Logistic', log_reg))

svm_clf = SVC(gamma='scale')
estimator.append(('SVM', svm_clf))

voting = VotingClassifier(estimators=estimator)
voting.fit(X_train, y_train)

evaluate(voting, X_test, y_test)

Confusion Matrix:
[[134  17]
 [ 42  38]]
Accuracy Score:
0.7445887445887446
Classification Report:
                    0          1  accuracy   macro avg  weighted avg
precision    0.761364   0.690909  0.744589    0.726136      0.736964
recall       0.887417   0.475000  0.744589    0.681209      0.744589
f1-score     0.819572   0.562963  0.744589    0.691267      0.730703
support    151.000000  80.000000  0.744589  231.000000    231.000000
