## Importing libraries 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

## Reading the Dataset

In [2]:
data = pd.read_csv('diabetes.csv')
df = data.copy()

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.tail() # as you can see in column Insulin some values are 0. So, we need to feel them with that columns means

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
752,3,108,62,24,0,26.0,0.223,25,0
753,0,181,88,44,510,43.3,0.222,26,1
754,8,154,78,32,0,32.4,0.443,45,1
755,1,128,88,39,110,36.5,1.057,37,1
756,7,137,90,41,0,32.0,0.391,39,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 757 entries, 0 to 756
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               757 non-null    int64  
 1   Glucose                   757 non-null    int64  
 2   BloodPressure             757 non-null    int64  
 3   SkinThickness             757 non-null    int64  
 4   Insulin                   757 non-null    int64  
 5   BMI                       757 non-null    float64
 6   DiabetesPedigreeFunction  757 non-null    float64
 7   Age                       757 non-null    int64  
 8   Outcome                   757 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 53.4 KB


In [6]:
df.shape

(757, 9)

Checking for null values

In [7]:
df.isnull().sum() 

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Checkin for how many 0 values Insulin column has

In [8]:
df['Insulin'].value_counts() 

0      366
105     11
130      9
140      9
120      8
      ... 
73       1
171      1
255      1
52       1
510      1
Name: Insulin, Length: 184, dtype: int64

Replacing 0 values the colums mean

In [9]:
in_mean = round(df['Insulin'].mean(),1) # getting mean value of insulin
df.Insulin = df.Insulin.replace({0:in_mean})  # replacing zero values to column mean 
df['Insulin'].value_counts() 

80.6     366
105.0     11
130.0      9
140.0      9
120.0      8
        ... 
73.0       1
171.0      1
255.0      1
52.0       1
510.0      1
Name: Insulin, Length: 184, dtype: int64

Changing data type of the insulin to as before to integer 

In [29]:
df['Insulin'] = df['Insulin'].astype('int64')
df['Insulin']

0       80
1       80
2       80
3       94
4      168
      ... 
752     80
753    510
754     80
755    110
756     80
Name: Insulin, Length: 757, dtype: int64

Dropping ALL duplicate values in case 

In [14]:
df.drop_duplicates(keep = False, inplace = True)

Balancing the target data 
Tried train the model without balancing the target but the results were not satisfying 

In [17]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler 
from collections import Counter 


y = df['Outcome']
X = df.drop(['Outcome'],axis=1)

# instantiating the random over sampLer 
ros = RandomOverSampler()
 # resampLing X, y x_ros, y_
X_ros, y_ros = ros.fit_resample(X, y) 
# new class distribution
print(Counter(y_ros))

Counter({1: 493, 0: 493})


Training dataset to different Classification Models

Splitting data to train and test 

In [18]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_ros,y_ros, test_size = 0.20, random_state = 101)

# shapes must be the same 
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(788, 8) (198, 8)
(788,) (198,)


KNeighborsClassifier

In [19]:
# Scaling the data and target for KNN to get better result. Using MinMax scaling method for this.

from sklearn.preprocessing import MinMaxScaler

scaling = MinMaxScaler()
X_train_minmax = scaling.fit_transform(X_train)
X_test_minmax = scaling.fit_transform(X_test)

In [20]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix 
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_minmax, y_train)
knn_pred = knn.predict(X_test_minmax)

# Evaluating Model
print(accuracy_score(y_test, knn_pred))
print(confusion_matrix(y_test, knn_pred))
print(classification_report(y_test, knn_pred))


0.696969696969697
[[67 34]
 [26 71]]
              precision    recall  f1-score   support

           0       0.72      0.66      0.69       101
           1       0.68      0.73      0.70        97

    accuracy                           0.70       198
   macro avg       0.70      0.70      0.70       198
weighted avg       0.70      0.70      0.70       198



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


DecisionTreeClassifier

In [21]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
Dtree_pred = dt.predict(X_test)




print(accuracy_score(y_test, Dtree_pred))
print(confusion_matrix(y_test, Dtree_pred))
print(classification_report(y_test, Dtree_pred))


0.8333333333333334
[[85 16]
 [17 80]]
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       101
           1       0.83      0.82      0.83        97

    accuracy                           0.83       198
   macro avg       0.83      0.83      0.83       198
weighted avg       0.83      0.83      0.83       198



Support Vector Machine

In [22]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

model = SVC()
model.fit(X_train,y_train)
svm_pred = model.predict(X_test)

# Evaluating Model
print(accuracy_score(y_test, svm_pred).round(3))
print(confusion_matrix(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

0.717
[[72 29]
 [27 70]]
              precision    recall  f1-score   support

           0       0.73      0.71      0.72       101
           1       0.71      0.72      0.71        97

    accuracy                           0.72       198
   macro avg       0.72      0.72      0.72       198
weighted avg       0.72      0.72      0.72       198



Random forest 

In [23]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=14, random_state=101)
clf.fit(X_train, y_train)
rf = clf.predict(X_test)

# Evaluating Model
print(accuracy_score(y_test, rf))
print(confusion_matrix(y_test, rf))
print(classification_report(y_test, rf))

0.8434343434343434
[[84 17]
 [14 83]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.84       101
           1       0.83      0.86      0.84        97

    accuracy                           0.84       198
   macro avg       0.84      0.84      0.84       198
weighted avg       0.84      0.84      0.84       198



In [24]:

from sklearn.model_selection import cross_val_score

score = cross_val_score(clf, X_train, y_train, cv=100)
score.mean()


0.8585714285714287

For the test data 


 Reading dataset

In [25]:
new_data = pd.read_csv('test.csv')
df2 = new_data.copy()

In [26]:
df2.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,123,72,0,0,36.3,0.258,52,1
1,1,106,76,0,0,37.5,0.197,26,0
2,6,190,92,0,0,35.5,0.278,66,1
3,2,88,58,26,16,28.4,0.766,22,0
4,9,170,74,31,0,44.0,0.403,43,1


In [27]:
df2.shape

(11, 9)

Data preparation

In [28]:
df2['Insulin'].value_counts() # 8 rows have 0 values so these need to be replaced with its meand/median

0      8
16     1
180    1
112    1
Name: Insulin, dtype: int64

In [30]:

df2['Insulin'] = df2['Insulin'].replace({0:80})  # replacing zero values to column mean 
df2['Insulin'].value_counts() 

80     8
16     1
180    1
112    1
Name: Insulin, dtype: int64

Splitting dataset to target and test

In [31]:
new_y = df2['Outcome']
new_X = df2.drop(["Outcome"], axis=1)

 Random Forest

In [32]:
new_rf = clf.predict(new_X)

# Evaluating Model
print(accuracy_score(new_y, new_rf))
print(confusion_matrix(new_y, new_rf))
print(classification_report(new_y, new_rf))

1.0
[[7 0]
 [0 4]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         4

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

