### Imports

In [1]:
import numpy as np
import pandas as pd 
import os 
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC , LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier , AdaBoostClassifier ,ExtraTreesClassifier 
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import ReLU ,PReLU , LeakyReLU , ELU 
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings("ignore")

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
dataset = pd.read_csv('diabetes-dataset.zip')

Viewing dataset

In [3]:
def about_dataset(df):
    print('*******SHAPE*********')
    print('Shape of the dataset is : ',df.shape)
    print('*******Head*********')
    print(df.head())
    print('*******Tail*********')
    print(df.tail())
    print('*******Columns in the dataframe*********')  
    print(df.columns)
    print('*******About the dataset*********')
    df.info()
    print('*******Stats on the columns of the dataframe*********')
    print(df.describe())

In [4]:
about_dataset(dataset)

*******SHAPE*********
Shape of the dataset is :  (768, 9)
*******Head*********
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
*******Tail*********
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
763           10      101             76             48      180  32.9   
764            2      122             70             27

Split train-test

In [5]:
X = dataset.drop(['Age','Outcome'],axis = 1 )
y = dataset['Outcome']
X_train , X_test , y_train , y_test = train_test_split(X, y, train_size = 0.7, random_state = 42)

Pipeline?

In [6]:
Final_dataset = pd.DataFrame({'model':[],'accuracy':[],'precision':[],'recall':[],'f1':[]})

#Creating a function 
def ML(model) :
        #Creating a Pipeline
        global Final_dataset
        pipe = Pipeline(
        [
        ('scaler',MinMaxScaler()),
        ('classifier',model)
        ]
        )
        # Fit the pipeline to the data 
        pipe.fit(X_train , y_train)
        # Make prediction
        y_pred = pipe.predict(X_test)   
       
        #Evaluate the pipeline on the test data 
        accuracy = accuracy_score(y_test , y_pred)
        precision = precision_score(y_test , y_pred)
        recall = recall_score(y_test , y_pred)
        f1 = f1_score(y_test , y_pred)
        model_name = type(model).__name__    # get the string name of the class
        print(model_name)
        # Appending it in the dataframe for models comparison
        new_row = {'model':model_name,'accuracy':accuracy , 'precision':precision,'recall':recall ,'f1': f1 }
        print(new_row)
        Final_dataset = pd.concat([Final_dataset , pd.DataFrame(new_row, index =[0])])     

In [7]:
Log_reg_model = LogisticRegression( )
SVC_Model = SVC()
GaussianNB_Model = GaussianNB()
RandomForest_Model = RandomForestClassifier()
GradientBoosting_Model = GradientBoostingClassifier()
AdaBoost_Model = AdaBoostClassifier()
ExtraTreesClassifier_Model = ExtraTreesClassifier()
XGBClassifier_Model = XGBClassifier()
LightGBM_Model = lgb.LGBMClassifier()

In [8]:
models = [Log_reg_model,SVC_Model,GaussianNB_Model,RandomForest_Model,GradientBoosting_Model,AdaBoost_Model,ExtraTreesClassifier_Model,XGBClassifier_Model,LightGBM_Model]
for model in models :
    ML(model)

LogisticRegression
{'model': 'LogisticRegression', 'accuracy': 0.7748917748917749, 'precision': 0.71875, 'recall': 0.575, 'f1': 0.6388888888888888}
SVC
{'model': 'SVC', 'accuracy': 0.7619047619047619, 'precision': 0.676056338028169, 'recall': 0.6, 'f1': 0.6357615894039735}
GaussianNB
{'model': 'GaussianNB', 'accuracy': 0.7662337662337663, 'precision': 0.6625, 'recall': 0.6625, 'f1': 0.6625}


RandomForestClassifier
{'model': 'RandomForestClassifier', 'accuracy': 0.7402597402597403, 'precision': 0.6351351351351351, 'recall': 0.5875, 'f1': 0.6103896103896104}
GradientBoostingClassifier
{'model': 'GradientBoostingClassifier', 'accuracy': 0.7316017316017316, 'precision': 0.6071428571428571, 'recall': 0.6375, 'f1': 0.6219512195121951}
AdaBoostClassifier
{'model': 'AdaBoostClassifier', 'accuracy': 0.7402597402597403, 'precision': 0.6282051282051282, 'recall': 0.6125, 'f1': 0.620253164556962}
ExtraTreesClassifier
{'model': 'ExtraTreesClassifier', 'accuracy': 0.7316017316017316, 'precision': 0.618421052631579, 'recall': 0.5875, 'f1': 0.6025641025641025}
XGBClassifier
{'model': 'XGBClassifier', 'accuracy': 0.7489177489177489, 'precision': 0.6341463414634146, 'recall': 0.65, 'f1': 0.6419753086419753}
[LightGBM] [Info] Number of positive: 188, number of negative: 349
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000466 seconds.
You can set `fo

In [9]:
Final_dataset

Unnamed: 0,model,accuracy,precision,recall,f1
0,LogisticRegression,0.774892,0.71875,0.575,0.638889
0,SVC,0.761905,0.676056,0.6,0.635762
0,GaussianNB,0.766234,0.6625,0.6625,0.6625
0,RandomForestClassifier,0.74026,0.635135,0.5875,0.61039
0,GradientBoostingClassifier,0.731602,0.607143,0.6375,0.621951
0,AdaBoostClassifier,0.74026,0.628205,0.6125,0.620253
0,ExtraTreesClassifier,0.731602,0.618421,0.5875,0.602564
0,XGBClassifier,0.748918,0.634146,0.65,0.641975
0,LGBMClassifier,0.714286,0.581395,0.625,0.60241


Create a neural network using Tensor Flow

In [10]:
mmc = MinMaxScaler()
X_train = mmc.fit_transform(X_train)
X_test = mmc.transform(X_test)

In [17]:
# Creating the model

early_stopping = EarlyStopping(monitor = 'accuracy' , patience = 15, restore_best_weights = True )

# Create a model using the Sequential API
model = tf.keras.Sequential([
    tf.keras.layers.Dense(500,activation = 'relu'),
    tf.keras.layers.Dense(500,activation = 'relu'),
    tf.keras.layers.Dense(500,activation = 'relu'),
    tf.keras.layers.Dense(500,activation = 'relu'),
    tf.keras.layers.Dense(100,activation = 'relu'),
    tf.keras.layers.Dense(1,activation = 'sigmoid')]
    )


# Compiling the model
model.compile(loss = 'binary_crossentropy' , optimizer= tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
model.history = model.fit(X_train , y_train , epochs = 500 ,callbacks = [early_stopping])

Epoch 1/500
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.6260 - loss: 0.9555
Epoch 2/500
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6654 - loss: 0.6417
Epoch 3/500
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.6296 - loss: 0.6503
Epoch 4/500
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.7199 - loss: 0.5741
Epoch 5/500
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7121 - loss: 0.5602
Epoch 6/500
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7642 - loss: 0.5123
Epoch 7/500
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7624 - loss: 0.5265
Epoch 8/500
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.6894 - loss: 0.5611
Epoch 9/500
[1m17/17[0m [32m━━━━━━━━━

In [18]:
y_pred = model.predict(X_test)
y_pred = (y_pred >= 0.5).astype(int)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step


In [19]:
accuracy = accuracy_score(y_test , y_pred)
precision = precision_score(y_test , y_pred)
recall = recall_score(y_test , y_pred)
f1 = f1_score(y_test , y_pred)


new_row = {'model':'ANN(5-Hidden)(0.5Cutoff)','accuracy':accuracy , 'precision':precision,'recall':recall ,'f1': f1 }
print(new_row)
Final_dataset = pd.concat([Final_dataset , pd.DataFrame(new_row, index =[0])]) 

{'model': 'ANN(5-Hidden)(0.5Cutoff)', 'accuracy': 0.6883116883116883, 'precision': 0.5317460317460317, 'recall': 0.8375, 'f1': 0.6504854368932039}


In [20]:
y_pred = model.predict(X_test)
y_pred = (y_pred >= 0.7).astype(int)
accuracy = accuracy_score(y_test , y_pred)
precision = precision_score(y_test , y_pred)
recall = recall_score(y_test , y_pred)
f1 = f1_score(y_test , y_pred)


new_row = {'model':'ANN(5-Hidden)(0.7Cutoff)','accuracy':accuracy , 'precision':precision,'recall':recall ,'f1': f1 }
print(new_row)
Final_dataset = pd.concat([Final_dataset , pd.DataFrame(new_row, index =[0])]) 

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
{'model': 'ANN(5-Hidden)(0.7Cutoff)', 'accuracy': 0.7359307359307359, 'precision': 0.6144578313253012, 'recall': 0.6375, 'f1': 0.6257668711656442}


In [21]:
Final_models_summary = Final_dataset

In [22]:
Final_models_summary

Unnamed: 0,model,accuracy,precision,recall,f1
0,LogisticRegression,0.774892,0.71875,0.575,0.638889
0,SVC,0.761905,0.676056,0.6,0.635762
0,GaussianNB,0.766234,0.6625,0.6625,0.6625
0,RandomForestClassifier,0.74026,0.635135,0.5875,0.61039
0,GradientBoostingClassifier,0.731602,0.607143,0.6375,0.621951
0,AdaBoostClassifier,0.74026,0.628205,0.6125,0.620253
0,ExtraTreesClassifier,0.731602,0.618421,0.5875,0.602564
0,XGBClassifier,0.748918,0.634146,0.65,0.641975
0,LGBMClassifier,0.714286,0.581395,0.625,0.60241
0,ANN(5-Hidden)(0.5Cutoff),0.735931,0.646154,0.525,0.57931
