In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
import warnings
warnings.simplefilter("ignore")

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn import set_config
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,precision_score,recall_score,roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
import xgboost as xgboost
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from tpot import TPOTClassifier
from sklearn.ensemble import StackingClassifier


In [3]:
df=pd.read_csv('diabetes.csv')

In [4]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
df['Glucose']=np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin']=np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness']=np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])



In [6]:
X=df.drop(columns=['Outcome'])
y=df['Outcome']


In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

In [35]:
models={
    'Logistic Regression':LogisticRegression(),
    'Decision Tree':DecisionTreeClassifier(),
    'Random Forest':RandomForestClassifier(),
    'Xg Boost':XGBClassifier()
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    model_train_accuracy=accuracy_score(y_train,y_train_pred)
    model_train_f1=f1_score(y_train,y_train_pred, average='weighted')
    model_train_precison=precision_score(y_train,y_train_pred)
    model_train_recall=recall_score(y_train,y_train_pred)
    model_train_rocauc_score=roc_auc_score(y_train,y_train_pred)
    model_train_confusion_metrics=confusion_matrix(y_train,y_train_pred)
                                                  
    model_test_accuracy=accuracy_score(y_test,y_test_pred)
    model_test_f1=f1_score(y_test,y_test_pred, average='weighted')
    model_test_precison=precision_score(y_test,y_test_pred)
    model_test_recall=recall_score(y_test,y_test_pred)
    model_test_rocauc_score=roc_auc_score(y_test,y_test_pred)
    model_test_confusion_metrics=confusion_matrix(y_test,y_test_pred)
                           
    print(list(models.keys())[i])
    
    print('Model Performance for Training set')
    
    print('-Accuracy:{:.4f}'.format(model_train_accuracy))
    print('-f1_score:{:.4f}'.format(model_train_f1))                       
    print('-precision_score:{:.4f}'.format(model_train_precison))                       
    print('-roc_auc_score:{:.4f}'.format(model_train_rocauc_score))
    
    
    print('---------------------------------------')  
    
    print('Model Performance for Testing set')
    print('-Accuracy:{:.4f}'.format(model_test_accuracy))
    print('-f1_score:{:.4f}'.format(model_test_f1))                       
    print('-precision_score:{:.4f}'.format(model_test_precison))                       
    print('-roc_auc_score:{:.4f}'.format(model_test_rocauc_score))
         
                           
    print('='*35)
    print('\n')
    
    
    
                           
                           
                           
                           

Logistic Regression
Model Performance for Training set
-Accuracy:0.7803
-f1_score:0.7735
-precision_score:0.7192
-roc_auc_score:0.7307
---------------------------------------
Model Performance for Testing set
-Accuracy:0.7792
-f1_score:0.7656
-precision_score:0.8182
-roc_auc_score:0.7271


Decision Tree
Model Performance for Training set
-Accuracy:1.0000
-f1_score:1.0000
-precision_score:1.0000
-roc_auc_score:1.0000
---------------------------------------
Model Performance for Testing set
-Accuracy:0.7359
-f1_score:0.7308
-precision_score:0.6712
-roc_auc_score:0.7021


Random Forest
Model Performance for Training set
-Accuracy:1.0000
-f1_score:1.0000
-precision_score:1.0000
-roc_auc_score:1.0000
---------------------------------------
Model Performance for Testing set
-Accuracy:0.7446
-f1_score:0.7257
-precision_score:0.7647
-roc_auc_score:0.6854


Xg Boost
Model Performance for Training set
-Accuracy:1.0000
-f1_score:1.0000
-precision_score:1.0000
-roc_auc_score:1.0000
---------------