# Classification 

# Importing Libraries

All the necessary Python libraries are imported.

In [None]:
#Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from scipy.stats import kendalltau
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn import svm
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

# Loading Dataset

The Diabetes Dataset which is in the form of a csv file is loaded into 'df' as a Dataframe

In [None]:
# Load the diabetes dataset (CSV hosted on GitHub) into the DataFrame `df`
DATASET_URL = 'https://raw.githubusercontent.com/SravaniRVS/DATA602/main/Project/Dataset.csv'
df = pd.read_csv(DATASET_URL)

Top 5 rows of the Dataframe are displayed

In [None]:
# Preview the first five rows (head() returns five rows by default)
df.head()

# Exploratory Data Analysis

Gender wise distribution of class - We can see that there is some anomaly in the data distribution with duplicate values

In [None]:
# Count rows per (Gender, CLASS) pair, pivot CLASS into columns and draw grouped bars
pair_counts = df.groupby(['Gender', 'CLASS']).size()
df_gb = pair_counts.unstack(level=1)
df_gb.plot(kind='bar')

Check for the count of null values in the dataset

In [None]:
# Number of missing entries in every column
df.isnull().sum()

In [None]:
df.info() # Prints pandas' summary of the DataFrame (columns, dtypes, non-null counts)

Displaying the class balance

In [None]:
# Pie chart of the raw CLASS labels.
# The wedge labels are taken from the counted values themselves instead of a
# hard-coded list, so each wedge is always named after the value it represents
# and the cell keeps working once the duplicate labels have been cleaned up.
data = df['CLASS'].value_counts().to_dict()
plt.figure(figsize=(7,7))
plt.pie(list(data.values()),
        # repr() keeps stray whitespace visible, e.g. 'Y ' versus 'Y'
        labels=[repr(label) for label in data],
        wedgeprops={'edgecolor': 'black'},
        # no explicit colour list: the default cycle gives every wedge its own colour
        autopct='%.0f%%', shadow=True)
plt.title("CLASS", fontsize=25)
plt.show()

In [None]:
# Frequency of each raw Gender value
df.Gender.value_counts()

In [None]:
# Pie chart of the raw Gender values.
# Labels come from the counted values (not a hard-coded list), so they cannot
# drift out of sync with the wedge order and the cell survives a re-run after
# the 'f' entries have been merged into 'F'.
data1 = df['Gender'].value_counts().to_dict()
plt.figure(figsize=(7,7))
plt.pie(list(data1.values()), labels=list(data1.keys()),
        wedgeprops={'edgecolor': 'black'},
        # default colour cycle: one distinct colour per wedge
        autopct='%.0f%%', shadow=True)
plt.title("Gender", fontsize=25)
plt.show()

This clearly shows that the data is highly imbalanced.

Displaying a pairplot to find out the correlation between the various features of the dataset

In [None]:
# Scatter-plot matrix of every pair of numeric columns (histograms on the diagonal)
sns.pairplot(data=df)

Correlation Heatmap Plot

In [None]:
# Correlation heatmap showing the extent of correlation between the numeric predictors.
# Gender and CLASS still hold strings at this point, so only numeric columns are
# passed to corr(); newer pandas versions raise on non-numeric columns instead of
# silently skipping them.
numeric_corr = df.select_dtypes(include='number').corr().round(2)

plt.figure(figsize=(10,6))
plot = sns.heatmap(numeric_corr, cmap="YlGnBu", annot=True)

In [None]:
# One histogram per numeric column
hist_axes = df.hist(figsize=(12, 12))
plt.show()

Calculating number of unique values to understand the significance of that column

In [None]:
# How many distinct values the ID column holds
len(df['ID'].unique())

In [None]:
# How many distinct values the No_Pation column holds
len(df['No_Pation'].unique())

Check for redundant data

In [None]:
# Distinct raw labels in CLASS (reveals duplicates that differ only by whitespace)
pd.unique(df['CLASS'])

In [None]:
# Distinct raw labels in Gender (reveals duplicates that differ only by letter case)
pd.unique(df['Gender'])

# Feature Engineering

Replacing duplicate values in CLASS and Gender column

In [None]:
# Remove every space character from the CLASS labels so 'Y ' and 'Y' become one label
class_labels = df['CLASS'].str
df['CLASS'] = class_labels.replace(" ", "", regex=False)

In [None]:
# Upper-case the stray lower-case 'f' so it is merged with 'F'
df['Gender'] = df['Gender'].str.replace("f", "F", regex=False)

In [None]:
# Same Gender/CLASS frequency plot as before, now on the cleaned labels
cleaned_pair_counts = df.groupby(['Gender', 'CLASS']).size()
df_gb1 = cleaned_pair_counts.unstack(level=1)
df_gb1.plot(kind='bar')

Dropping ID and No_Pation as they are irrelevant and unique for all patients

In [None]:
# Drop the ID and No_Pation identifier columns.
# errors='ignore' plus re-assignment (instead of inplace=True) makes the cell
# idempotent: re-running it after the columns are already gone no longer raises
# a KeyError.
df = df.drop(columns=['ID', 'No_Pation'], errors='ignore')

In [None]:
# First five rows after the identifier columns were removed
df.head()

In [None]:
# Mean BMI, Urea and Cr per CLASS, one panel each.
# The three panels differ only in the plotted column, so they are drawn in a loop
# instead of three copy-pasted blocks.
plt.figure(figsize=(18,4))

for panel, column in enumerate(['BMI', 'Urea', 'Cr'], start=1):
    plt.subplot(1, 3, panel)
    sns.barplot(x='CLASS', y=column, data=df, estimator=np.mean)
    plt.title("Average " + column + " levels based on CLASS", fontweight='bold')

plt.show()

Label Encoding Gender and CLASS columns which have categorical values

In [None]:
# Label-encode the categorical Gender and CLASS columns.
# Each column gets its own encoder: refitting one shared encoder on CLASS would
# discard the Gender mapping, making it impossible to decode Gender later.
gender_encoder = LabelEncoder()
class_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
df['CLASS'] = class_encoder.fit_transform(df['CLASS'])
# Backward-compatible alias: `le` used to end up fitted on CLASS
le = class_encoder

In [None]:
# First five rows after label encoding
df.head()

In [None]:
# Percentage of rows per encoded CLASS value (value_counts sorts most frequent first)
class_counts = df['CLASS'].value_counts()
_y_imbalance = class_counts.div(len(df)).mul(100)

_y_imbalance

In [None]:
# Map each class label (as a string) to its truncated percentage.
# _y_imbalance is indexed by the class label itself, so _y_imbalance[0] is the
# share of class 0, not the first (largest) entry. Pairing labels and values via
# items() keeps every percentage attached to the class it belongs to, in the
# most-frequent-first order produced by value_counts().
dict_y = {str(label): int(pct) for label, pct in _y_imbalance.items()}

In [None]:
# Bar chart of the per-class percentages stored in dict_y
type_ = [*dict_y]
percentage = [*dict_y.values()]
fig = plt.figure(figsize=(10, 5))
plt.bar(x=type_, height=percentage, width=0.4, color='maroon')
plt.title("Value Count of each Class in (% age)")
plt.ylabel("%age of each class")
plt.xlabel("Class")
plt.show()

- The bar graph shows the imbalance in the dataset. The majority of the samples (around 84%) belong to a single class: the diabetic class `Y`, which the label encoder maps to 2.

In [None]:
# Pairwise Pearson correlation between all (now numeric) columns
df.corr(method='pearson')

Splitting the dataframe into features (X) and labels (y)

In [None]:
# The first 11 columns are the predictors, the 12th column is the target
N_FEATURES = 11
X = df.iloc[:, :N_FEATURES]
y = df.iloc[:, N_FEATURES]

In [None]:
# Display the target Series
y

Note: no resampling (e.g. SMOTE) has been applied at this point, so `y` still has the imbalanced class distribution shown above.

#  Model 1 - Continuous Features

Defining the features

In [None]:
# Feature frame restricted to the continuous columns (Gender removed)
X2 = X.drop(columns=['Gender'])


In [None]:
# Number of samples in the target
y.shape[0]

In [None]:
# Standardise every continuous feature to zero mean / unit variance.
# The scaler is fitted on all rows of X2 here, i.e. before any train/test split.
scaler = StandardScaler()
scaler.fit(X2)
X_scaled = scaler.transform(X2)
X_scaled[:3]

Splitting the dataset into training and test sets

In [None]:
# 80/20 shuffled train/test split of the unscaled continuous features (fixed seed)
split_parts = train_test_split(X2, y, test_size=0.20, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Display the continuous-feature frame
X2

In [None]:
# Number of samples per encoded class in the target
y.value_counts()

Fitting the data to a Gaussian Naive Bayes classifier, with model performance validation using confusion matrices, micro-averaged precision/recall/F1 scores and classification reports

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score,recall_score #precision score

In [None]:
# Gaussian Naive Bayes on the current train/test split
# Model instantiated
model1 = GaussianNB()
# Data fit to the model
model1.fit(X_train, y_train.values)
# Model performance evaluation
y_pred_train = model1.predict(X_train)
y_pred_test = model1.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report (each report is headed with the split it describes)
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))



Fitting the data to a Logistic Regression classifier, along with model performance validation using confusion matrices, micro-averaged precision/recall/F1 scores and classification reports

In [None]:
# Multinomial logistic regression on the current (unscaled) train/test split
# Model instantiated
model2 = LogisticRegression(multi_class='multinomial')

# Data fit to the model
model2.fit(X_train, y_train)
# Model performance evaluation
y_pred_train = model2.predict(X_train)
y_pred_test = model2.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report (each report is headed with the split it describes)
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))



Let's use scaled features

In [None]:

# Multinomial logistic regression on standardised continuous features.
# Train/test split first, then scale: the scaler is fitted on the training rows
# only and merely applied to the test rows, so no test-set statistics leak into
# the features the model is trained on.
X_train, X_test, y_train, y_test = train_test_split(X2, y, random_state=42, test_size=0.20, shuffle=True)
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

# Model instantiated
model2 = LogisticRegression(multi_class='multinomial')

# Data fit to the model
model2.fit(X_train, y_train)
# Model performance evaluation
y_pred_train = model2.predict(X_train)
y_pred_test = model2.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report (each report is headed with the split it describes)
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))



Fitting the data to a Support Vector Machine classifier, along with model performance validation using confusion matrices, micro-averaged precision/recall/F1 scores and classification reports

In [None]:
from sklearn.svm import SVC

In [None]:

# Linear-kernel SVM on all features of X (80/20 split, seed 42)
X_train, X_test, y_train, y_test = train_test_split(X ,y , random_state=42,test_size=0.20, shuffle=True)
# Model instantiated
model3 = SVC(kernel = 'linear', C = 1)

# Data fit to the model
model3.fit(X_train, y_train)

# Model performance evaluation
y_pred_train = model3.predict(X_train)
y_pred_test = model3.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report (each report is headed with the split it describes)
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))

Fitting the data to a K Neighbors classifier with K=3, along with model performance validation using confusion matrices, micro-averaged precision/recall/F1 scores and classification reports

In [None]:

# K-nearest-neighbours (k=3) on all features of X (80/20 split, seed 42)
X_train, X_test, y_train, y_test = train_test_split(X ,y , random_state=42,test_size=0.20, shuffle=True)
# Model instantiated
model4 = KNeighborsClassifier(n_neighbors=3)

# Data fit to the model
model4.fit(X_train, y_train)

# Model performance evaluation
y_pred_train = model4.predict(X_train)
y_pred_test = model4.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report (each report is headed with the split it describes)
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))

In [None]:
# Micro-averaged F1 on the test split for k = 1..20 neighbours
# (slot i-1 of the array holds the score obtained with k = i)
k_values = range(1, 21)
Test_f_1_score = np.zeros(len(k_values))
for i in k_values:
    # Fit a fresh classifier for this k, then score its test predictions
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    yhat = knn.predict(X_test)
    Test_f_1_score[i-1] = f1_score(y_test, yhat, average='micro')

print(Test_f_1_score)

In [None]:
# Micro-averaged F1 on the training split for k = 1..20 neighbours
# (slot i-1 of the array holds the score obtained with k = i)
k_values = range(1, 21)
Train_f_1_score = np.zeros(len(k_values))
for i in k_values:
    # Fit a fresh classifier for this k, then score its training predictions
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    yhat = knn.predict(X_train)
    Train_f_1_score[i-1] = f1_score(y_train, yhat, average='micro')

print(Train_f_1_score)

In [None]:
# Train vs. test micro-F1 as a function of the number of neighbours
loc = np.arange(1, 21, step=1.0)
neighbour_counts = range(1, 21)
plt.figure(figsize=(10, 6))
line1, = plt.plot(neighbour_counts, Train_f_1_score, label='Train F_1 score')
line2, = plt.plot(neighbour_counts, Test_f_1_score, label='Test F_1 score', linestyle='dashed')

plt.xlabel('Number of Neighbors ')
plt.ylabel('F1-Score')
plt.xticks(loc)
plt.legend(handles=[line1, line2])
plt.show()

  At K=7 the Train F1-score is 0.946 and the Test F1-score is 0.945; the two are very close, so K=7 is taken as the optimal value of K.

In [None]:

# K-nearest-neighbours with k=7 on all features of X (80/20 split, seed 0)
# NOTE(review): k was chosen on the seed-42 split but is evaluated here on a
# seed-0 split - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X ,y , random_state=0,test_size=0.20, shuffle=True)
# Model instantiated
model4 = KNeighborsClassifier(n_neighbors=7)

# Data fit to the model
model4.fit(X_train, y_train)

# Model performance evaluation
y_pred_train = model4.predict(X_train)
y_pred_test = model4.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))

Fitting the data to a Decision Tree classifier, along with model performance validation using confusion matrices, micro-averaged precision/recall/F1 scores and classification reports

In [None]:
# Decision tree fitted on the train/test split that is currently in scope
# Model instantiated
model5 = DecisionTreeClassifier(random_state=0)

# Data fit to the model
model5.fit(X_train, y_train)

# Model performance evaluation
y_pred_train = model5.predict(X_train)
y_pred_test = model5.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))

Fitting the data to a Random Forest classifier, along with model performance validation using confusion matrices, micro-averaged precision/recall/F1 scores and classification reports

In [None]:
# Random forest (entropy criterion, 50 trees) on all features of X (80/20 split, seed 0)
X_train, X_test, y_train, y_test = train_test_split(X ,y , random_state=0,test_size=0.20, shuffle=True)
# Model instantiated; random_state pins the bootstrap/feature sampling so the
# reported scores are reproducible across runs
model6 = RandomForestClassifier(criterion='entropy', n_estimators=50, random_state=0)
model6.fit(X_train, y_train)
# Model performance evaluation
y_pred_train = model6.predict(X_train)
y_pred_test = model6.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))

Random Forest Classifier Hyperparameter Tuning to find best set of parameters

In [None]:
# Hyperparameter tuning of the random forest with a randomised, 5-fold
# cross-validated search over 100 sampled parameter combinations.
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=2,stop=100,num=10)],
    # 'auto' is omitted: for classifiers it was an alias of 'sqrt' and newer
    # scikit-learn releases reject it
    'max_features': ['sqrt','log2'],
    'max_depth': [int(x) for x in np.linspace(10,1000,10)],
    'min_samples_split': [2,5,7,10,12,14],
    'min_samples_leaf': [1,2,4,6,8],
    'criterion': ['entropy','gini']
}
print(param_grid)
# random_state on both the forest and the search makes the sampled candidates,
# and therefore the reported best estimator, reproducible
rcv = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=0),
                         param_distributions=param_grid, n_iter=100, cv=5,
                         verbose=2, n_jobs=-1, random_state=0)
rcv.fit(X_train,y_train)
rcv.best_estimator_

Random Forest Classifier with the best set of parameters 

In [None]:
# Display the best estimator found by the randomised search
rcv.best_estimator_

In [None]:
# Random forest with hand-copied hyperparameters on all features of X (80/20 split, seed 0)
# NOTE(review): the values below look like the output of an earlier randomised
# search run - confirm they still match rcv.best_estimator_.
X_train, X_test, y_train, y_test = train_test_split(X ,y , random_state=0,test_size=0.20, shuffle=True)
# Model instantiated; random_state keeps the reported scores reproducible
model6 = RandomForestClassifier(criterion='entropy', max_depth=230, max_features='log2',
                       min_samples_leaf=4, n_estimators=89, random_state=0)
model6.fit(X_train, y_train)
# Model performance evaluation
y_pred_train = model6.predict(X_train)
y_pred_test = model6.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))

Fitting the data to an XGBoost classifier, along with model performance validation using confusion matrices, micro-averaged precision/recall/F1 scores and classification reports

In [None]:
# XGBoost classifier with default settings on all features of X (80/20 split, seed 0)
X_train, X_test, y_train, y_test = train_test_split(X ,y , random_state=0,test_size=0.20, shuffle=True)
# Model instantiated
model7 = XGBClassifier()

# Data fit to the model (the stray refit of the random forest `model6` that used
# to follow was a copy-paste leftover and did not affect these results)
model7.fit(X_train, y_train)

# Model performance evaluation
y_pred_train = model7.predict(X_train)
y_pred_test = model7.predict(X_test)

# Displaying Confusion Matrix
# confusion_matrix() puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is "Actual" and its x-axis is "Predicted".
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, cm, split_name in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints whole counts instead of the default '.2g' (e.g. 6.7e+02)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split_name + " Data")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.show()

# Micro-averaged scores: for single-label multiclass predictions micro precision,
# recall and F1 all coincide with plain accuracy.
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

for metric_name, train_value, test_value in (
        ("Precision", precision_train, precision_test),
        ("Recall", recall_train, recall_test),
        ("F1", F1_score_train, F1_score_test)):
    print(metric_name + " on Train Data : ", str(int(train_value*100)) + '%')
    print(metric_name + " on Test Data : ", str(int(test_value*100)) + '%')

# Displaying Classification Report
for split_name, y_true, y_hat in (("Train", y_train, y_pred_train),
                                  ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split_name + " Data")
    print("")
    print(classification_report(y_true, y_hat))

In [None]:
#Hyperparameter Tuning using RandomisedSearch Cross Validation
# Only options that XGBoost consumes are searched. 'criterion', 'min_samples_split'
# and 'min_samples_leaf' are scikit-learn tree options that XGBoost does not use,
# so sampling them only produced duplicate candidates.
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=2,stop=100,num=10)],
    'max_depth': [int(x) for x in np.linspace(10,1000,10)],
    'learning_rate': [0.01,0.1,0.2]
}
print(param_grid)
# random_state fixes which 100 candidates are sampled so the search is repeatable
rcv = RandomizedSearchCV(estimator=XGBClassifier(),param_distributions=param_grid,n_iter=100,cv=5,verbose=2,n_jobs=-1,random_state=0)
rcv.fit(X_train,y_train)
rcv.best_estimator_

In [None]:
#Train Test split of the entire dataset for training purpose
X_train, X_test, y_train, y_test = train_test_split(X ,y , random_state=0,test_size=0.20, shuffle=True)

#Model Instantiated
# criterion / min_samples_leaf / min_samples_split are scikit-learn tree options
# that XGBoost does not use, so they are not passed here.
model8 = XGBClassifier(max_depth=450, n_estimators=23,
              objective='multi:softprob')

#Data fit to the model (only model8 is trained here; other models are fitted in their own cells)
model8.fit(X_train, y_train)

#Model Performance Evaluation
y_pred_train = model8.predict(X_train)
y_pred_test = model8.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

The XGBoost Classifier performs best when it deals with only the continuous features and the imbalanced dataset, with a Train F1-Score of 100% and a Test F1-Score of 99%, which makes it the best choice of model for predicting diabetes from the data described above.

# Model 2 - All Features

Defining the features

In [None]:
#Display the feature matrix X that the models in this section are trained on
# NOTE(review): this cell only displays X, it does not rebuild it - confirm that X
# really holds all features at this point rather than a subset left over from an earlier cell.
X

Splitting the dataset into training and test sets

In [None]:
#Hold out 20% of the rows for testing; the fixed seed keeps the shuffled split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=0,
)

Fitting the data to the Gaussian Naive Bayes Classifier, along with model performance validation using the confusion matrix, precision, recall, F1 score and classification report

In [None]:
#Model Instantiated
mod1 = GaussianNB()

#Data fit to the model
mod1.fit(X_train, y_train)

#Predictions on both splits
y_pred_train = mod1.predict(X_train)
y_pred_test = mod1.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

Fitting the data to the Logistic Regression Classifier, along with model performance validation using the confusion matrix, precision, recall, F1 score and classification report

In [None]:
#Model Instantiated
mod2 = LogisticRegression()

#Data fit to the model
mod2.fit(X_train, y_train)

#Predictions on both splits
y_pred_train = mod2.predict(X_train)
y_pred_test = mod2.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

Fitting the data to the Support Vector Machine Classifier, along with model performance validation using the confusion matrix, precision, recall, F1 score and classification report

In [None]:
#Model Instantiated
mod3 = SVC(kernel='linear',C=1)

#Data fit to the model
mod3.fit(X_train, y_train)

#Predictions on both splits
y_pred_train = mod3.predict(X_train)
y_pred_test = mod3.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

Fitting the data to the K Neighbors Classifier with K=3, along with model performance validation using the confusion matrix, precision, recall, F1 score and classification report

In [None]:
#Model Instantiated
mod4 = KNeighborsClassifier(n_neighbors=3)

#Data fit to the model
mod4.fit(X_train, y_train)

#Predictions on both splits
y_pred_train = mod4.predict(X_train)
y_pred_test = mod4.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

In [None]:
# Micro-averaged F1-score of KNN models for k = 1..20.
# Each model is fitted once on the training split and then scored on both splits,
# so the test curve measures generalisation instead of a model trained on the test rows.
k_values = range(1,21)
Train_f_1_score = np.zeros(20)
Test_f_1_score = np.zeros(20)
for i in k_values:
    #Train Model and Predict
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    Train_f_1_score[i-1] = f1_score(y_train, knn.predict(X_train),average='micro')
    Test_f_1_score[i-1] = f1_score(y_test, knn.predict(X_test),average='micro')

print(Train_f_1_score)
print(Test_f_1_score)

# Train vs. test F1 as a function of the number of neighbours
loc = np.arange(1,21,step=1.0)
plt.figure(figsize = (10, 6))
line1, = plt.plot(k_values, Train_f_1_score,label='Train F_1 score')
line2, = plt.plot(k_values, Test_f_1_score,linestyle = 'dashed',label='Test F_1 score')

plt.legend(handles=[line1, line2])
plt.xticks(loc)
plt.xlabel('Number of Neighbors ')
plt.ylabel('F1-Score')
plt.show()

When K=3, the Train F1-score is 0.95 and the Test F1-score is 0.93, which are the optimal values for this dataset.

Fitting the data to the Decision Tree Classifier, along with model performance validation using the confusion matrix, precision, recall, F1 score and classification report

In [None]:
#Model Instantiated
mod5 = DecisionTreeClassifier(random_state=0)

#Data fit to the model
mod5.fit(X_train, y_train)

#Predictions on both splits
y_pred_train = mod5.predict(X_train)
y_pred_test = mod5.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

Fitting the data to the Random Forest Classifier, along with model performance validation using the confusion matrix, precision, recall, F1 score and classification report

In [None]:
#Model Instantiated
mod6 = RandomForestClassifier(criterion='entropy',n_estimators=50)

#Data fit to the model
mod6.fit(X_train,y_train)

#Predictions on both splits
y_pred_train = mod6.predict(X_train)
y_pred_test = mod6.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

Random Forest Classifier Hyperparameter Tuning to find best set of parameters

In [None]:
# Search space for the random forest.
# 'auto' is left out of max_features: for RandomForestClassifier it is a deprecated
# alias of 'sqrt', so it only added duplicate candidates.
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=2,stop=100,num=10)],
    'max_features': ['sqrt','log2'],
    'max_depth': [int(x) for x in np.linspace(10,1000,10)],
    'min_samples_split': [2,5,7,10,12,14],
    'min_samples_leaf': [1,2,4,6,8],
    'criterion': ['entropy','gini']
}
print(param_grid)
# Seeding both the forest and the candidate sampling makes the reported best estimator repeatable
rcv = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=0),param_distributions=param_grid,n_iter=100,cv=5,verbose=2,n_jobs=-1,random_state=0)
rcv.fit(X_train,y_train)
rcv.best_estimator_

In [None]:
# Show the best estimator found by the most recently fitted search object `rcv`
rcv.best_estimator_

Random Forest Classifier with the best set of parameters 

In [None]:
#Model Instantiated
mod6 = RandomForestClassifier(max_depth=780, max_features='sqrt', min_samples_split=5,
                       n_estimators=23)

#Data fit to the model
mod6.fit(X_train, y_train)

#Predictions on both splits
y_pred_train = mod6.predict(X_train)
y_pred_test = mod6.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

Fitting the data to the XGBoost Classifier, along with model performance validation using the confusion matrix, precision, recall, F1 score and classification report

In [None]:
#Model Instantiated
mod7 = XGBClassifier()

#Data fit to the model
mod7.fit(X_train, y_train)

#Predictions on both splits
y_pred_train = mod7.predict(X_train)
y_pred_test = mod7.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

In [None]:
#Hyperparameter Tuning using RandomisedSearch Cross Validation
# Only options that XGBoost consumes are searched. 'criterion', 'min_samples_split'
# and 'min_samples_leaf' are scikit-learn tree options that XGBoost does not use,
# so sampling them only produced duplicate candidates.
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=2,stop=100,num=10)],
    'max_depth': [int(x) for x in np.linspace(10,1000,10)],
    'learning_rate': [0.01,0.1,0.2]
}
print(param_grid)
# random_state fixes which 100 candidates are sampled so the search is repeatable
rcv = RandomizedSearchCV(estimator=XGBClassifier(),param_distributions=param_grid,n_iter=100,cv=5,verbose=2,n_jobs=-1,random_state=0)
rcv.fit(X_train,y_train)
rcv.best_estimator_

In [None]:
#Model Instantiated
# criterion / min_samples_leaf / min_samples_split are scikit-learn tree options
# that XGBoost does not use, so they are not passed here.
mod7 = XGBClassifier(max_depth=230, n_estimators=23, objective='multi:softprob')

#Data fit to the model
mod7.fit(X_train, y_train)

#Predictions on both splits
y_pred_train = mod7.predict(X_train)
y_pred_test = mod7.predict(X_test)

#Displaying Confusion Matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for ax, matrix, split in zip(axes, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts instead of the default '.2g' formatting
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title("Confusion Matrix on " + split + " Data")
    # confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
    # classes on columns, so the x-axis is the prediction and the y-axis the truth
    ax.set_xlabel('Predict')
    ax.set_ylabel('Actual')
plt.show()

#Micro-averaged precision, recall and F1 for X_train and X_test
precision_train = precision_score(y_train, y_pred_train, zero_division='warn', average='micro')
precision_test = precision_score(y_test, y_pred_test, zero_division='warn', average='micro')
recall_train = recall_score(y_train, y_pred_train, zero_division='warn', average='micro')
recall_test = recall_score(y_test, y_pred_test, zero_division='warn', average='micro')
F1_score_train = f1_score(y_train, y_pred_train, zero_division='warn', average='micro')
F1_score_test = f1_score(y_test, y_pred_test, zero_division='warn', average='micro')

#Scores are truncated (not rounded) to whole percentages
for caption, value in [
    ("Precision on Train Data : ", precision_train),
    ("Precision on Test Data : ", precision_test),
    ("Recall on Train Data : ", recall_train),
    ("Recall on Test Data : ", recall_test),
    ("F1 on Train Data : ", F1_score_train),
    ("F1 on Test Data : ", F1_score_test),
]:
    print(caption, str(int(value*100)) + '%')

#Displaying Classification Report
for split, truth, predicted in (("Train", y_train, y_pred_train), ("Test", y_test, y_pred_test)):
    print("")
    print("Classification Report for " + split + " Data")
    print("")
    print(classification_report(truth, predicted))

The XGBoost Classifier performs best when it deals with all features, with a Train F1-Score of 100% and a Test F1-Score of 99%, which together make it the best choice of model for predicting diabetes from the data described above.

# Regression

In [None]:
# BMI is the regression target; every remaining column of df is a predictor
y = df['BMI']
X = df.drop(columns=['BMI'])

In [None]:
# Display the current feature matrix X
X

# Correlation between variables and output

We found out the Correlation between the various predictor variables and the output variable or target which is BMI in this case. The higher the values, the more correlated are the two variables.

Pearsons Correlation Between Gender and BMI

In [None]:
# Pearson correlation coefficient between Gender and the target y; the p-value is discarded
cor1,_=pearsonr(X['Gender'],y)
cor1

Kendall's Tau Correlation Between Gender and BMI

In [None]:
# Kendall's tau between Gender and the target y; the p-value is discarded.
# Rebinds cor1, so any earlier value stored under that name is replaced.
cor1,_=kendalltau(X['Gender'],y)
cor1

Pearsons Correlation Between Age and BMI

In [None]:
# Pearson correlation coefficient between AGE and the target y; the p-value is discarded
cor2,_=pearsonr(X['AGE'],y)
cor2

Kendall's Tau Correlation Between Age and BMI

In [None]:
# Kendall's tau between AGE and the target y; the p-value is discarded.
# Rebinds cor2, so any earlier value stored under that name is replaced.
cor2,_=kendalltau(X['AGE'],y)
cor2

Pearsons Correlation Between Urea and BMI

In [None]:
# Pearson correlation coefficient between Urea and the target y; the p-value is discarded
cor3,_=pearsonr(X['Urea'],y)
cor3

Kendall's Tau Correlation Between Urea and BMI

In [None]:
# Kendall's tau between Urea and the target y; the p-value is discarded.
# Rebinds cor3, so any earlier value stored under that name is replaced.
cor3,_=kendalltau(X['Urea'],y)
cor3

Pearsons Correlation Between Cr and BMI

In [None]:
# Pearson correlation coefficient between Cr and the target y; the p-value is discarded
cor4,_=pearsonr(X['Cr'],y)
cor4

Kendall's Tau Correlation Between Cr and BMI

In [None]:
# Kendall's tau between Cr and the target y; the p-value is discarded.
# Rebinds cor4, so any earlier value stored under that name is replaced.
cor4,_=kendalltau(X['Cr'],y)
cor4

Pearsons Correlation Between HbA1c and BMI

In [None]:
# Pearson correlation coefficient between HbA1c and the target y; the p-value is discarded
cor5,_=pearsonr(X['HbA1c'],y)
cor5

Kendall's Tau Correlation Between HbA1c and BMI

In [None]:
# Kendall's tau between HbA1c and the target y; the p-value is discarded.
# Rebinds cor5, so any earlier value stored under that name is replaced.
cor5,_=kendalltau(X['HbA1c'],y)
cor5

Pearsons Correlation Between Chol and BMI

In [None]:
# Pearson correlation coefficient between Chol and the target y; the p-value is discarded
cor6,_=pearsonr(X['Chol'],y)
cor6

Kendall's Tau Correlation Between Chol and BMI

In [None]:
# Kendall's tau between Chol and the target y; the p-value is discarded.
# Rebinds cor6, so any earlier value stored under that name is replaced.
cor6,_=kendalltau(X['Chol'],y)
cor6

Pearsons Correlation Between TG and BMI

In [None]:
# Pearson correlation coefficient between TG and the target y; the p-value is discarded
cor7,_=pearsonr(X['TG'],y)
cor7

Kendall's Tau Correlation Between TG and BMI

In [None]:
# Kendall's tau between TG and the target y; the p-value is discarded.
# Rebinds cor7, so any earlier value stored under that name is replaced.
cor7,_=kendalltau(X['TG'],y)
cor7

Pearsons Correlation Between HDL and BMI

In [None]:
# Pearson correlation coefficient between HDL and the target y; the p-value is discarded
cor8,_=pearsonr(X['HDL'],y)
cor8

Kendall's Tau Correlation Between HDL and BMI

In [None]:
# Kendall's tau between HDL and the target y; the p-value is discarded.
# Rebinds cor8, so any earlier value stored under that name is replaced.
cor8,_=kendalltau(X['HDL'],y)
cor8

Pearsons Correlation Between LDL and BMI

In [None]:
# Pearson correlation coefficient between LDL and the target y; the p-value is discarded
cor9,_=pearsonr(X['LDL'],y)
cor9

Kendall's Tau Correlation Between LDL and BMI

In [None]:
# Kendall's tau between LDL and the target y; the p-value is discarded.
# Rebinds cor9, so any earlier value stored under that name is replaced.
cor9,_=kendalltau(X['LDL'],y)
cor9

Pearsons Correlation Between VLDL and BMI

In [None]:
# Pearson correlation coefficient between VLDL and the target y; the p-value is discarded
cor10,_=pearsonr(X['VLDL'],y)
cor10

Kendall's Tau Correlation Between VLDL and BMI

In [None]:
# Kendall's tau between VLDL and the target y; the p-value is discarded.
# Rebinds cor10, so any earlier value stored under that name is replaced.
cor10,_=kendalltau(X['VLDL'],y)
cor10

Pearsons Correlation Between CLASS and BMI

In [None]:
# Pearson correlation coefficient between CLASS and the target y; the p-value is discarded
cor11,_=pearsonr(X['CLASS'],y)
cor11

Kendall's Tau Correlation Between CLASS and BMI

In [None]:
# Kendall's tau between CLASS and the target y; the p-value is discarded.
# Rebinds cor11, so any earlier value stored under that name is replaced.
cor11,_=kendalltau(X['CLASS'],y)
cor11

# Linear Regression

Linear Regression is a non-regularised regression technique. On our dataset it achieves an R2 score of 0.39 when determining BMI from the other features.

In [None]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Ordinary least-squares model, trained on the training portion
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = reg.predict(X_test)

# Fitted regression coefficients
print('Coefficients: ', reg.coef_)

# reg.score is R^2 on the test split: 1 means perfect prediction
test_score = reg.score(X_test, y_test)
print('Variance score: {}'.format(test_score))

# R2 Score computed from the predictions
test_r2 = r2_score(y_test, y_pred)
print('R2 score: {}'.format(test_r2))

# Mean Squared Error
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error: {}'.format(test_mse))

# Lasso Regression

Lasso Regression is a regularised regression technique. On our dataset it achieves an R2 score of 0.40 when determining BMI from the other features.

In [None]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Lasso model with alpha=0.1, trained on the training portion
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = reg.predict(X_test)

# Fitted regression coefficients
print('Coefficients: ', reg.coef_)

# reg.score is R^2 on the test split: 1 means perfect prediction
test_score = reg.score(X_test, y_test)
print('Variance score: {}'.format(test_score))

# R2 Score computed from the predictions
test_r2 = r2_score(y_test, y_pred)
print('R2 score: {}'.format(test_r2))

# Mean Squared Error
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error: {}'.format(test_mse))

# Ridge Regression

Ridge Regression is a regularised regression technique. On our dataset it achieves an R2 score of 0.39 when determining BMI from the other features.

In [None]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Ridge model with alpha=1.0, trained on the training portion
reg = linear_model.Ridge(alpha=1.0)
reg.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = reg.predict(X_test)

# Fitted regression coefficients
print('Coefficients: ', reg.coef_)

# reg.score is R^2 on the test split: 1 means perfect prediction
test_score = reg.score(X_test, y_test)
print('Variance score: {}'.format(test_score))

# R2 Score computed from the predictions
test_r2 = r2_score(y_test, y_pred)
print('R2 score: {}'.format(test_r2))

# Mean Squared Error
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error: {}'.format(test_mse))

# Elastic Net Regression

Elastic Net Regression is a regularised regression technique. On our dataset it achieves an R2 score of 0.34 when determining BMI from the other features.

In [None]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Elastic net model with a fixed random_state, trained on the training portion
reg = linear_model.ElasticNet(random_state=0)
reg.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = reg.predict(X_test)

# Fitted regression coefficients
print('Coefficients: ', reg.coef_)

# reg.score is R^2 on the test split: 1 means perfect prediction
test_score = reg.score(X_test, y_test)
print('Variance score: {}'.format(test_score))

# R2 Score computed from the predictions
test_r2 = r2_score(y_test, y_pred)
print('R2 score: {}'.format(test_r2))

# Mean Squared Error
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error: {}'.format(test_mse))

Random Forest Regressor 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train/test split: 20% of the rows held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)

# Create the random forest regressor (seeded so the forest is reproducible)
rf = RandomForestRegressor(n_estimators=300, max_features='sqrt', max_depth=5,
                           random_state=18)

# Train the model on the training set.
# The original cell fitted the forest twice (a chained .fit() followed by a second
# rf.fit()); with a fixed random_state the second fit rebuilds the same forest,
# so a single fit gives the same model at half the cost.
rf.fit(X_train, y_train)

# Output prediction
y_pred = rf.predict(X_test)

# variance score: 1 means perfect prediction
print('Variance score: {}'.format(rf.score(X_test, y_test)))

# R2 Score
print('R2 score: {}'.format(r2_score(y_test, y_pred)))

# Mean Squared Error
print('Mean Squared Error: {}'.format(mean_squared_error(y_test, y_pred)))

In [None]:
import datetime
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the random forest; random_state is a single-value
# entry so every candidate forest is built with the same seed
grid = {
    'n_estimators': [200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 4, 5, 6, 7],
    'random_state': [18],
}

# Timestamp before the search starts
print(datetime.datetime.now())

# Exhaustive 5-fold cross-validated search over the grid; fit() returns the
# fitted search object, so it can be chained onto the constructor
CV_rfr = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=grid,
    cv=5,
).fit(X_train, y_train)

# Timestamp after the search finishes
print(datetime.datetime.now())

In [None]:
# Display the random forest estimator selected by the grid search in the previous cell
CV_rfr.best_estimator_

In [None]:
# Random forest regressor with tuned hyper-parameters
# (presumably those reported by the grid search above — confirm against its output)
rf = RandomForestRegressor(max_depth=7, max_features='sqrt', n_estimators=500,
                           random_state=18)

# Train the model on the training set.
# The original cell fitted the forest twice (a chained .fit() followed by a second
# rf.fit()); with a fixed random_state the second fit rebuilds the same forest,
# so a single fit gives the same model at half the cost.
rf.fit(X_train, y_train)

# Output prediction
y_pred = rf.predict(X_test)

# variance score: 1 means perfect prediction
print('Variance score: {}'.format(rf.score(X_test, y_test)))

# R2 Score
print('R2 score: {}'.format(r2_score(y_test, y_pred)))

# Mean Squared Error
print('Mean Squared Error: {}'.format(mean_squared_error(y_test, y_pred)))

**Gradient Boosting Regressor**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Train/test split: 20% of the rows held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)

# Baseline gradient boosting model with default hyper-parameters.
# random_state is pinned (same seed as the other tree-based models in this section)
# so the reported metrics are reproducible across runs; the original cell left it
# unset.
gbr = GradientBoostingRegressor(random_state=18).fit(X_train, y_train)

# Output prediction
y_pred = gbr.predict(X_test)

# variance score: 1 means perfect prediction
print('Variance score: {}'.format(gbr.score(X_test, y_test)))

# R2 Score
print('R2 score: {}'.format(r2_score(y_test, y_pred)))

# Mean Squared Error
print('Mean Squared Error: {}'.format(mean_squared_error(y_test, y_pred)))



In [None]:
# Hyper-parameter candidates for the gradient boosting regressor; random_state is a
# single-value entry so every candidate model is built with the same seed
param_grid = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],  # range(5,16,2),
    'min_samples_split': [50, 100],  # range(200,1001,200),
    'learning_rate': [0.01, 0.1, 0.2],
    'random_state': [18],
}

# Timestamp before the search starts
print(datetime.datetime.now())

# Exhaustive 5-fold cross-validated search. The result is stored under the name
# CV_rfr (reused from the random forest search) because the next cell reads it.
CV_rfr = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)

# Timestamp after the search finishes
print(datetime.datetime.now())

In [None]:
# Display the gradient boosting estimator selected by the grid search in the previous cell
# (CV_rfr now holds the gradient boosting search, despite its "rfr" name)
CV_rfr.best_estimator_

In [None]:
# Same 80/20 split as the other regression cells (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Gradient boosting regressor with tuned hyper-parameters
# (presumably those reported by the grid search above — confirm against its output)
gbr = GradientBoostingRegressor(
    learning_rate=0.01,
    max_depth=7,
    min_samples_split=50,
    n_estimators=500,
    random_state=18,
)
gbr.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = gbr.predict(X_test)

# Evaluation metrics on the held-out rows
gbr_test_score = gbr.score(X_test, y_test)
gbr_r2 = r2_score(y_test, y_pred)
gbr_mse = mean_squared_error(y_test, y_pred)

# variance score: 1 means perfect prediction
print('Variance score: {}'.format(gbr_test_score))
print('R2 score: {}'.format(gbr_r2))
print('Mean Squared Error: {}'.format(gbr_mse))

# Performance Validation

According to these experiments, Lasso Regression performs best on our dataset, with an R2 score of 0.40, the highest among the models. It also has a mean squared error of 13.98, the lowest of all the models, which confirms that it performs best.