# Diabetes classification

# **Importing the libraries**

1.   **NumPy** is used for numerical computing.
2.   **Pandas** is used for data manipulation.
3.   **Matplotlib** is a plotting library for creating visualizations.
4.   **Seaborn** is a data visualization library for creating attractive statistical graphics, such as heatmaps.
5.   **train_test_split** (from scikit-learn) is a function used to split datasets into training and testing sets for machine learning model evaluation.
6.   **KNeighborsClassifier** is a classification algorithm from scikit-learn that implements the k-nearest neighbors algorithm for classification tasks.
7.   **RandomForestClassifier**  builds multiple decision trees during training and combines their predictions to improve accuracy and reduce overfitting.
8.   **make_classification** is a function in scikit-learn used to generate synthetic classification datasets for testing machine learning models.
9.   **Support Vector Machines** (SVM) are supervised learning models used for classification, regression, and outlier detection.
10.  **Metrics** provides functions for evaluating the performance of machine learning models, such as accuracy, precision, and recall.
11.  **sklearn.preprocessing** provides functions for preprocessing data before feeding it into machine learning models.
12.   **confusion_matrix** calculates a confusion matrix to assess the model's predictions.
    







In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import svm


from sklearn import metrics
from sklearn import preprocessing


from warnings import filterwarnings
import requests
from sklearn.metrics import hinge_loss , confusion_matrix , classification_report

from warnings import filterwarnings

# Install a global "ignore" filter so no warnings are shown from here on
# (presumably to keep the cell output readable).
filterwarnings("ignore")


#  Data Loading and Initial Analysis


In [None]:
# Load the dataset. The path is relative, so "diabetes.csv" must be in the
# current working directory.
d_data = pd.read_csv("diabetes.csv")
# Bare expression: rendered as the cell output when run in a notebook.
d_data

FileNotFoundError: [Errno 2] No such file or directory: 'diabetes.csv'

In [None]:
# Bind the loaded data to `df`, the name used by the rest of the file.
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrapper
# looks redundant -- confirm before simplifying.

df = pd.DataFrame(d_data)
# Summary statistics for every column (include='all' keeps non-numeric ones too).
df.describe(include='all')


In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

# Data Cleaning

In [None]:
# Make 'Age' non-negative and round it to whole numbers; make 'Insulin'
# non-negative.
df['Age'] = df['Age'].abs().round()
df['Insulin'] = df['Insulin'].abs()

In [None]:
# Summary statistics again, now reflecting the cleaned 'Age' and 'Insulin' columns.
df.describe(include='all')

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

# Exploratory Data Analysis

In [None]:
# Scatter plot of every remaining feature against 'BloodPressure', coloured
# by 'Outcome'.
#
# An axes-level scatterplot is drawn on an explicitly created Axes.
# sns.relplot is figure-level and opens its own figure, so pairing it with
# plt.figure() left an empty extra figure per iteration and the requested
# figsize was never applied to the plot.
for column in df.drop(columns=['Outcome', 'Pregnancies', 'BloodPressure']).columns:
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.scatterplot(data=df, x=column, y='BloodPressure', hue='Outcome', ax=ax)
    ax.set_title(column + " & BloodPressure")
    plt.xticks(rotation=90)
    ax.set_xlabel(column)
    ax.set_ylabel('BloodPressure')
    plt.show()

In [None]:
# Correlation heatmap showing only the cells below the diagonal

plt.figure(figsize=(20, 15))
corr = df.corr()
# np.triu keeps the upper triangle (diagonal included) and zeroes the rest;
# used as a mask, its non-zero cells are hidden by the heatmap.
matrix = np.triu(corr)
sns.heatmap(corr, mask=matrix, annot=True)


In [None]:
# One 50-bin histogram per column of df

df.hist(
    figsize=(20, 15),
    bins=50,
    color='#f77ea8',
)
plt.show()


In [None]:
# One horizontal box plot per feature column (the 'Outcome' target is left out)

feature_names = df.columns.drop('Outcome')
for feature in feature_names:
    sns.boxplot(data=df, x=feature, color='#88f8d1')
    plt.title(feature)
    plt.grid()
    plt.show()


In [None]:
# Count plot of 'Pregnancies', split by 'Outcome', with each bar labelled
# by its count. Bars are ordered by overall frequency of the pregnancy count.


fig, ax1 = plt.subplots(figsize=(20, 10))
plt.grid()
sns.set_theme(style="whitegrid")
plt.title('Number of pregnancies based on having diabetes or not', fontsize=22)
graph = sns.countplot(ax=ax1, x='Pregnancies', data=df,
                      order=df['Pregnancies'].value_counts().index, hue='Outcome')


for p in graph.patches:
    height = p.get_height()
    # Some Pregnancies/Outcome combinations have no rows; their bars have a
    # NaN or zero height and must not receive a label.
    if np.isnan(height) or height == 0:
        continue
    # Heights are counts, so show them as integers rather than "106.0".
    graph.annotate(str(int(height)), (p.get_x(), height + 1), fontsize=11, rotation=45)


# Data Normalization and Splitting


1.   Drops the Outcome column from the DataFrame to focus on features.
2.   Initializes a Min-Max scaler to scale features between 0 and 1.




In [None]:
# Feature table: every column of df except the 'Outcome' target
dfx = df.drop(columns=['Outcome'])


In [None]:
# Rescale every feature column of dfx to the [0, 1] range.
Scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
Norm = Scaler.fit_transform(dfx)
# Reuse dfx's own labels instead of a hard-coded column list, so the scaled
# table stays aligned with dfx even if the feature set changes.
Norm_df = pd.DataFrame(Norm, columns=dfx.columns, index=dfx.index)
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so test-row minima/maxima influence the scaling -- consider fitting on the
# training rows only.


In [None]:
# Model inputs: scaled features, and the 'Outcome' target as an (n, 1) column
x = Norm_df
y = np.reshape(df['Outcome'].values, (-1, 1))

# Train-test splitting

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


# K-Nearest Neighbors(Uniform weights)

In [None]:
# Test-set accuracy of uniform-weight KNN for n_neighbors = 1..k.
# The sweep must include the value used for the final model (n_neighbors = 7);
# with k = 6 that value was never evaluated. k = 20 matches the
# distance-weighted sweep later in the file.
k = 20
acc = np.zeros(k)

for i in range(1, k + 1):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(x_train, y_train.ravel())
    y_pred = model.predict(x_test)
    # acc[i - 1] holds the accuracy for n_neighbors = i
    acc[i - 1] = metrics.accuracy_score(y_test.ravel(), y_pred)
# NOTE(review): k is being selected on the test set; a validation split or
# cross-validation would give an unbiased choice.
acc

# n_neighbors = 7 is the best(0.81168831)


In [None]:
# Plot train and test accuracy of uniform-weight KNN against n_neighbors.

train_acc = []
test_acc = []

# Cover 1..20 so the curve includes n_neighbors = 7, the value used for the
# final model; the previous range(1, 6) stopped at 5.
neighbors_setting = range(1, 21)

for n_neighbor in neighbors_setting:
    model = KNeighborsClassifier(n_neighbors=n_neighbor)
    model.fit(x_train, y_train.ravel())
    train_acc.append(model.score(x_train, y_train.ravel()))
    test_acc.append(model.score(x_test, y_test.ravel()))

plt.plot(neighbors_setting, train_acc, label='train')
plt.plot(neighbors_setting, test_acc, label='test')
plt.xlabel('number of K')
plt.ylabel('accuracy')
plt.legend()


In [None]:
# Final uniform-weight KNN model with n_neighbors = 7, and its test predictions

# fit() returns the estimator itself, so construction and fitting can be chained.
kmodel = KNeighborsClassifier(n_neighbors=7).fit(x_train, y_train.ravel())
y_pred = kmodel.predict(x_test)


In [None]:
# Fraction of test rows whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)


In [None]:
# Confusion matrix for the KNN predictions, drawn as an annotated 2x2 image

aq = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(aq)
ax.grid(False)
ax.xaxis.set_ticks((0, 1))
ax.xaxis.set_ticklabels(('predicted 0s', 'predicted 1s'))
ax.yaxis.set_ticks((0, 1))
ax.yaxis.set_ticklabels(('actual 0s', 'actual 1s'))
# Inverted limits put row 0 ("actual 0s") at the top.
ax.set_ylim(1.5, -0.5)
# Write each count in the centre of its cell (x = column, y = row).
for row, col in np.ndindex(2, 2):
    ax.text(col, row, aq[row, col], ha='center', va='center', color='#cb1c8b')
plt.show()

In [None]:
# Text report of the per-class metrics for the current predictions
report = classification_report(y_test, y_pred)
print(report)

# Random Forest classifier

In [None]:
# Shallow random forest (trees limited to depth 2) and its test predictions.
# fit() returns the estimator itself, so construction and fitting are chained.
rmodel = RandomForestClassifier(random_state=0, max_depth=2).fit(
    x_train, y_train.ravel()
)

y_pred = rmodel.predict(x_test)


In [None]:
# Test accuracy of the predictions currently held in y_pred
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)


In [None]:
# Confusion matrix for the random forest predictions, as an annotated 2x2 image
aq = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(aq)
ax.grid(False)
ax.xaxis.set_ticks((0, 1))
ax.xaxis.set_ticklabels(('predicted 0s', 'predicted 1s'))
ax.yaxis.set_ticks((0, 1))
ax.yaxis.set_ticklabels(('actual 0s', 'actual 1s'))
# Inverted limits put row 0 ("actual 0s") at the top.
ax.set_ylim(1.5, -0.5)
# Write each count in the centre of its cell (x = column, y = row).
for row, col in np.ndindex(2, 2):
    ax.text(col, row, aq[row, col], ha='center', va='center', color='#cb1c8b')
plt.show()

In [None]:
# Text report of the per-class metrics for the current predictions
report = classification_report(y_test, y_pred)
print(report)


In [None]:
# Rows of df whose 'Outcome' is 1
is_positive = df['Outcome'] == 1
df1 = df[is_positive]


In [None]:
# Append the Outcome == 1 rows (df1) to df, so each of those rows appears twice;
# ignore_index=True renumbers the combined rows from 0.
df2 = pd.concat([df, df1], ignore_index=True)
# Bare expression: rendered as the cell output when run in a notebook.
df2


In [None]:
# Feature table of the enlarged data: every column of df2 except 'Outcome'
dfx = df2.drop(columns=['Outcome'])


In [None]:
# Rescale every feature column of dfx to the [0, 1] range.
Scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
Norm = Scaler.fit_transform(dfx)
# Reuse dfx's own labels instead of a hard-coded column list, so the scaled
# table stays aligned with dfx even if the feature set changes.
Norm_df = pd.DataFrame(Norm, columns=dfx.columns, index=dfx.index)


In [None]:
# Model inputs built from the oversampled data (df2).
x = Norm_df
y = df2['Outcome'].values.reshape(-1, 1)

# Re-create the train/test split. Without this the models below keep using
# the x_train/x_test/y_train/y_test produced from the original data, and the
# oversampling has no effect at all.
#
# df2 is laid out as: the rows of df, followed by one copy of each
# Outcome == 1 row of df in the same order. The original rows are split with
# the same parameters as the first split, and a copy is added to the training
# set only when its source row is a training row. This keeps duplicated rows
# out of the test set, which would otherwise inflate the scores.
n_original = len(df)
train_rows, test_rows = train_test_split(
    np.arange(n_original), random_state=1, test_size=0.2
)
positive_rows = np.flatnonzero((df['Outcome'] == 1).to_numpy())
# Copy number j sits at position n_original + j and duplicates positive_rows[j].
copy_rows = n_original + np.flatnonzero(np.isin(positive_rows, train_rows))
train_rows = np.concatenate([train_rows, copy_rows])

x_train, x_test = x.iloc[train_rows], x.iloc[test_rows]
y_train, y_test = y[train_rows], y[test_rows]


# KNN model(Distance weights)

In [None]:
# Test-set accuracy of distance-weighted KNN for n_neighbors = 1..k
k = 20
acc = np.zeros(k)

for slot, n_neighbors in enumerate(range(1, k + 1)):
    model = KNeighborsClassifier(weights='distance', n_neighbors=n_neighbors)
    model.fit(x_train, y_train.ravel())
    y_pred = model.predict(x_test)
    # acc[slot] holds the accuracy for n_neighbors = slot + 1
    acc[slot] = metrics.accuracy_score(y_test, y_pred)
acc

In [None]:
# Distance-weighted KNN model with n_neighbors = 16, and its test predictions.
# fit() returns the estimator itself, so construction and fitting are chained.
kmodel = KNeighborsClassifier(weights='distance', n_neighbors=16).fit(
    x_train, y_train.ravel()
)

y_pred = kmodel.predict(x_test)


In [None]:
# Test accuracy of the distance-weighted KNN model (k = 16)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)


In [None]:
# Confusion matrix for the current predictions, drawn as an annotated 2x2 image
aq = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(aq)
ax.grid(False)
ax.xaxis.set_ticks((0, 1))
ax.xaxis.set_ticklabels(('predicted 0s', 'predicted 1s'))
ax.yaxis.set_ticks((0, 1))
ax.yaxis.set_ticklabels(('actual 0s', 'actual 1s'))
# Inverted limits put row 0 ("actual 0s") at the top.
ax.set_ylim(1.5, -0.5)
# Write each count in the centre of its cell (x = column, y = row).
for row, col in np.ndindex(2, 2):
    ax.text(col, row, aq[row, col], ha='center', va='center', color='#cb1c8b')
plt.show()

In [None]:
# Text report of the per-class metrics for the current predictions
report = classification_report(y_test, y_pred)
print(report)


# Random Forest classifier

In [None]:
# Larger random forest: 500 entropy-criterion trees of depth up to 20,
# followed by its test predictions.
# fit() returns the estimator itself, so construction and fitting are chained.
rmodel = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=20,
    random_state=0,
).fit(x_train, y_train.ravel())

y_pred = rmodel.predict(x_test)


In [None]:
# Test accuracy of the predictions currently held in y_pred
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)


In [None]:
# Confusion matrix for the random forest predictions, as an annotated 2x2 image

aq = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(aq)
ax.grid(False)
ax.xaxis.set_ticks((0, 1))
ax.xaxis.set_ticklabels(('predicted 0s', 'predicted 1s'))
ax.yaxis.set_ticks((0, 1))
ax.yaxis.set_ticklabels(('actual 0s', 'actual 1s'))
# Inverted limits put row 0 ("actual 0s") at the top.
ax.set_ylim(1.5, -0.5)
# Write each count in the centre of its cell (x = column, y = row).
for row, col in np.ndindex(2, 2):
    ax.text(col, row, aq[row, col], ha='center', va='center', color='#cb1c8b')
plt.show()