In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [None]:
import warnings
# Installs an "ignore" filter with no category/module restriction, so every
# warning is silenced for the rest of the session.
# NOTE(review): this may also hide useful diagnostics from later cells
# (e.g. deprecation or convergence warnings) -- consider narrowing the filter.
warnings.filterwarnings(action="ignore")


In [None]:
%matplotlib inline

In [None]:
# Read the raw CSV and normalise +/-infinity to NaN in one expression so that
# infinite values are counted as missing by the null checks that follow.
dataset = pd.read_csv("diebeties.csv").replace([np.inf, -np.inf], np.nan)

In [None]:
# Preview the first rows (head() defaults to five) to sanity-check the load.
dataset.head()

In [None]:
# Summary statistics per numeric column: count, mean, std, min, quartiles, max.
dataset.describe()

In [None]:
# Column dtypes and non-null counts; a non-null count below the total number
# of rows means that column has missing values.
dataset.info()

In [None]:
# Cross-check: explicit number of missing (NaN) values in each column. This
# includes any +/-inf entries, which were converted to NaN at load time.
dataset.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted).
dataset.duplicated().sum()

In [None]:
# Pairwise Pearson correlation between all columns. Pearson is the pandas
# default and measures only the strength of *linear* association; it does not
# capture non-linear relationships. Use method="spearman" or "kendall" for
# rank-based (monotonic) association.
corr_data = dataset.corr(method="pearson")

# Draw on an explicit Axes so the title is attached to this figure regardless
# of what the pyplot "current axes" happens to be.
fig, ax = plt.subplots()
sns.heatmap(corr_data, ax=ax)
ax.set_title('correlation heatmap')

In [None]:
# Overlaid density of 'Pregnancies' for each outcome class.
# Drawing order (Outcome 1 first, then 0) must match the legend label order.
plt.figure()
for outcome, color in ((1, 'red'), (0, 'blue')):
    kde = sns.kdeplot(
        dataset['Pregnancies'][dataset['Outcome'] == outcome],
        color=color,
        fill=True,
    )
kde.set_xlabel('Pregnancies')
kde.set_ylabel('density')
kde.legend(['positive', 'negative'])

In [None]:
# Overlaid density of 'Glucose' for each outcome class.
# Drawing order (Outcome 1 first, then 0) must match the legend label order.
plt.figure()
for outcome, color in ((1, 'red'), (0, 'blue')):
    kde = sns.kdeplot(
        dataset['Glucose'][dataset['Outcome'] == outcome],
        color=color,
        fill=True,
    )
kde.set_xlabel('Glucose')
kde.set_ylabel('density')
kde.legend(['positive', 'negative'])

In [None]:
# Overlaid density of 'BMI' for each outcome class.
# Drawing order (Outcome 1 first, then 0) must match the legend label order.
plt.figure()
for outcome, color in ((1, 'red'), (0, 'blue')):
    kde = sns.kdeplot(
        dataset['BMI'][dataset['Outcome'] == outcome],
        color=color,
        fill=True,
    )
kde.set_xlabel('BMI')
kde.set_ylabel('density')
kde.legend(['positive', 'negative'])

In [None]:
# Glucose distribution per outcome class, with quartile lines inside each violin.
fig, ax = plt.subplots(figsize=(10, 8))
sns.violinplot(
    x='Outcome',
    y='Glucose',
    data=dataset,
    inner='quart',
    linewidth=2,
    split=True,
    ax=ax,
)

In [None]:
# In these columns a value of 0 is treated as "missing": each 0 is replaced by
# the mean of that column's non-zero entries.
# NOTE(review): the means are computed over the full dataset before the
# train/test split, so test rows influence the imputed values -- confirm this
# is acceptable.
zero_as_missing = ['Glucose', 'BloodPressure', 'Insulin', 'BMI', 'SkinThickness']
for column in zero_as_missing:
    nonzero_mean = dataset.loc[dataset[column] != 0, column].mean()
    dataset[column] = dataset[column].replace(0, nonzero_mean)

In [None]:
# Display the frame after the zero-value replacement in the previous cell.
dataset

In [None]:
# Target is the 'Outcome' column; the features are every remaining column.
y = dataset['Outcome']
x = dataset.drop(columns=['Outcome'])

In [None]:
# Feature matrix: all columns of `dataset` except 'Outcome'.
x

In [None]:
# Target vector: the 'Outcome' column.
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Features of the training split.
x_train

In [None]:
# Features of the held-out test split.
x_test

In [None]:
# Labels of the training split.
y_train

In [None]:
# Ground-truth labels of the held-out test split.
y_test

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Sweep k = 1..10 and record train/test accuracy for each fitted KNN model.
training_accuracy = []
test_accuracy = []
for n_neighbors in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(x_train, y_train)
    # score() predicts internally and compares against the labels passed in.
    # y_test must remain the ground-truth labels: assigning predictions to it
    # makes the test accuracy trivially 1.0 and corrupts every later
    # evaluation that uses y_test.
    training_accuracy.append(knn.score(x_train, y_train))
    test_accuracy.append(knn.score(x_test, y_test))

In [None]:
# Print the per-k accuracy lists collected by the neighbour sweep.
for caption, scores in (
    ('Training Accuracy:', training_accuracy),
    ('Test Accuracy:', test_accuracy),
):
    print(caption, scores)

In [None]:
# Accuracy as a function of k for both splits, on shared axes.
neighbor_counts = range(1, 11)
for scores, curve_label in (
    (training_accuracy, 'training accuracy'),
    (test_accuracy, 'test accuracy'),
):
    plt.plot(neighbor_counts, scores, label=curve_label)
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')
plt.legend()

In [None]:
# Refit KNN with k=9 and report accuracy on both splits.
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(x_train, y_train)
for features, labels, caption in (
    (x_train, y_train, 'training score'),
    (x_test, y_test, 'test score'),
):
    print(knn.score(features, labels), caption)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with no depth limit set; random_state fixes the fitted tree.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(x_train, y_train)
for features, labels, caption in (
    (x_train, y_train, 'training accuracy'),
    (x_test, y_test, 'test accuracy'),
):
    print(dt.score(features, labels), caption)

In [None]:
# Same tree, but capped at depth 3 to compare against the unrestricted one.
dt1 = DecisionTreeClassifier(max_depth=3, random_state=0)
dt1.fit(x_train, y_train)
for features, labels, caption in (
    (x_train, y_train, 'training accuracy'),
    (x_test, y_test, 'test accuracy'),
):
    print(dt1.score(features, labels), caption)

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline MLP trained on the raw (unscaled) features.
mlp = MLPClassifier(random_state=42)
mlp.fit(x_train, y_train)
for features, labels, caption in (
    (x_train, y_train, 'training accuracy'),
    (x_test, y_test, 'test accuracy'),
):
    print(mlp.score(features, labels), caption)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply those same statistics to the test split. Re-fitting the
# scaler on the test data would put the two splits on different scales and
# let test-set statistics leak into the evaluation.
sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

In [None]:
# Same MLP configuration, trained and evaluated on the standardised features.
mlp1 = MLPClassifier(random_state=42)
mlp1.fit(x_train_scaled, y_train)
for features, labels, caption in (
    (x_train_scaled, y_train, 'training accuracy'),
    (x_test_scaled, y_test, 'test accuracy'),
):
    print(mlp1.score(features, labels), caption)