In [None]:
pip install numpy pandas matplotlib

In [None]:
pip install seaborn scipy warnings

In [None]:
pip install streamlit

In [6]:
pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Using cached scikit_learn-0.24.2-cp39-cp39-macosx_10_13_x86_64.whl (7.3 MB)
Collecting joblib>=0.11
  Using cached joblib-1.0.1-py3-none-any.whl (303 kB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.
Installing collected packages: joblib, threadpoolctl, scikit-learn, sklearn
    Running setup.py install for sklearn ... [?25ldone
[?25hSuccessfully installed joblib-1.0.1 scikit-learn-0.24.2 sklearn-0.0 threadpoolctl-2.1.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [72]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


#For Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix 
from scipy.stats import gaussian_kde


#For Machine Learning Algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


In [9]:
df = pd.read_csv('/Users/thejakamahaulpatha/PycharmProjects/Diabetese/Datasets/diabetes.csv')

In [None]:
######### Descriptive Analysis #########

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.dtypes

In [37]:
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [None]:
df.Pregnancies.unique()

In [None]:
df.describe()

In [None]:
######### Descriptive Analysis with plotting #########

In [None]:
######### 1. Histrograms

In [None]:
fig, ax = plt.subplots(9,figsize=(15,30))
fig.patch.set_facecolor('#f6f5f7')
# ax.set_facecolor('#f6f5f5')

for i in range(9):
    df.hist(column=df.columns[i],bins=12, ax=ax[i], alpha=0.5, color='red')

In [None]:
######### Another way of showing the histrograms 

In [None]:
numbers = pd.Series(df.columns)
df[numbers].hist(figsize = (14, 14))
plt.show();

In [None]:
######### 2. Boxplots

In [None]:
df.plot(kind = 'box', subplots = True, layout = (3, 3), sharex = False, sharey = False, figsize = (12, 12));

In [None]:
######### 3. Density Plots

In [None]:
df.plot(kind = 'density', subplots = True, layout = (3, 3), sharex = False, sharey = False, figsize = (12, 12));

In [None]:
######### 4. Scatter Matrix and Scatter Plots

In [None]:
scatter_matrix(df, alpha = 0.2, figsize = (50, 50))       

In [None]:
######### A better way to do the same

In [None]:
sns.pairplot(df, hue = "Outcome");

In [None]:
x = df['SkinThickness']
y = df['Outcome']

xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)

plt.scatter(x, y, c=z, s=100)
plt.show()

In [None]:
g = sns.FacetGrid(df, col = 'Outcome')
g.map(sns.scatterplot, 'SkinThickness','Outcome')

In [None]:
#Comparing the means of SkinThickness based on Outcome 

a = df[(df["Outcome"] == 0)]
b = df[(df["Outcome"] == 1)]

print(a['SkinThickness'].mean(),b['SkinThickness'].mean())


# We can see that the mean of Skinthickness where the Outcome is 1 has a greater value. 

In [None]:
#Checking how many null values are in the SkinThickness column. 

df.SkinThickness.count()

c = df[df['SkinThickness']==0]
count  = c.SkinThickness.count()

avg_skinThickness = count / df.SkinThickness.count()

print(avg_skinThickness*100)


In [None]:
#Comparing the means of SkinThickness based on Outcome but excluding '0' in Skinthickness as there are 28% values of 0s 

a = df[(df["Outcome"] == 0) & (df["SkinThickness"] > 0)]
b = df[(df["Outcome"] == 1) & (df["SkinThickness"] > 0)]

print(a['SkinThickness'].mean(),b['SkinThickness'].mean())


In [None]:
n = df[df["SkinThickness"]>0]

g = sns.FacetGrid(n, col = 'Outcome')
g.map(sns.scatterplot, 'SkinThickness','Outcome')


In [None]:
######### 5. Heatmaps

In [None]:
# See the correlation of the variables using a Heatmap
corr = df.corr()
f, ax = plt.subplots(figsize = (20, 9))
sns.heatmap(corr, vmax = 1, annot = True);

In [None]:
######### Implementing Machine Learning Model
# We will use a RandomForestClassification

In [64]:
# 1. Let's split the data

x = df.iloc[:,0:8].values
y = df.iloc[:,-1].values

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=1) 

In [65]:
#2. Fitting the date to the model

rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)

RandomForestClassifier()

In [66]:
# Predicting on the test data to see the accuracy of the mode

round(float(accuracy_score(y_test,rfc.predict(x_test)))*100)

97

In [None]:
######### Let's test some more Algorithms 

In [74]:
#Creating a list with the models
models = []
models.append(('LR',LogisticRegression(solver='liblinear',multi_class='ovr')))
models.append(('LDA',LinearDiscriminantAnalysis()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('CART',DecisionTreeClassifier()))
models.append(('NB',GaussianNB()))
models.append(('RFC',RandomForestClassifier()))
models.append(('SVM',SVC(gamma='auto')))

In [79]:
# Evaluate each model in turn
results = []
names = []

for name,model in models:
    kfold = StratifiedKFold(n_splits=10,random_state=1,shuffle=True)
    cv_results = cross_val_score(model,x_train,y_train,cv=kfold,scoring='accuracy')
#     print(cv_results)
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)'%(name,cv_results.mean(),cv_results.std()))


LR: 0.774000 (0.014742)
LDA: 0.774000 (0.020753)
KNN: 0.776000 (0.033625)
CART: 0.954667 (0.025785)
NB: 0.756000 (0.031156)
RFC: 0.960667 (0.016452)
SVM: 0.944667 (0.017651)
