In [162]:
# Add dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
sns.set()  #  Will import Seaborn functionalities

import warnings
warnings.filterwarnings('ignore')

# Data Understanding EDA

In [163]:
df =pd.read_csv('Resources/diabetes.csv')
df.tail()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [164]:
type(df)

pandas.core.frame.DataFrame

In [165]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [166]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [167]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [168]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [169]:
df['Outcome'].value_counts()
#df.groupby("Outcome").size()

0    500
1    268
Name: Outcome, dtype: int64

In [170]:
df['Outcome'].value_counts(normalize=True)


0    0.651042
1    0.348958
Name: Outcome, dtype: float64

# Machine Learning Models

<!-- 1 - Need to identify X and y/
2 - Split our data using the Train-Test-Split Method (80% vs 20%)
3 - Import the model (library)
4 - Initiate the model engine - by simply calling the model/function
5 - Train the model on the 80% (X and y) using fit() method
6 - Test the model on the 20% (X only) using the predict() method
7 - We compare the Predicted outcome with the Actual outcome
8 - We make the decision -->

# X, y 

In [171]:
# Drop the diabetes values and set the X to the remaining data.
X = df.drop("Outcome", axis=1)
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [172]:
X.shape

(768, 8)

In [173]:
# Set the y variable to the "Outcome" column.
y = df_copy["Outcome"]
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [174]:
y.shape

(768,)

In [175]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

## Split our data into training and testing data

In [176]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Dummy Classifier

In [177]:
from sklearn.dummy import DummyClassifier

dummy_frequent_clf = DummyClassifier(strategy="most_frequent")

dummy_frequent_clf.fit(X_test, y_test)

DummyClassifier(strategy='most_frequent')

In [178]:
y_pred =dummy_frequent_clf.predict(X_test)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]


In [179]:
score_train = dummy_frequent_clf.score(X_train, y_train)
score_test = dummy_frequent_clf.score(X_test, y_test)
print('The training fraction of correct classifications is: {:5.3f}'.format(score_train))
print('The testing fraction of correct classifications is: {:5.3f}'.format(score_test))


The training fraction of correct classifications is: 0.655
The testing fraction of correct classifications is: 0.641


# Confusion Matrix

In [180]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [181]:
CM = confusion_matrix(y_test,y_pred)
CM

array([[123,   0],
       [ 69,   0]], dtype=int64)

In [182]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      1.00      0.78       123
           1       0.00      0.00      0.00        69

    accuracy                           0.64       192
   macro avg       0.32      0.50      0.39       192
weighted avg       0.41      0.64      0.50       192



## Feature scaling

In [183]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic Regression

In [184]:
# Import the model
from sklearn.linear_model import LogisticRegression
# Instantiate a logistic regression model
classifier = LogisticRegression(max_iter=10000)
# Train the Model we use fit () 
classifier.fit(X_train_scaled, y_train)
# validate the model we use predict()
y_pred = classifier.predict(X_test_scaled)

##  predicted values Vs actual values

In [185]:
# we validate the model, or evaluate its performance by using accuracy score. 
values = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
print(values.head(30))
print(accuracy_score( y_pred,y_test))

     Prediction  Actual
668           0       0
324           0       0
624           0       0
690           0       0
473           0       0
204           0       0
97            0       0
336           1       0
568           1       0
148           1       0
667           0       1
212           1       0
199           0       1
265           0       0
760           0       0
356           0       1
501           0       0
457           0       0
604           1       1
213           1       1
636           0       0
544           0       0
86            1       0
208           0       0
281           1       0
209           1       1
581           0       0
639           0       0
328           0       1
431           0       0
0.7291666666666666


# confusion_matrix

In [189]:
lr_CM = confusion_matrix(y_test,y_pred)
lr_CM

array([[95, 28],
       [24, 45]], dtype=int64)

In [190]:
# Create a DataFrame from the confusion matrix.
lr_df = pd.DataFrame(
    lr_CM, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

lr_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,95,28
Actual 1,24,45


In [191]:
target_names = ["negative", "positive"]
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.80      0.77      0.79       123
    positive       0.62      0.65      0.63        69

    accuracy                           0.73       192
   macro avg       0.71      0.71      0.71       192
weighted avg       0.73      0.73      0.73       192



In [192]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.7795138888888888
Testing Data Score: 0.7291666666666666


## Random Forests

In [150]:
# Import a Random Forests classifier
from sklearn.ensemble import RandomForestClassifier
# Fit a model, and then print a classification report
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    negative       0.81      0.81      0.81       123
    positive       0.67      0.67      0.67        69

    accuracy                           0.76       192
   macro avg       0.74      0.74      0.74       192
weighted avg       0.76      0.76      0.76       192

Training Score: 1.0
Testing Score: 0.7604166666666666


In [151]:
rf_cm = confusion_matrix(y_test, y_pred)
rf_cm

array([[100,  23],
       [ 23,  46]], dtype=int64)

## ExtraTreesClassifier

In [152]:
# Import an Extremely Random Trees classifier
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    negative       0.79      0.81      0.80       123
    positive       0.65      0.61      0.63        69

    accuracy                           0.74       192
   macro avg       0.72      0.71      0.71       192
weighted avg       0.74      0.74      0.74       192

Training Score: 1.0
Testing Score: 0.7395833333333334


In [153]:
ex_cm = confusion_matrix(y_test, y_pred)
ex_cm

array([[100,  23],
       [ 27,  42]], dtype=int64)

## Adoptive Boosting classfier

In [154]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(random_state=1).fit(X_train_scaled, y_train)
clf

AdaBoostClassifier(random_state=1)

In [155]:
adb_cm = confusion_matrix(y_test, y_pred)
adb_cm

array([[100,  23],
       [ 27,  42]], dtype=int64)

## K Neighbors Classifier (KNN)

In [156]:
from sklearn.neighbors import KNeighborsClassifier
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    negative       0.80      0.77      0.79       123
    positive       0.62      0.65      0.63        69

    accuracy                           0.73       192
   macro avg       0.71      0.71      0.71       192
weighted avg       0.73      0.73      0.73       192

Training Score: 0.7795138888888888
Testing Score: 0.7291666666666666


## Support Vector Classifier

In [157]:
from sklearn.svm import SVC

clf = SVC()
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    negative       0.77      0.82      0.80       123
    positive       0.64      0.57      0.60        69

    accuracy                           0.73       192
   macro avg       0.71      0.69      0.70       192
weighted avg       0.72      0.73      0.73       192

Training Score: 0.8315972222222222
Testing Score: 0.7291666666666666


In [158]:
confusion_matrix(y_test, y_pred)

array([[101,  22],
       [ 30,  39]], dtype=int64)

In [159]:
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.77      0.82      0.80       123
    positive       0.64      0.57      0.60        69

    accuracy                           0.73       192
   macro avg       0.71      0.69      0.70       192
weighted avg       0.72      0.73      0.73       192



## Decision Tree Classifier

In [160]:
from sklearn.tree import DecisionTreeClassifier

clf =DecisionTreeClassifier(criterion='entropy', random_state=45  , max_depth=4)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    negative       0.83      0.64      0.72       123
    positive       0.55      0.77      0.64        69

    accuracy                           0.69       192
   macro avg       0.69      0.71      0.68       192
weighted avg       0.73      0.69      0.69       192

Training Score: 0.7847222222222222
Testing Score: 0.6875


In [161]:
dt_cm= confusion_matrix(y_test, y_pred)
dt_cm

array([[79, 44],
       [16, 53]], dtype=int64)