## Import Libraries

In [22]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report


## load dataset

In [23]:
#Load diabetes dataset
df = pd.read_csv('diabetes.csv')

In [24]:
#shape of the dataset
df.shape

(768, 9)

In [25]:
#view the dataset
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [26]:
# check for null value
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [27]:
#statistical information of dataset
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [28]:
# Check the count of target class
df.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [29]:
df.Outcome.value_counts(normalize=True)

0    0.651042
1    0.348958
Name: Outcome, dtype: float64

* class 0=> Non diabetic  => 65%
* class 1=> diabetic      => 35%

In [30]:
#info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [31]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

### Random Forest

In [32]:
X = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', ]]
y = df[['Outcome']]

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size =0.8, random_state = 123)


In [34]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(614, 8)
(154, 8)
(614, 1)
(154, 1)


### With default parameter

In [35]:
from sklearn.ensemble import RandomForestClassifier
rf =RandomForestClassifier()
rf.fit(X_train, y_train)   # training

  rf.fit(X_train, y_train)   # training


**prediction on test data**

In [36]:
# prediction on test data
print('****** prediction on test data *******')
predict_test = rf.predict(X_test)

# Actual Value : y_test
# prediction : predict_test
print('Confusion Matrix')
print(confusion_matrix(y_test, predict_test))

print('---------------------------------------------------')
print('Classification Report')
print(classification_report(y_test, predict_test))

****** prediction on test data *******
Confusion Matrix
[[85 11]
 [17 41]]
---------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.89      0.86        96
           1       0.79      0.71      0.75        58

    accuracy                           0.82       154
   macro avg       0.81      0.80      0.80       154
weighted avg       0.82      0.82      0.82       154



**prediction on train data**

In [37]:
# prediction on train data
print('****** prediction on train data *******')
predict_train = rf.predict(X_train)

# Actual Value : y_train
# prediction : predict_train
print('Confusion Matrix')
print(confusion_matrix(y_train, predict_train))

print('---------------------------------------------------')
print('Classification Report')
print(classification_report(y_train, predict_train))

****** prediction on train data *******
Confusion Matrix
[[404   0]
 [  0 210]]
---------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       404
           1       1.00      1.00      1.00       210

    accuracy                           1.00       614
   macro avg       1.00      1.00      1.00       614
weighted avg       1.00      1.00      1.00       614



* training accuracy=100%
* Testing accuracy=82%
* therefore model is Overfitted

### with Different parameters

In [42]:
from sklearn.ensemble import RandomForestClassifier
rf =RandomForestClassifier(n_estimators= 200, criterion= 'entropy', max_depth=6)
rf.fit(X_train, y_train)   # training

  rf.fit(X_train, y_train)   # training


In [43]:
# prediction on test data
print('****** prediction on test data *******')
predict_test = rf.predict(X_test)

# Actual Value : y_test
# prediction : predict_test
print('Confusion Matrix')
print(confusion_matrix(y_test, predict_test))

print('---------------------------------------------------')
print('Classification Report')
print(classification_report(y_test, predict_test))

****** prediction on test data *******
Confusion Matrix
[[88  8]
 [20 38]]
---------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.81      0.92      0.86        96
           1       0.83      0.66      0.73        58

    accuracy                           0.82       154
   macro avg       0.82      0.79      0.80       154
weighted avg       0.82      0.82      0.81       154



In [44]:
# prediction on train data
print('****** prediction on train data *******')
predict_train = rf.predict(X_train)

# Actual Value : y_train
# prediction : predict_train
print('Confusion Matrix')
print(confusion_matrix(y_train, predict_train))

print('---------------------------------------------------')
print('Classification Report')
print(classification_report(y_train, predict_train))

****** prediction on train data *******
Confusion Matrix
[[392  12]
 [ 62 148]]
---------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       404
           1       0.93      0.70      0.80       210

    accuracy                           0.88       614
   macro avg       0.89      0.84      0.86       614
weighted avg       0.88      0.88      0.87       614



* training accuracy=88%
* Testing accuracy=82%
* therefore model is Optimal

## End