In [1]:
# IMPORTING THE LIBRARIES TO BE USED

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# READING THE DATASET

df = pd.read_csv("Diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Create X (features variable)
X = df.drop("Outcome", axis=1)

# Create Y (labelled target variable)
Y = df["Outcome"]

In [26]:
# Choosing the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# We'll keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [41]:
# Fit the model into the training data
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

# Essentially the X and Y is split, so that the X_train and Y_train is used to train the data
# While the X_test and Y_test is used to test the data i.e. to evaluate the data, to see what it has learnt.

In [42]:
# Fitting the model into the data

clf.fit(X_train, Y_train)

RandomForestClassifier(n_estimators=90)

In [43]:
# Make a prediction

Y_preds = clf.predict(X_test)
Y_preds

array([0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1], dtype=int64)

In [50]:
Y_test

671    0
187    1
458    1
605    0
423    0
      ..
425    1
337    1
535    1
129    1
215    1
Name: Outcome, Length: 231, dtype: int64

In [45]:
# Evaluate the model on the training data and test data
clf.score(X_train, Y_train)

1.0

In [46]:
clf.score(X_test, Y_test)

0.7272727272727273

In [34]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(Y_test, Y_preds))

              precision    recall  f1-score   support

           0       0.73      0.89      0.80        99
           1       0.67      0.40      0.50        55

    accuracy                           0.71       154
   macro avg       0.70      0.64      0.65       154
weighted avg       0.71      0.71      0.69       154



In [35]:
confusion_matrix(Y_test, Y_preds)

array([[88, 11],
       [33, 22]], dtype=int64)

In [36]:
accuracy_score(Y_test, Y_preds)

0.7142857142857143

In [37]:
# Improve a model
# Try different amount of n_estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, Y_test) * 100: .2f}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set:  74.68%

Trying model with 20 estimators...
Model accuracy on test set:  71.43%

Trying model with 30 estimators...
Model accuracy on test set:  72.73%

Trying model with 40 estimators...
Model accuracy on test set:  72.08%

Trying model with 50 estimators...
Model accuracy on test set:  74.03%

Trying model with 60 estimators...
Model accuracy on test set:  72.73%

Trying model with 70 estimators...
Model accuracy on test set:  72.08%

Trying model with 80 estimators...
Model accuracy on test set:  70.13%

Trying model with 90 estimators...
Model accuracy on test set:  70.13%



In [38]:
# Saving the model and load it
import pickle

pickle.dump(clf, open("Random_Forest_Model_on_Diabetes.pkl", "wb"))

In [40]:
loaded_model = pickle.load(open("Random_Forest_Model_on_Diabetes.pkl", "rb"))
loaded_model.score(X_test, Y_test)

0.7012987012987013

In [48]:
import sklearn
sklearn.show_versions()




System:
    python: 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
executable: C:\Program Files\Anaconda3\python.exe
   machine: Windows-10-10.0.15063-SP0

Python dependencies:
          pip: 21.2.4
   setuptools: 61.2.0
      sklearn: 1.0.2
        numpy: 1.21.5
        scipy: 1.7.3
       Cython: 0.29.28
       pandas: 1.4.2
   matplotlib: 3.5.1
       joblib: 1.1.0
threadpoolctl: 2.2.0

Built with OpenMP: True


In [59]:
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [60]:
for i in range(3):
    tree = clf.estimators_[i]
    dot_data = export_graphviz(tree,
                              feature_names=X_train.columns,
                              filled=True,
                              max_depth=2,
                              impurity=False,
                              proportion=True)
    graph=graphviz.Source(dot_data)
    display(graph)

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x20dba6d48b0>

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x20dba6d4940>

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x20db8babb50>

In [3]:
!conda remove graphbiz
!conda install python-graphviz
graphviz.Source(dot_data).view()

Collecting package metadata (repodata.json): ...working... done


PackagesNotFoundError: The following packages are missing from the target environment:
  - graphbiz





Solving environment: ...working... failed
^C


NameError: name 'graphviz' is not defined