In [1]:
import pandas as pd
import numpy as np
from imblearn.combine import SMOTETomek ##For upsampling
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report
import pickle #for serialization
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset=pd.read_csv("diabetes.csv")##read our dataset


In [3]:
##Lets separate features into dependant and independent feature
X=dataset.drop("Outcome",axis=1)
y=dataset["Outcome"]
print(X.shape,y.shape)

(768, 8) (768,)


In [4]:
##Lets see the count of our target feature
y.value_counts()##returns count of unique class in that feature

0    500
1    268
Name: Outcome, dtype: int64

### As there are only 39% of data for class 1 records in the dataset the model might get baised

 <b>The problem caused by imbalanced dataset is it sort of creates bias towards one class
 thus leading to wrong predictions</b>

# Upsampling
###  It is nothing but adding more copies to our minority class.Good option when we have small amount of data

In [5]:
smote=SMOTETomek(random_state=42,n_jobs=-1)##Library used to do upsampling
X_,Y_=smote.fit_resample(X,y)

In [6]:
print(X_.shape,Y_.shape)##Dimensions of data increased

(952, 8) (952,)


In [7]:
Y_.value_counts()

0    476
1    476
Name: Outcome, dtype: int64

### As we can see that now our dataset is balanced.Now we can create our model

In [8]:
##Model

In [9]:
x_train,x_test,y_train,y_test=train_test_split(X_,Y_,test_size=0.25,random_state=42)
print(x_train.shape,y_train.shape)

(714, 8) (714,)


In [10]:
col=["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
for feature in col:
    x_train[feature].replace(0,x_train[feature].median(),inplace=True)

In [11]:
rf=RandomForestClassifier(n_jobs=-1)
rf.fit(x_train,y_train)

RandomForestClassifier(n_jobs=-1)

In [12]:
score=cross_val_score(rf,x_train,y_train,cv=10,n_jobs=-1)

In [13]:
##Lets see the models training accuracy
score

array([0.88888889, 0.77777778, 0.90277778, 0.88888889, 0.78873239,
       0.77464789, 0.78873239, 0.88732394, 0.8028169 , 0.76056338])

In [14]:
score.shape

(10,)

In [15]:
score.mean()

0.8261150234741784

### Mean training accuracy is   82.6%

In [16]:
score_test=cross_val_score(rf,x_test,y_test,cv=10,n_jobs=-1)
score_test

array([0.79166667, 0.875     , 0.75      , 0.875     , 0.875     ,
       0.95833333, 0.83333333, 0.75      , 0.7826087 , 0.7826087 ])

In [17]:
score_test.mean()

0.8273550724637682

### Mean test accuracy is 82.73 %

In [18]:
print("The maximum accuracy that our model can get is {} and minimum accuracy the model can get is {}".format(np.round(score_test.max(),2),np.round(score_test.min(),2)))

The maximum accuracy that our model can get is 0.96 and minimum accuracy the model can get is 0.75


In [19]:
pred=rf.predict(x_test)
confusion_matrix(y_test,pred)

array([[99, 20],
       [20, 99]])

In [20]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       119
           1       0.83      0.83      0.83       119

    accuracy                           0.83       238
   macro avg       0.83      0.83      0.83       238
weighted avg       0.83      0.83      0.83       238



In [None]:
##hyperparmeters of random forest
RandomForestClassifier()

## Hyperparmeter Tunning

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

In [None]:
rs=RandomizedSearchCV(rf,random_grid,cv=10,verbose=,n_jobs=-1)
rs.fit(x_train,y_train)

In [None]:
rs.best_params_

In [None]:
pred=rs.predict(x_test)
confusion_matrix(y_test,pred)

In [None]:
print(classification_report(pred,y_test))

In [None]:
metrics.plot_roc_curve(rs,x_test,y_test)

In [None]:
model_file=open("model.pkl","wb")##to serialize
pickle.dump(rs,model_file)
model_file.close()##always remember to close it

In [None]:
##Test

In [None]:
model=pickle.load(open("model.pkl","rb"))
pred=model.predict(X[:50])

In [None]:
confusion_matrix(y[:50],pred)