In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
df = pd.read_csv('model.csv')

In [45]:
df.rename(columns={'Unnamed: 0':'id'},inplace=True)

In [46]:
df.columns

Index(['id', 'Pregnancies', 'Outcome', 'Glucose_log', 'BloodPressure_exp',
       'SkinThickness_Boxcox', 'Insulin_log', 'BMI_log',
       'DiabetesPedigreeFunction_log', 'Age_boxcox'],
      dtype='object')

### scaling

In [47]:
dfc = df.copy()
y = dfc['Outcome']
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X =  pd.DataFrame(sc_X.fit_transform(dfc.drop(["Outcome","id"],axis = 1),),
        columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'])

In [49]:
y

0       1
1       0
2       1
3       1
4       0
       ..
1995    0
1996    1
1997    0
1998    1
1999    0
Name: Outcome, Length: 2000, dtype: int64

In [23]:
#X = df.iloc[:,2:]
#y = df['Outcome']

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
# handle imbalance dataset

In [51]:
df.isnull().sum()

id                              0
Pregnancies                     0
Outcome                         0
Glucose_log                     0
BloodPressure_exp               0
SkinThickness_Boxcox            0
Insulin_log                     0
BMI_log                         0
DiabetesPedigreeFunction_log    0
Age_boxcox                      0
dtype: int64

In [52]:
from imblearn.combine import SMOTETomek
from collections import Counter

os=SMOTETomek(1)
X_train_os,y_train_os=os.fit_sample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_os)))


The number of classes before fit Counter({0: 927, 1: 473})
The number of classes after fit Counter({1: 925, 0: 925})




In [53]:
X_train = X_train_os
y_train = y_train_os

In [54]:
from sklearn.ensemble import RandomForestClassifier  # import model_selection

rf = RandomForestClassifier() 

from pprint import pprint
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [55]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in range(100,2000,2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]


In [56]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [100,
               102,
               104,
               106,
               108,
               110,
               112,
               114,
               116,
               118,
               120,
               122,
               124,
               126,
               128,
               130,
               132,
               134,
               136,
               138,
               140,
               142,
               144,
               146,
               148,
               150,
               152,
               154,
               156,
               158,
               160,
               162,
               164,
               166,
               168,
               170,
               172,
               174,
               176,
               178,
               180,
               182,
               184,
               186,
               188,
               190,
               192,
               194,
           

In [57]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 5, verbose=2, random_state=42, n_jobs = -1)

In [58]:
# Fit the random search model
rf_random.fit(X_train,y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 25.4min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [100, 102, 104, 106, 108,
                                                      110, 112, 114, 116, 118,
                                                      120, 122, 124, 126, 128,
                                                      130, 132, 134, 136, 138,
                                                      140, 142, 144, 146, 148,
                                                      150, 152, 154, 156, 158, ...],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 294, 389, 484,
                                                      

In [60]:
rf_random.best_estimator_

RandomForestClassifier(max_depth=234, max_features='sqrt', n_estimators=200)

In [61]:
rf_random.best_score_

0.9540540540540541

In [62]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 234,
 'bootstrap': True}

In [63]:
y_pred=rf_random.best_estimator_.predict(X_test)

In [64]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test,y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

MAE: 0.035
MSE: 0.035
RMSE: 0.18708286933869708
[[373  16]
 [  5 206]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       389
           1       0.93      0.98      0.95       211

    accuracy                           0.96       600
   macro avg       0.96      0.97      0.96       600
weighted avg       0.97      0.96      0.97       600

