In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
import pickle
import random

In [2]:
df = pd.read_csv('pima_dataset.csv')
df = df.drop(['Insulin'], axis=1)
df = df.drop(['DiabetesPedigreeFunction'], axis=1)

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,Age,Outcome
0,6,148,72,35,33.6,50,1
1,1,85,66,29,26.6,31,0
2,8,183,64,0,23.3,32,1
3,1,89,66,23,28.1,21,0
4,0,137,40,35,43.1,33,1


In [4]:
#prep X, y
y = df['Outcome']
X = df.drop(['Outcome'], axis=1)
print('X Shape: ' + str(X.shape))
print('y Shape: ' + str(y.shape))

X Shape: (768, 6)
y Shape: (768,)


In [5]:
#train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=random.seed(69))
print('X_train Shape: ' + str(X_train.shape))
print('X_test Shape: ' + str(X_test.shape))
print('y_train Shape: ' + str(y_train.shape))
print('y_test Shape: ' + str(y_test.shape))

X_train Shape: (614, 6)
X_test Shape: (154, 6)
y_train Shape: (614,)
y_test Shape: (154,)


In [6]:
#init basic random forest classifier
rf_model = RandomForestClassifier(oob_score=True, n_estimators=20, random_state=random.seed(69))
rf_model.fit(X_train, y_train)

#evaluate accuracy of vanilla classifier
rf_model.oob_score_ 

0.737785016286645

In [7]:
#generating all combinations of parameters for GridSearch
n_estimators = [int(x) for x in np.linspace(start=20, stop=150, num=5)] # number trees in random forest
max_features = ['auto', 'sqrt']
max_depth = [2, 4]
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]
bootstrap = [True, False]
oob_score = [True]

In [8]:
# combiations --> hashtable
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'oob_score': oob_score
             }
print(param_grid)

{'n_estimators': [20, 52, 85, 117, 150], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False], 'oob_score': [True]}


In [9]:
rf_model = RandomForestClassifier()
rf_grid = GridSearchCV(estimator = rf_model, param_grid = param_grid, cv = 10, verbose=2, n_jobs = 4)
rf_grid.fit(X_train, y_train)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    8.8s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   31.5s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 1532 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 1600 out of 1600 | elapsed:  2.2min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [20, 52, 85, 117, 150],
                         'oob_score': [True]},
             verbose=2)

In [10]:
regressor = rf_grid.best_estimator_
regressor.fit(X_train, y_train)
regressor.oob_score_

0.762214983713355

In [11]:
trained_model = 'trained_model.sav'
pickle.dump(regressor, open(trained_model, 'wb'))

In [12]:
test=[[1, 128, 98, 41, 32, 33]]
test2=[[1, 300, 90, 51, 48.7, 31]]

In [13]:
regressor.predict_proba(test2)

array([[0.35800665, 0.64199335]])

In [14]:
rf_model = pickle.load(open('trained_model.sav', 'rb'))