# 06. Random Forest Modeling
___


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_curve, roc_auc_score

from sklearn.model_selection import GridSearchCV




In [2]:
heart22 = pd.read_csv('~/Desktop/capstone-project-Tasnimacj/data/cleaned_data/heart22_preprocessed.csv',index_col=0)

In [3]:
y = heart22['HadAngina'] # Target Variable
X = heart22.drop('HadAngina', axis=1) 

In [4]:
print('Shape of y:', y.shape)
print('Shape of X:', X.shape)

Shape of y: (246013,)
Shape of X: (246013, 41)


In [5]:
#1st split

X_rem, X_test, y_rem, y_test = train_test_split(X, y, test_size=0.2, random_state=25, stratify=y)

print(f'The remainder set has {len(X_rem)} data points.')
print(f'The test set has {len(X_test)} data points.')

The remainder set has 196810 data points.
The test set has 49203 data points.


In [6]:

random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_rem, y_rem)


RandomForestClassifier()

In [7]:
print(f'Accuracy on remainder set: {random_forest_model.score(X_rem, y_rem)}')
print(f'Accuracy on test set: {random_forest_model.score(X_test, y_test)}')


Accuracy on remainder set: 0.9999237843605507
Accuracy on test set: 0.943946507326789


In [8]:
y_test_pred = random_forest_model.predict(X_test)

conmat = pd.DataFrame(
    data = confusion_matrix(y_test, y_test_pred),
    index = ['true 0', 'true 1'],
    columns = ['predicted 0', 'predicted 1']
)
display(conmat)

print(f'Recall score: {recall_score(y_test, y_test_pred)*100:0.2f}%')
print(f'Precision score: {precision_score(y_test, y_test_pred)*100:0.2f}%')
print(f'F1 score: {f1_score(y_test, y_test_pred)*100:0.2f}%')

Unnamed: 0,predicted 0,predicted 1
true 0,45766,446
true 1,2312,679


Recall score: 22.70%
Precision score: 60.36%
F1 score: 32.99%


In [9]:
from tempfile import mkdtemp
cachedir = mkdtemp()

In [10]:
# gridsearch

pipe = Pipeline([("scaler", StandardScaler()),
                 ("model", RandomForestClassifier())], memory=cachedir)
                 
param =  {'scaler': [ StandardScaler()],
          'model__max_depth' : [4, 8, 12],
          'model__n_estimators' : [64, 128] }


grid = GridSearchCV(estimator=pipe,param_grid=param, cv=5,verbose=1,refit=True)

In [11]:
fittedgrid = grid.fit(X_rem,y_rem)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [12]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'model', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'model__bootstrap', 'model__ccp_alpha', 'model__class_weight', 'model__criterion', 'model__max_depth', 'model__max_features', 'model__max_leaf_nodes', 'model__max_samples', 'model__min_impurity_decrease', 'model__min_impurity_split', 'model__min_samples_leaf', 'model__min_samples_split', 'model__min_weight_fraction_leaf', 'model__n_estimators', 'model__n_jobs', 'model__oob_score', 'model__random_state', 'model__verbose', 'model__warm_start'])

In [13]:
fittedgrid.best_params_

{'model__max_depth': 12,
 'model__n_estimators': 128,
 'scaler': StandardScaler()}

In [14]:
print(f"Best accuracy on the remainder set: {fittedgrid.score(X_rem, y_rem)}")
print(f"Best accuracy on the test set: {fittedgrid.score(X_test, y_test)}")


y_test_pred = fittedgrid.predict(X_test)


conmat = pd.DataFrame(
    data = confusion_matrix(y_test, y_test_pred),
    index = ['true 0', 'true 1'],
    columns = ['predicted 0', 'predicted 1']
)
display(conmat)

print(f'Recall score: {recall_score(y_test, y_test_pred)*100:0.2f}%')
print(f'Precision score: {precision_score(y_test, y_test_pred)*100:0.2f}%')
print(f'F1 score: {f1_score(y_test, y_test_pred)*100:0.2f}%')

Best accuracy on the remainder set: 0.9568060566028149
Best accuracy on the test set: 0.9442107188586062


Unnamed: 0,predicted 0,predicted 1
true 0,45873,339
true 1,2406,585


Recall score: 19.56%
Precision score: 63.31%
F1 score: 29.89%


In [15]:
# SMOTE
from imblearn.over_sampling import SMOTE

X_rem_sm, y_rem_sm = SMOTE(random_state=1).fit_resample(X_rem, y_rem)

In [16]:

print('Original class distribution')
print((y_rem).value_counts().sort_index(),'\n')

print('Resampled class distribution')
print((y_rem_sm).value_counts().sort_index())

Original class distribution
HadAngina
0    184848
1     11962
Name: count, dtype: int64 

Resampled class distribution
HadAngina
0    184848
1    184848
Name: count, dtype: int64


In [17]:
# gridsearch

pipe = Pipeline([("scaler", StandardScaler()),
                 ("model", RandomForestClassifier())], memory=cachedir)
                 
param =  {'scaler': [ StandardScaler()],
          'model__max_depth' : [4, 8, 12],
          'model__n_estimators' : [64, 128] }


grid = GridSearchCV(estimator=pipe,param_grid=param, cv=5,verbose=1,refit=True)

In [18]:
fittedgrid_sm = grid.fit(X_rem_sm,y_rem_sm)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [19]:
fittedgrid_sm.best_params_

{'model__max_depth': 12,
 'model__n_estimators': 128,
 'scaler': StandardScaler()}

In [20]:
fittedgrid_sm.best_estimator_

Pipeline(memory='/var/folders/r3/bz5mjtds4dvdw0hskxwvs9vc0000gp/T/tmpyvthk1ol',
         steps=[('scaler', StandardScaler()),
                ('model',
                 RandomForestClassifier(max_depth=12, n_estimators=128))])

In [21]:
print(f"Best accuracy on the remainder set: {fittedgrid.score(X_rem, y_rem)}")
print(f"Best accuracy on the test set: {fittedgrid.score(X_test, y_test)}")


y_test_pred = fittedgrid.predict(X_test)


conmat = pd.DataFrame(
    data = confusion_matrix(y_test, y_test_pred),
    index = ['true 0', 'true 1'],
    columns = ['predicted 0', 'predicted 1']
)
display(conmat)

print(f'Recall score: {recall_score(y_test, y_test_pred)*100:0.2f}%')
print(f'Precision score: {precision_score(y_test, y_test_pred)*100:0.2f}%')
print(f'F1 score: {f1_score(y_test, y_test_pred)*100:0.2f}%')

Best accuracy on the remainder set: 0.9568060566028149
Best accuracy on the test set: 0.9442107188586062


Unnamed: 0,predicted 0,predicted 1
true 0,45873,339
true 1,2406,585


Recall score: 19.56%
Precision score: 63.31%
F1 score: 29.89%
