In [48]:
import pandas as pd
import numpy as np
import joblib

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Load data

In [34]:
X_train = pd.read_csv('../data/preprocessed/X_train.csv')
X_test = pd.read_csv('../data/preprocessed/X_test.csv')
y_train = pd.read_csv('../data/preprocessed/y_train.csv')
y_test = pd.read_csv('../data/preprocessed/y_test.csv')

In [35]:
# instantiating random forest
rforest = RandomForestClassifier(random_state = 75)

In [36]:
# range for n estimators varying from 20 to 2000 
n_estimators = np.arange(20, 2000, step = 20)
# criterions
criterion = ["gini", "entropy"]
# max_features types
max_features = ["auto", "sqrt", "log2"]
# varying depth of trees from 2 to 200
max_depth = list(np.arange(2, 200, step = 1))
# variying the min samples split from 2 to 100
min_samples_split = np.arange(2, 100, step = 2)
# varying the minime leaves' samples
min_samples_leaf = [1, 2, 4, 6, 8, 10]
# bootstrap true or false
bootstrap = [True, False]

In [37]:
# dictionary with parameters to vary
param_grid = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

In [38]:
# hyperparameters on randomized search cv
random_cv = RandomizedSearchCV(rforest, 
                               param_grid, 
                               n_iter = 10, 
                               cv = 10, 
                               scoring = "recall", 
                               n_jobs = -1, 
                               random_state = 75
)

In [39]:
# fitting
rcv = random_cv.fit(X_train, y_train)

  self.best_estimator_.fit(X, y, **fit_params)


In [40]:
# creating dataframe with hyperparameters' tests
tuning = pd.DataFrame(rcv.cv_results_)

In [41]:
# instanntiating random forest
rfn = RandomForestClassifier(**rcv.best_params_, random_state = 75)

# training model
model = rfn.fit(X_train, y_train)
# predicted values
y_pred = rfn.predict(X_test)

  model = rfn.fit(X_train, y_train)


In [42]:
# analyzing accuracy
print('Accuracy: {}'.format(metrics.accuracy_score(y_test, y_pred)))
# analyzing precision
print('Precision: {}'.format(metrics.precision_score(y_test, y_pred)))
# analyzing recall
print('Recall: {}'.format(metrics.recall_score(y_test, y_pred)))
# analyzing F1 score
print('F1 Score: {}'.format(metrics.f1_score(y_test, y_pred)))

Accuracy: 0.9993603306598435
Precision: 0.9
Recall: 0.42857142857142855
F1 Score: 0.5806451612903225


In [49]:
joblib.dump(model, '../model/model_random_forest_1.0.pkl')

['../model/model_random_forest_1.0.pkl']