In [23]:
import os
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

# Define function to compute the performance metrics for random forest
def compute_rf_metrics(X, y, smote=False, model_name='model'):
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler=MinMaxScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.transform(X_test)

    if smote:
        sm = SMOTE(random_state=27)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    
    # Train random forest model
    model = RandomForestClassifier(n_estimators=200,criterion="entropy")
    model.fit(X_train, y_train)

    # Save model with best performance
    joblib.dump(model, f'{model_name}.pkl')
    
    # Predict on test set
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Compute metrics
    auc = roc_auc_score(y_test, y_prob)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return auc, recall, precision, f1

def load_data(file_name):
    curr_file = os.path.join(os.getcwd().replace('models', 'data'), file_name)
    return pd.read_csv(curr_file, delimiter=",")

# Load data
data = load_data('data.csv')

# Extract features and labels for each dataset
X_data, y_data = data.drop('Outcome', axis=1), data['Outcome']

# Compute metrics for each dataset
metrics_data_rf = compute_rf_metrics(X_data, y_data, model_name='random_forrest')
metrics_data_smote_rf = compute_rf_metrics(X_data, y_data, smote=True, model_name='random_forrest_smote')

# Combine metrics into a single table for random forest
metrics_table_rf = pd.DataFrame({
    'Dataset': ['data', 'data_smote'],
    'AUC': [metrics_data_rf[0], metrics_data_smote_rf[0]],
    'Recall': [metrics_data_rf[1], metrics_data_smote_rf[1]],
    'Precision': [metrics_data_rf[2], metrics_data_smote_rf[2]],
    'F1': [metrics_data_rf[3], metrics_data_smote_rf[3]]
})

In [24]:
# Vizuallize metrics table
display(metrics_table_rf)

Unnamed: 0,Dataset,AUC,Recall,Precision,F1
0,data,0.844307,0.604651,0.764706,0.675325
1,data_smote,0.841589,0.651163,0.651163,0.651163


In [25]:
import numpy as np
from pprint import pprint

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 250, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 34, 58, 82, 106, 130, 154, 178, 202, 226, 250, None],
 'max_features': ['auto', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 650, 1100, 1550, 2000]}


In [26]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier("log_loss", class_weight="balanced_subsample")
# Random search of parameters, using 3 fold cross validation, 
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [27]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [28]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Average Error: 0.4000 degrees.
Accuracy = 96.67%.


In [29]:
joblib.dump(best_random, f'best_random_forrest.pkl')

['best_random_forrest.pkl']