# Random Forest Model

In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV

In [2]:
start_time = time.time()

# loading data and features:

fiducial_pt_full = pd.read_pickle('../output/fiducial_pt_full.pkl')
label_full = pd.read_pickle('../output/label_full.pkl')

# Randomly splitting data into training & test sets:
RSEED = 42

x_train, x_test, train_labels, test_labels = train_test_split(fiducial_pt_full, 
                                                          label_full, 
                                                          test_size=0.2, 
                                                          random_state=RSEED)

# Using pairwise distance as feature.
# extracting pairwise distance as features (78*77/2=3303 features)
# nrow=number of records of the dataset; ncol=3303

feature_train = np.stack((metrics.pairwise_distances(x_train[i])[np.triu_indices(78, k = 1)] for i in range(x_train.shape[0])))
feature_test = np.stack((metrics.pairwise_distances(x_test[i])[np.triu_indices(78, k = 1)] for i in range(x_test.shape[0])))


  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [10]:
model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
start = time.time()
# Fit on training data
model.fit(feature_train, train_labels)
print('Training takes {:.2f}'.format(time.time()-start))

Training takes 8.38


In [11]:
# Make probability predictions
start = time.time()

train_probs = model.predict_proba(feature_train)[:, 1]
probs = model.predict_proba(feature_test)[:, 1]

train_predictions = model.predict(feature_train)
predictions = model.predict(feature_test)

print('Testing takes {:.2f}'.format(time.time()-start))

Testing takes 0.23


In [5]:
print(f'Train ROC AUC Score: {metrics.roc_auc_score(train_labels, train_probs)}')
print(f'Test ROC AUC  Score: {metrics.roc_auc_score(test_labels, probs)}')

Train ROC AUC Score: 0.9999999999999999
Test ROC AUC  Score: 0.8125594968710498


In [8]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
def clf_metrics(y_true, y_pred, y_score):
    accuracy = accuracy_score(y_true, y_pred)
    
    # reweight dataset in order to estimate the accuracy of "balanced" data set
    weight_data = np.zeros(len(y_true))
    for v in np.unique(y_true):
        weight_data[y_true==v] = 0.5*len(y_true)/np.sum(y_true==v)
    weighted_acc = np.sum(weight_data * (y_pred==y_true)/np.sum(weight_data))
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_score, average='weighted')
    
    df = pd.DataFrame({'accuracy':[accuracy],'weighted acc':[weighted_acc],
                       'precision': [precision], 'recall': [recall], 'auc':[auc]})
    print(df)

In [9]:
clf_metrics(test_labels, predictions, probs)

   accuracy  weighted acc  precision    recall       auc
0  0.801667       0.57948   0.884615  0.165468  0.812559


In [22]:
end_time = time.time()
print('Run time is {:.2f} s'.format(end_time - start_time))

Run time is 15.57 s


## Optimizing parameters

In [75]:
# tuning the parameters using the entire 2400 takes too long. So, picking randomly 100 points and sampling.
# issue - unequale data!

some_random_sample = np.random.randint(1,high = len(feature_train), size = 100, )

randomized_feature_train_sample = feature_train[some_random_sample]
randomized_train_labels_sample = np.array(train_labels.sort_values())[some_random_sample]

In [76]:
# Hyperparameter grid
param_grid = {
    'n_estimators': np.linspace(10, 200).astype(int),
    'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
    'max_features': ['auto', 'sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

# Estimator for use in random search
estimator = RandomForestClassifier(random_state = RSEED)

# Create the random search model
rs = RandomizedSearchCV(estimator, param_grid, n_jobs = -1, 
                        scoring = 'roc_auc', cv = 3, 
                        n_iter = 10, verbose = 1, random_state=RSEED)

# Fit 
rs.fit(randomized_feature_train_sample, randomized_train_labels_sample)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   51.3s finished


56.804641008377075


In [77]:
# inspecting best parameters:

rs.best_params_

{'n_estimators': 48,
 'min_samples_split': 10,
 'max_leaf_nodes': 36,
 'max_features': 0.6,
 'max_depth': 13,
 'bootstrap': False}

In [78]:
# retrainning and testing the model using the optimized paramaeters:

best_model = rs.best_estimator_

train_rf_predictions = best_model.predict(feature_train)
train_rf_probs = best_model.predict_proba(feature_train)[:, 1]

rf_predictions = best_model.predict(feature_test)
rf_probs = best_model.predict_proba(feature_test)[:, 1]

In [79]:
print(f'Optimized Train ROC AUC Score: {metrics.roc_auc_score(train_labels, train_rf_probs)}')
print(f'Optimized Test ROC AUC  Score: {metrics.roc_auc_score(test_labels, rf_probs)}')

Optimized Train ROC AUC Score: 0.5175246010018868
Optimized Test ROC AUC  Score: 0.5219338628879976


In [None]:
# lower AUC than before - likely because 100 isn't big enough sample to tune.