## Random Forest

In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.metrics
import sklearn.ensemble

Importing the 'Adult' dataset

In [2]:
adult = pd.read_csv('adult_cleaned.csv')
adult.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours/wk,salary,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,39,77516,13,2174,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Converting pandas series to numpy arrays
adult_Y = adult['salary'].to_numpy()

adult_X = adult.iloc[:, adult.columns != 'salary'].to_numpy()
#adult_X

In [4]:
# Scaling the data using a Standard scaler
std_scaler = sklearn.preprocessing.StandardScaler()

In [5]:
adult_X_std = std_scaler.fit_transform(adult_X)

Using the 'Gamma' dataset now

In [6]:
gamma = pd.read_csv('gamma_cleaned.csv')
gamma.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [7]:
# Converting pandas series to numpy arrays
gamma_Y = gamma['class'].to_numpy()

gamma_X = gamma.iloc[:, gamma.columns != 'class'].to_numpy()

In [8]:
# Scaling the gamma data using a Standard scaler
gamma_X_std = std_scaler.fit_transform(gamma_X)

Using the 'Eye' dataset now

In [9]:
eye_df = pd.read_csv('eye_cleaned.csv')
eye_df.head()

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,eyeDetection
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,-1
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,-1
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,-1
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,-1
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,-1


In [10]:
# Converting pandas series to numpy arrays
eye_Y = eye_df['eyeDetection'].to_numpy()
eye_X = eye_df.iloc[:, eye_df.columns != 'eyeDetection'].to_numpy()

In [11]:
# Scaling the eye data arrays using a Standard scaler
eye_X_std = std_scaler.fit_transform(eye_X)

Using the 'Occupancy' dataset now:

In [12]:
occupancy_df = pd.read_csv('occupancy_cleaned.csv')
occupancy_df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,23.18,27.272,426.0,721.25,0.004793,1
1,23.15,27.2675,429.5,714.0,0.004783,1
2,23.15,27.245,426.0,713.5,0.004779,1
3,23.15,27.2,426.0,708.25,0.004772,1
4,23.1,27.2,426.0,704.5,0.004757,1


In [13]:
# Converting pandas series to numpy arrays
occupancy_Y = occupancy_df['Occupancy'].to_numpy()
occupancy_X = occupancy_df.iloc[:, occupancy_df.columns != 'Occupancy'].to_numpy()

In [14]:
occupancy_X_std = std_scaler.fit_transform(occupancy_X)

Running 5 trials of Random Forest on each dataset:

In [15]:
# List of all hyper parameters
max_features_split = [1, 2, 4, 6, 8, 12, 16, 20]

In [16]:
# This function can be called on a given dataset and it will run
# Random Forest classification on it
def random_forest_solver(X_data, Y_data):

    # These lists will store the training and testing
    # performance metrics across all 5 trials
    trial_scores_test = []
    trial_scores_train = []

    for i in range(5):
        # Splitting data into training (5000 samples) and testing (remaining data points)
        # Note: we are shuffling the data before sampling for randomness

        X_train, data_X_test, Y_train, data_Y_test = sklearn.model_selection.train_test_split(
            X_data, Y_data, train_size = 5000, shuffle = True, stratify = Y_data)

        # Creating a Random Forest Classifier object with 1024 trees
        rf_classifier = sklearn.ensemble.RandomForestClassifier(n_estimators = 1024)

        # This list will contain the final max_features_split hyperparameters
        # based on the data inputted
        new_list_max_features = []
        max_vars_in_data = X_train.shape[1]

        # If the length of the data is smaller than a number in
        # max_features_split, it will append values smaller than it
        # into the list new_list_max_features
        for i in max_features_split:
            if i <= max_vars_in_data:
                new_list_max_features.append(i)

        # GridSearch object to cycle through hyperparameters along with cross validation
        data_grid_search = sklearn.model_selection.GridSearchCV(estimator = rf_classifier,
                                                           cv = 5,
                                                           param_grid = {'max_features': new_list_max_features},
                                                           scoring = ['accuracy', 'f1', 'roc_auc'],
                                                           refit = 'accuracy', 
                                                                n_jobs = -1)

        # Fitting the training data to perform a 5 fold cross validation
        data_grid_search.fit(X_train, Y_train)
        data_grid_results = pd.DataFrame(data_grid_search.cv_results_)
        
        # Obtaining the dataframe indicies corresponding to the best accuracy, f1, and roc_auc
        data_ind_best_accuracy = data_grid_results['mean_test_accuracy'].idxmax()
        data_ind_best_roc_auc = data_grid_results['mean_test_roc_auc'].idxmax()
        data_ind_best_f1 = data_grid_results['mean_test_f1'].idxmax()

        # Obtaining the corresponding hyperparameters from the index value above
        data_accuracy_best_param_features = data_grid_results['param_max_features'][data_ind_best_accuracy]
        data_f1_best_param_features = data_grid_results['param_max_features'][data_ind_best_f1]
        data_roc_auc_best_param_features = data_grid_results['param_max_features'][data_ind_best_roc_auc]

        # Creating a list of best hyperparams per metric 
        list_best_param_accuracy = [data_accuracy_best_param_features]
        list_best_param_f1 = [data_f1_best_param_features]
        list_best_param_roc_auc = [data_roc_auc_best_param_features]

        # This is used to create a dataframe to cycle through the parameters while re-training
        overall_best_params = [list_best_param_accuracy, list_best_param_f1, list_best_param_roc_auc]

        # The row indicies are ['accuracy', 'f1', 'roc_auc']
        best_hyperparam = pd.DataFrame(overall_best_params, columns = ['max_features'])
        
        # These training and testing lists store the performance of each
        # model (3 models as there are 3 metrics)
        model_scores_test = []
        model_scores_train = []

        for i in range(len(best_hyperparam)):

            # Stores the training and testing scores of a model in the loop
            test_scores = []
            train_scores = []

            # Creating model with the best hyperparameter
            rf_final = sklearn.ensemble.RandomForestClassifier(n_estimators = 1024,
                                                                        max_features = best_hyperparam['max_features'][i])
            
            # Fitting the corresponding model once more with the whole training data
            rf_final.fit(X_train, Y_train)

            # Test data performance
            data_Y_pred = rf_final.predict(data_X_test)

            test_scores.append(sklearn.metrics.accuracy_score(data_Y_test, data_Y_pred))
            test_scores.append(sklearn.metrics.f1_score(data_Y_test, data_Y_pred))
            test_scores.append(sklearn.metrics.roc_auc_score(data_Y_test, data_Y_pred))

            model_scores_test.append(test_scores)

            # Training data performance
            data_Y_pred_train = rf_final.predict(X_train)

            train_scores.append(sklearn.metrics.accuracy_score(Y_train, data_Y_pred_train))
            train_scores.append(sklearn.metrics.f1_score(Y_train, data_Y_pred_train))
            train_scores.append(sklearn.metrics.roc_auc_score(Y_train, data_Y_pred_train))

            model_scores_train.append(train_scores)

        # Storing the performance of each testing data model
        model_df_test = pd.DataFrame(model_scores_test, columns = ['accuracy', 'f1', 'roc_auc'])
        model_mean_test = model_df_test.mean().to_numpy()

        # Storing the performance of each training data model
        model_df_train = pd.DataFrame(model_scores_train, columns = ['accuracy', 'f1', 'roc_auc'])
        model_mean_train = model_df_train.mean().to_numpy()

        trial_scores_test.append(model_mean_test)
        trial_scores_train.append(model_mean_train)

    # These datasets contain all the 3 metrics performances of each trial in a dataframe format
    RF_data_trial_test = pd.DataFrame(trial_scores_test, columns = ['accuracy', 'f1', 'roc_auc'])
    RF_data_trial_train = pd.DataFrame(trial_scores_train, columns = ['accuracy', 'f1', 'roc_auc'])
    
    return RF_data_trial_test, RF_data_trial_train


Now we can call the above Random Forest classification function on each dataset to obtain its testing and training data performances.

In [17]:
raw_test_occupancy, raw_train_occupancy = random_forest_solver(occupancy_X_std, occupancy_Y)
raw_test_occupancy

Unnamed: 0,accuracy,f1,roc_auc
0,0.991003,0.980704,0.990517
1,0.991302,0.981333,0.99068
2,0.991624,0.981877,0.988294
3,0.99051,0.979659,0.990035
4,0.991517,0.981719,0.989554


In [18]:
raw_test_adult, raw_train_adult = random_forest_solver(adult_X_std, adult_Y)
raw_test_adult

Unnamed: 0,accuracy,f1,roc_auc
0,0.851094,0.903683,0.776796
1,0.84986,0.902442,0.780082
2,0.854505,0.905966,0.780569
3,0.852993,0.905655,0.77076
4,0.851844,0.90457,0.773209


In [19]:
raw_test_gamma, raw_train_gamma = random_forest_solver(gamma_X_std, gamma_Y)
raw_test_gamma

Unnamed: 0,accuracy,f1,roc_auc
0,0.869662,0.903544,0.839334
1,0.872658,0.9053,0.844754
2,0.868569,0.90246,0.839372
3,0.870827,0.904279,0.841191
4,0.873704,0.90541,0.848995


In [20]:
raw_test_eye, raw_train_eye = random_forest_solver(eye_X_std, eye_Y)
raw_test_eye

Unnamed: 0,accuracy,f1,roc_auc
0,0.894723,0.878918,0.890697
1,0.910922,0.898612,0.908011
2,0.895858,0.879761,0.891499
3,0.897929,0.881761,0.893294
4,0.901002,0.886896,0.897644


In [21]:
# Obtaining the mean across all 5 trials per dataset for the testing data

mean_adult_rf_score = raw_test_adult.mean().to_numpy()
mean_gamma_rf_score = raw_test_gamma.mean().to_numpy()
mean_eye_rf_score = raw_test_eye.mean().to_numpy()
mean_occupancy_rf_score = raw_test_occupancy.mean().to_numpy()

In [22]:
# Obtaining the mean across all 5 trials per dataset for the training data

mean_adult_rf_score_train = raw_train_adult.mean().to_numpy()
mean_gamma_rf_score_train = raw_train_gamma.mean().to_numpy()
mean_eye_rf_score_train = raw_train_eye.mean().to_numpy()
mean_occupancy_rf_score_train = raw_train_occupancy.mean().to_numpy()

In [23]:
# Saving the average metrics for both testing and training so we don't have to rerun the code

list_of_metrics_across_datasets = [mean_adult_rf_score, mean_gamma_rf_score,
                                   mean_eye_rf_score, mean_occupancy_rf_score]

list_of_metrics_across_datasets_train = [mean_adult_rf_score_train, mean_gamma_rf_score_train,
                                   mean_eye_rf_score_train, mean_occupancy_rf_score_train]

In [24]:
pd.DataFrame(list_of_metrics_across_datasets).to_csv('RF_metrics.csv', index = False)
pd.DataFrame(list_of_metrics_across_datasets_train).to_csv('RF_metrics_train.csv', index = False)

In [25]:
# Also saving the raw test scores for later use if needed

raw_test_occupancy.to_csv('RF_occupancy_test.csv', index = False)
raw_test_adult.to_csv('RF_adult_test.csv', index = False)
raw_test_gamma.to_csv('RF_gamma_test.csv', index = False)
raw_test_eye.to_csv('RF_eye_test.csv', index = False)

In [26]:
# saving the raw train scores for later use if needed

raw_train_occupancy.to_csv('RF_occupancy_train.csv', index = False)
raw_train_adult.to_csv('RF_adult_train.csv', index = False)
raw_train_gamma.to_csv('RF_gamma_train.csv', index = False)
raw_train_eye.to_csv('RF_eye_train.csv', index = False)