In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_selection import SelectFromModel

In [None]:
# Display labels for the three-class setup. To switch to two classes, drop
# "Undecided" from this list and map every -1 label to 0.
class_names = ['Not Staying','Undecided', "Staying"]

# Pre-split feature and target tables, one Excel workbook per split.
X_train, X_test, X_val = map(pd.read_excel, ('X_train.xlsx', 'X_test.xlsx', 'X_val.xlsx'))
y_train, y_test, y_val = map(pd.read_excel, ('y_train.xlsx', 'y_test.xlsx', 'y_val.xlsx'))

# Target naming convention:
#   suffix 0 -> post-ADSC retention         (column 'intention_beyond_commitment')
#   suffix 1 -> retention until retirement  (column 'intention_toward_retirement')
y_train0, y_train1 = y_train['intention_beyond_commitment'], y_train['intention_toward_retirement']
y_val0, y_val1 = y_val['intention_beyond_commitment'], y_val['intention_toward_retirement']
y_test0, y_test1 = y_test['intention_beyond_commitment'], y_test['intention_toward_retirement']


In [None]:
# Standardize every split using statistics learned from the training split only,
# so nothing about the validation/test distributions leaks into the scaling.
scaler = StandardScaler().fit(X_train)

# The tuple of original frames is built before any name is rebound, so each
# split is transformed exactly once; column labels are carried over unchanged.
X_train, X_val, X_test = (
    pd.DataFrame(scaler.transform(split), columns=split.columns)
    for split in (X_train, X_val, X_test)
)

In [None]:
# Grid search: fit one Random Forest per (n_estimators, max_depth) pair for each
# retention target, score it on the validation split, and write every result to
# an Excel sheet.
estimators = [1, 10, 100, 200, 500, 1000, 2000, 5000]
depths = [1,2,3,4,5,6,7,8,9,10, 15, 20, 25, 50, 75]

# Fixed seed so the accuracies in the saved sheet can be reproduced on re-run;
# without it every execution of this cell yields different numbers.
RF_RANDOM_STATE = 42


def _fit_and_score(n_estimators, max_depth, y_fit, y_true):
    """Fit a Random Forest on X_train and score it on X_val.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest.
    max_depth : int
        Maximum depth allowed for each tree.
    y_fit : array-like
        Training labels aligned with the rows of X_train.
    y_true : array-like
        Validation labels aligned with the rows of X_val.

    Returns
    -------
    tuple
        (fitted RandomForestClassifier, validation accuracy as a float).
    """
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=RF_RANDOM_STATE,
        n_jobs=-1,  # trees are independent; use every core for the large forests
    )
    clf.fit(X_train, y_fit)
    return clf, accuracy_score(y_true, clf.predict(X_val))


accs0 = []
accs1 = []
names = []
tree_counts = []
depth_limits = []
for n_estimators in estimators:
    for max_depth in depths:
        # Target 0: 'intention_beyond_commitment'; target 1: 'intention_toward_retirement'
        clf0, accuracy0 = _fit_and_score(n_estimators, max_depth, y_train0, y_val0)
        clf1, accuracy1 = _fit_and_score(n_estimators, max_depth, y_train1, y_val1)
        accs0.append(accuracy0)
        accs1.append(accuracy1)
        # Legacy label kept for compatibility. It has no separator, so it is hard
        # to read on its own (e.g. "1001"); the explicit columns below remove
        # the guesswork.
        names.append(str(n_estimators) + str(max_depth))
        tree_counts.append(n_estimators)
        depth_limits.append(max_depth)

# New columns are appended after the original three so existing positional or
# by-name access to Params / Accuracy0 / Accuracy1 keeps working.
res_df = pd.DataFrame({
    "Params": names,
    'Accuracy0': accs0,
    'Accuracy1': accs1,
    "n_estimators": tree_counts,
    "max_depth": depth_limits,
})
print(res_df)
res_df.to_excel("results_of_RF.xlsx")