# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import shap
from BorutaShap import BorutaShap
import xgboost

# Loading/ Splitting Data

In [None]:
# Three-class setup is what runs by default. For the two-class variant, drop the
# "Undecided" entry from class_names and map every -1 label to 0.
class_names = ['Not Staying', "Undecided", "Staying"]

# Pre-split feature and label tables, one Excel workbook per split.
X_train, X_test, X_val = map(pd.read_excel, ('X_train.xlsx', 'X_test.xlsx', 'X_val.xlsx'))
y_train, y_test, y_val = map(pd.read_excel, ('y_train.xlsx', 'y_test.xlsx', 'y_val.xlsx'))

# Target naming convention:
#   trailing 0 -> post-ADSC retention ('intention_beyond_commitment')
#   trailing 1 -> retention until retirement ('intention_toward_retirement')
y_train0, y_train1 = y_train['intention_beyond_commitment'], y_train['intention_toward_retirement']
y_val0, y_val1 = y_val['intention_beyond_commitment'], y_val['intention_toward_retirement']
y_test0, y_test1 = y_test['intention_beyond_commitment'], y_test['intention_toward_retirement']


In [None]:
# Standardize every split with a scaler fitted on the training split only, so
# validation and test data are transformed using the training distribution.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    # Re-wrap the transformed array so the original column names are kept.
    pd.DataFrame(scaler.transform(split), columns=split.columns)
    for split in (X_train, X_val, X_test)
)

In [None]:
# One random forest per (scenario, target) pair. The 0/1 in the name selects the
# retention target; w_u / n_u presumably mean with / without the "Undecided"
# class (matching the *_with_und / *_no_und output files) -- TODO confirm.
# The max_depth values came from a hyperparameter search on baseline RFs.
clf0_w_u, clf1_w_u, clf0_n_u, clf1_n_u = (
    RandomForestClassifier(n_estimators=100, max_depth=depth)
    for depth in (50, 15, 75, 50)
)

In [None]:
# Create the Boruta-SHAP feature selector for the post-ADSC target, three-class
# case (fitting happens in the next cell).
# `BorutaShap` is imported as the class itself (`from BorutaShap import BorutaShap`),
# so it is called directly; `BorutaShap.BorutaShap(...)` would raise AttributeError.
feature_selector0_w_u = BorutaShap(model=clf0_w_u, importance_measure='shap', classification=True)


In [None]:
# Run the selection for the post-ADSC target on the scaled training features.
feature_selector0_w_u.fit(X=X_train, y=y_train0, n_trials=100)

# Plot the importance of every feature on a log-scaled y axis.
feature_selector0_w_u.plot(
    which_features='all',
    y_scale='log',
    figsize=(12, 8),
    X_size=12,
)

In [None]:
# Force a decision on features the fit left undecided. Per the BorutaShap
# documentation this compares each tentative feature's median importance with
# the median of the max shadow importance -- confirm against the installed version.
feature_selector0_w_u.TentativeRoughFix()

In [None]:
# Subset() returns the data passed to fit() restricted to the selected features.
subset = feature_selector0_w_u.Subset()
# A bare `subset.head()` in the middle of a cell is evaluated and discarded,
# so print the preview explicitly to make it visible.
print(subset.head())
# Persist the selected training columns for the three-class, post-ADSC case.
subset.to_excel('BS_subset0_train_with_und.xlsx')

In [None]:
# Write this selector's feature-importance results to CSV under the given base
# filename (BorutaShap is expected to append the extension -- TODO confirm).
feature_selector0_w_u.results_to_csv(filename='feature_importance0_with_und')


In [None]:
# Create the Boruta-SHAP feature selector for the retention-until-retirement
# target, three-class case (fitting happens in the next cell).
# `BorutaShap` is imported as the class itself (`from BorutaShap import BorutaShap`),
# so it is called directly; `BorutaShap.BorutaShap(...)` would raise AttributeError.
feature_selector1_w_u = BorutaShap(model=clf1_w_u, importance_measure='shap', classification=True)

In [None]:
# Run the selection for the retention-until-retirement target on the scaled
# training features.
feature_selector1_w_u.fit(X=X_train, y=y_train1, n_trials=100)

# Plot the importance of every feature on a log-scaled y axis.
feature_selector1_w_u.plot(
    which_features='all',
    y_scale='log',
    figsize=(12, 8),
    X_size=12,
)

In [None]:
# Force a decision on features the fit left undecided. Per the BorutaShap
# documentation this compares each tentative feature's median importance with
# the median of the max shadow importance -- confirm against the installed version.
feature_selector1_w_u.TentativeRoughFix()

In [None]:
# Subset() returns the data passed to fit() restricted to the selected features
# (here: the three-class, retention-until-retirement selector).
subset = feature_selector1_w_u.Subset()
# Show the selected-feature frame in the cell output.
print(subset)

In [None]:
# Persist the selected training columns for the three-class,
# retention-until-retirement case. `subset` is whatever the previous cell
# assigned, so run that cell first.
subset.to_excel('BS_subset1_train_with_und.xlsx')

In [None]:
# Write this selector's feature-importance results to CSV under the given base
# filename (BorutaShap is expected to append the extension -- TODO confirm).
feature_selector1_w_u.results_to_csv(filename='feature_importance1_with_und')


In [None]:
# Transition to the two-class case: collapse every -1 label into 0 in all
# label tables, leaving only the "Not Staying" / "Staying" classes.
y_train.replace(-1, 0, inplace=True)
y_test.replace(-1, 0, inplace=True)
y_val.replace(-1, 0, inplace=True)

# Re-derive the per-target Series from the remapped tables. The Series extracted
# earlier are not guaranteed to see an in-place replace on the parent DataFrame
# (it depends on the pandas version / Copy-on-Write), and stale Series would make
# the "two-class" fits silently train on three labels.
y_train0 = y_train['intention_beyond_commitment']
y_train1 = y_train['intention_toward_retirement']
y_val0 = y_val['intention_beyond_commitment']
y_val1 = y_val['intention_toward_retirement']
y_test0 = y_test['intention_beyond_commitment']
y_test1 = y_test['intention_toward_retirement']

class_names = ['Not Staying', "Staying"]

In [None]:
# Two-class run for the post-ADSC target, condensed into one cell: build the
# selector, fit it, resolve tentative features, then export the selected
# training columns and the importance results.
# NOTE(review): y_train0 must already hold the remapped (-1 -> 0) labels here;
# confirm it was re-derived after the replace in the previous cell.
# `BorutaShap` is the imported class, so it is called directly
# (`BorutaShap.BorutaShap(...)` would raise AttributeError).
feature_selector0_n_u = BorutaShap(model=clf0_n_u, importance_measure='shap', classification=True)
feature_selector0_n_u.fit(X=X_train, y=y_train0, n_trials=100)
feature_selector0_n_u.TentativeRoughFix()
subset = feature_selector0_n_u.Subset()
subset.to_excel('BS_subset0_train_no_und.xlsx')
# Export THIS selector's results; the previous code wrote the three-class
# selector's (feature_selector0_w_u) results under the no_und filename.
feature_selector0_n_u.results_to_csv(filename='feature_importance0_no_und')

In [None]:
# Two-class run for the retention-until-retirement target, condensed into one
# cell: build the selector, fit it, resolve tentative features, then export the
# selected training columns and the importance results.
# NOTE(review): y_train1 must already hold the remapped (-1 -> 0) labels here;
# confirm it was re-derived after the replace in the transition cell.
# `BorutaShap` is the imported class, so it is called directly
# (`BorutaShap.BorutaShap(...)` would raise AttributeError).
feature_selector1_n_u = BorutaShap(model=clf1_n_u, importance_measure='shap', classification=True)
feature_selector1_n_u.fit(X=X_train, y=y_train1, n_trials=100)
feature_selector1_n_u.TentativeRoughFix()
subset = feature_selector1_n_u.Subset()
subset.to_excel('BS_subset1_train_no_und.xlsx')
# Export THIS selector's results; the previous code wrote the three-class
# selector's (feature_selector1_w_u) results under the no_und filename.
feature_selector1_n_u.results_to_csv(filename='feature_importance1_no_und')