In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# --- Two-class setup ("Not Staying" vs. "Staying") ---
# To move to the three-class case, add the "Undecided" class name below and
# remove the step that maps every -1 in the targets to 0.
class_names = ['Not Staying', "Staying"]

# Feature frames. The training file already holds only the feature subset
# found via BorutaSHAP; validation and test are read in full.
X_train = pd.read_excel('BS_subset0_train_no_und.xlsx')
X_val = pd.read_excel('X_val.xlsx')
X_test = pd.read_excel('X_test.xlsx')

# Target frames for the same three splits.
y_train = pd.read_excel('y_train.xlsx')
y_val = pd.read_excel('y_val.xlsx')
y_test = pd.read_excel('y_test.xlsx')

# Trim validation and test down to the columns they share with the training
# subset, so all three sets carry the same features.
shared_val_columns = X_train.columns.intersection(X_val.columns)
shared_test_columns = X_train.columns.intersection(X_test.columns)
X_val = X_val[shared_val_columns]
X_test = X_test[shared_test_columns]

# Two-class case: fold every -1 in the target frames into 0.
y_train = y_train.replace(-1, 0)
y_test = y_test.replace(-1, 0)
y_val = y_val.replace(-1, 0)

# Per-split target series:
#   names ending in 0 -> predicting post-ADSC retention
#   names ending in 1 -> predicting retention until retirement
y_train0, y_train1 = (
    y_train['intention_beyond_commitment'],
    y_train['intention_toward_retirement'],
)
y_val0, y_val1 = (
    y_val['intention_beyond_commitment'],
    y_val['intention_toward_retirement'],
)
y_test0, y_test1 = (
    y_test['intention_beyond_commitment'],
    y_test['intention_toward_retirement'],
)

# Standardise the features. The scaler is fitted on the training data only and
# then applied unchanged to validation and test.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
# Random forest for retention after ADSC (target: y_train0), trained on the
# feature subset found via BorutaSHAP.
# random_state is pinned because the forest's bootstrap sampling and feature
# selection at each split are random; without a seed the fitted model (and the
# metrics reported below) change on every Restart & Run All.
clf = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=42)
clf.fit(X_train, y_train0)

In [None]:
# Score the fitted forest on the test split. To score on validation instead,
# predict on X_val and compare against y_val0.
y_pred = clf.predict(X_test)

# Averaging scheme for precision / recall / F1: 'micro', 'macro', or 'weighted'.
average_type = 'macro'

accuracy = accuracy_score(y_true=y_test0, y_pred=y_pred)
precision_avg, recall_avg, f1_avg, _ = precision_recall_fscore_support(
    y_true=y_test0,
    y_pred=y_pred,
    average=average_type,
)

# Report the metrics.
print(f"\nOverall Accuracy ({average_type}): ", accuracy)
print(f"\nOverall Precision ({average_type}): ", precision_avg)
print(f"Overall Recall ({average_type}): ", recall_avg)
print(f"Overall F1 Score ({average_type}): ", f1_avg)

In [None]:
# Confusion matrix of the test labels against the predictions from the cell above.
cm = confusion_matrix(y_true=y_test0, y_pred=y_pred)

# Draw it as an annotated seaborn heatmap on an explicit figure/axes pair.
# NOTE(review): "Committment" in the title is misspelled; the string is left
# exactly as it was so the rendered figure does not change.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Intention Toward Staying After Committment')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')

# Render the figure.
plt.show()

In [None]:
# --- Three-class setup ("Not Staying", "Undecided", "Staying") ---
# To move to the two-class case, remove the "Undecided" class name below and
# map every -1 in the targets to 0.
class_names = ['Not Staying', 'Undecided', "Staying"]

# Feature frames. The training file already holds only the feature subset
# found via BorutaSHAP; validation and test are read in full.
X_train = pd.read_excel('BS_subset1_train_with_und.xlsx')
X_val = pd.read_excel('X_val.xlsx')
X_test = pd.read_excel('X_test.xlsx')

# Target frames for the same three splits (left unmapped in this case).
y_train = pd.read_excel('y_train.xlsx')
y_val = pd.read_excel('y_val.xlsx')
y_test = pd.read_excel('y_test.xlsx')

# Trim validation and test down to the columns they share with the training
# subset, so all three sets carry the same features.
shared_val_columns = X_train.columns.intersection(X_val.columns)
shared_test_columns = X_train.columns.intersection(X_test.columns)
X_val = X_val[shared_val_columns]
X_test = X_test[shared_test_columns]

# Per-split target series:
#   names ending in 0 -> 'intention_beyond_commitment'
#   names ending in 1 -> 'intention_toward_retirement'
y_train0, y_train1 = (
    y_train['intention_beyond_commitment'],
    y_train['intention_toward_retirement'],
)
y_val0, y_val1 = (
    y_val['intention_beyond_commitment'],
    y_val['intention_toward_retirement'],
)
y_test0, y_test1 = (
    y_test['intention_beyond_commitment'],
    y_test['intention_toward_retirement'],
)

# Standardise the features. The scaler is fitted on the training data only and
# then applied unchanged to validation and test.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
# Random forest for retention until retirement (target: y_train1), trained on
# the feature subset found via BorutaSHAP.
# random_state is pinned because the forest's bootstrap sampling and feature
# selection at each split are random; without a seed the fitted model (and the
# metrics reported below) change on every Restart & Run All.
clf = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42)
clf.fit(X_train, y_train1)

In [None]:
# Score the fitted forest on the test split. To score on validation instead,
# predict on X_val and compare against y_val1.
y_pred = clf.predict(X_test)

# Averaging scheme for precision / recall / F1: 'micro', 'macro', or 'weighted'.
# Defined in this cell so the retirement section runs on its own; previously it
# was only set in the post-ADSC metrics cell, and running this section by
# itself on a fresh kernel raised NameError.
average_type = 'macro'

accuracy = accuracy_score(y_test1, y_pred)
precision_avg, recall_avg, f1_avg, _ = precision_recall_fscore_support(
    y_test1, y_pred, average=average_type
)

# Report the metrics.
print(f"\nOverall Accuracy ({average_type}): ", accuracy)
print(f"\nOverall Precision ({average_type}): ", precision_avg)
print(f"Overall Recall ({average_type}): ", recall_avg)
print(f"Overall F1 Score ({average_type}): ", f1_avg)

In [None]:
# Confusion matrix for retention until retirement.
# y_pred was produced by clf.predict(X_test), so it must be compared against
# the test labels (y_test1). Comparing against y_val1 pairs predictions with
# labels from a different split.
# NOTE(review): confusion_matrix orders rows/columns by sorted label value;
# confirm that class_names lists the classes in that same order for the
# three-class encoding.
cm = confusion_matrix(y_test1, y_pred)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Greens",
    cbar=False,
    square=True,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Intention Toward Staying Until Retirement')

# Display the plot (a single call; the second plt.show() was redundant)
plt.show()