In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.feature_selection import SelectFromModel
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Two-class configuration: every -1 label is recoded to 0 as the label files are
# loaded. To switch to the three-class variant, add the "Undecided" name to
# class_names and remove the .replace(-1, 0) calls below.
class_names = ['Not Staying', "Staying"]

# Feature matrices for the train / validation / test splits
X_train = pd.read_excel('X_train.xlsx')
X_val = pd.read_excel('X_val.xlsx')
X_test = pd.read_excel('X_test.xlsx')

# Label frames for the same splits, with -1 mapped to 0
y_train = pd.read_excel('y_train.xlsx').replace(-1, 0)
y_val = pd.read_excel('y_val.xlsx').replace(-1, 0)
y_test = pd.read_excel('y_test.xlsx').replace(-1, 0)

# Targets with suffix 0: post-ADSC retention
y_train0, y_val0, y_test0 = (
    labels['intention_beyond_commitment'] for labels in (y_train, y_val, y_test)
)

# Targets with suffix 1: retention until retirement
y_train1, y_val1, y_test1 = (
    labels['intention_toward_retirement'] for labels in (y_train, y_val, y_test)
)


In [None]:
# Standardise all three splits using statistics learned from the training split
# only; each split is rebuilt as a DataFrame carrying its original column names.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    pd.DataFrame(scaler.transform(split), columns=split.columns)
    for split in (X_train, X_val, X_test)
)

In [None]:
# Baseline random forest for post-ADSC retention, trained on every feature.
# random_state is pinned so that the fitted forest -- and the feature-importance
# ranking later cells read from it to select features -- is identical on every
# Restart & Run All instead of changing from run to run.
clf0 = RandomForestClassifier(n_estimators=100, max_depth=75, random_state=42)
clf0.fit(X_train, y_train0)

# Score the model on the validation split
y_pred = clf0.predict(X_val)
accuracy = accuracy_score(y_val0, y_pred)
precision = precision_score(y_val0, y_pred)
recall = recall_score(y_val0, y_pred)

# Display the results
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")

# Rows of the matrix are actual labels, columns are predicted labels
cm = confusion_matrix(y_val0, y_pred)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", xticklabels=class_names, yticklabels=class_names)
plt.title('Intention Toward Staying After Committment')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')

# Display the plot
plt.show()


In [None]:
# Cumulative-importance curve: used to judge how many features are worth keeping
feature_importances = clf0.feature_importances_

# Importances from largest to smallest, followed by their running total
sorted_importances = np.flip(np.sort(feature_importances))
cumulative_importance = sorted_importances.cumsum()

# x-axis is the number of top-ranked features included (1-based)
feature_counts = np.arange(1, cumulative_importance.size + 1)
plt.plot(feature_counts, cumulative_importance)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

In [None]:
# Column positions of the 40 highest-importance features, most important first
top_40_indices = np.flip(np.argsort(feature_importances))[:40]


In [None]:
# Reduce every split to the selected columns (kept in importance order)
X_train0, X_val0, X_test0 = (
    split.iloc[:, top_40_indices] for split in (X_train, X_val, X_test)
)

In [None]:
# Random forest for post-ADSC retention refit on the selected features only.
# random_state is pinned so the validation numbers below are repeatable and a
# refit with the same data and settings yields the same forest.
clf0 = RandomForestClassifier(n_estimators=100, max_depth=75, random_state=42)
clf0.fit(X_train0, y_train0)

# Score the reduced-feature model on the validation split
y_pred = clf0.predict(X_val0)
accuracy = accuracy_score(y_val0, y_pred)
precision = precision_score(y_val0, y_pred)
recall = recall_score(y_val0, y_pred)

# Display the results
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")

# Rows of the matrix are actual labels, columns are predicted labels
cm = confusion_matrix(y_val0, y_pred)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", xticklabels=class_names, yticklabels=class_names)
plt.title('Intention Toward Staying After Committment')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')

# Display the plot
plt.show()




# TEST

In [None]:
# Final evaluation of the post-ADSC model on the held-out test split.
# A forest with the same hyper-parameters as the validation run is refit on the
# selected-feature training data; random_state is pinned so the reported test
# numbers and importance plot are repeatable.
clf0 = RandomForestClassifier(n_estimators=100, max_depth=75, random_state=42)
clf0.fit(X_train0, y_train0)
y_pred = clf0.predict(X_test0)
accuracy = accuracy_score(y_test0, y_pred)
precision = precision_score(y_test0, y_pred)
recall = recall_score(y_test0, y_pred)

# Build the report from THIS cell's test predictions. Previously `report` was
# printed without being defined here: a NameError on a fresh kernel, or a stale
# report from a different model/split when cells had been run out of order.
report = classification_report(y_test0, y_pred)

# Display the results
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(report)

# Number of features shown in the importance bar chart (also read by the
# retirement test cell further down)
top_n = 20

# Rows of the matrix are actual labels, columns are predicted labels
cm = confusion_matrix(y_test0, y_pred)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.title('Intention Toward Staying After Committment')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('ADSC_RF_no_und_CF.png')

# Importances of the test-time forest, ranked from most to least important
feature_importances = clf0.feature_importances_
feature_names = X_train0.columns.tolist()
sorted_idx = feature_importances.argsort()[::-1]

# NOTE(review): this removes the feature at column position 7 of X_train0 from
# the chart. X_train0's column order is the importance ranking produced by the
# first forest (top_40_indices), so position 7 only refers to a fixed feature
# when that ranking is reproducible -- confirm which feature is meant and
# consider excluding it by name instead.
sorted_idx = [x for x in sorted_idx if x != 7]

top_features = [feature_names[i] for i in sorted_idx[:top_n]]
top_importances = [feature_importances[i] for i in sorted_idx[:top_n]]

# Size the axis from the bars actually available, so the plot still works if
# fewer than top_n features remain after the exclusion above
bar_positions = range(len(top_features))

# Create the bar plot
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_importances, align="center")
plt.yticks(bar_positions, top_features)
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title('Top 20 Feature Importances in Predicting Post-ADSC \nRetention Utilising Random Forest')
plt.tight_layout()
plt.savefig('ADSC_RF_no_und.png')

In [None]:
# Baseline random forest for retention until retirement, trained on every
# feature. random_state is pinned so that the fitted forest -- and the
# feature-importance ranking later cells read from it to select features -- is
# identical on every Restart & Run All.
clf1 = RandomForestClassifier(n_estimators=500, max_depth=50, random_state=42)
clf1.fit(X_train, y_train1)

# Score the model on the validation split
y_pred = clf1.predict(X_val)
accuracy = accuracy_score(y_val1, y_pred)
precision = precision_score(y_val1, y_pred)
recall = recall_score(y_val1, y_pred)

# Display the results
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")

# Per-class precision / recall / F1 summary for the validation predictions
report = classification_report(y_val1, y_pred)
print("Classification Report:")
print(report)

# Rows of the matrix are actual labels, columns are predicted labels
cm = confusion_matrix(y_val1, y_pred)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", xticklabels=class_names, yticklabels=class_names)
plt.title('Intention Toward Staying Until Retirement')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')

# Display the plot
plt.show()



In [None]:
# Cumulative-importance curve for the retirement model: used to judge how many
# features are worth keeping
feature_importances = clf1.feature_importances_

# Importances from largest to smallest, followed by their running total
sorted_importances = np.flip(np.sort(feature_importances))
cumulative_importance = sorted_importances.cumsum()

# x-axis is the number of top-ranked features included (1-based)
feature_counts = np.arange(1, cumulative_importance.size + 1)
plt.plot(feature_counts, cumulative_importance)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

In [None]:
# Column positions of the 80 highest-importance features, most important first
top_80_indices = np.argsort(feature_importances)[::-1][:80]

# Restrict every split to the selected columns
X_train1 = X_train.iloc[:, top_80_indices]
X_val1 = X_val.iloc[:, top_80_indices]
X_test1 = X_test.iloc[:, top_80_indices]

# Refit the retirement model on the reduced feature set. random_state is pinned
# so the validation numbers below are repeatable and a refit with the same data
# and settings yields the same forest.
clf1 = RandomForestClassifier(n_estimators=500, max_depth=50, random_state=42)
clf1.fit(X_train1, y_train1)

# Score the reduced-feature model on the validation split
y_pred = clf1.predict(X_val1)
accuracy = accuracy_score(y_val1, y_pred)
precision = precision_score(y_val1, y_pred)
recall = recall_score(y_val1, y_pred)

# Display the results
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")

# Rows of the matrix are actual labels, columns are predicted labels
cm = confusion_matrix(y_val1, y_pred)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", xticklabels=class_names, yticklabels=class_names)
plt.title('Intention Toward Staying Until Retirement')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')

# Display the plot
plt.show()

# TEST

In [None]:
# Final evaluation of the retirement model on the held-out test split.
# A forest with the same hyper-parameters as the validation run is refit on the
# selected-feature training data; random_state is pinned so the reported test
# numbers and importance plot are repeatable.
clf1 = RandomForestClassifier(n_estimators=500, max_depth=50, random_state=42)
clf1.fit(X_train1, y_train1)
y_pred = clf1.predict(X_test1)
accuracy = accuracy_score(y_test1, y_pred)
precision = precision_score(y_test1, y_pred)
recall = recall_score(y_test1, y_pred)

# Display the results
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")

# Rows of the matrix are actual labels, columns are predicted labels
cm = confusion_matrix(y_test1, y_pred)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.title('Intention Toward Staying Until Retirement')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')

plt.tight_layout()
plt.savefig('retirement_RF_no_und_CF.png')

# Importances of the test-time forest, ranked from most to least important
feature_importances = clf1.feature_importances_
feature_names = X_train1.columns.tolist()
sorted_idx = feature_importances.argsort()[::-1]

# NOTE(review): this removes the feature at column position 4 of X_train1 from
# the chart. X_train1's column order is the importance ranking produced by the
# first retirement forest (top_80_indices), so position 4 only refers to a
# fixed feature when that ranking is reproducible -- confirm which feature is
# meant and consider excluding it by name instead.
sorted_idx = [x for x in sorted_idx if x != 4]

# Get the most important features. top_n is not defined in this cell: it is the
# value (20) assigned in the post-ADSC test cell, which must have run first.
top_features = [feature_names[i] for i in sorted_idx[:top_n]]
top_importances = [feature_importances[i] for i in sorted_idx[:top_n]]

# Size the axis from the bars actually available, so the plot still works if
# fewer than top_n features remain after the exclusion above
bar_positions = range(len(top_features))

# Create the bar plot
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_importances, align="center")
plt.yticks(bar_positions, top_features)
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title('Top 20 Feature Importances in Predicting Retention \nUntil Retirement Utilising Random Forest')
plt.tight_layout()
plt.savefig('retirement_RF_no_und.png')