In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.metrics import precision_recall_fscore_support
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Configured for the three-class problem. For the two-class variant, drop "Undecided"
# from class_names and map every -1 label to 0 instead.
# This notebook handles labels slightly differently from the others because of how XGBoost runs.
class_names = ["Undecided", "Staying", "Not Staying"]

# Feature matrices for the train / test / validation splits.
X_train, X_test, X_val = (
    pd.read_excel(path) for path in ('X_train.xlsx', 'X_test.xlsx', 'X_val.xlsx')
)

# Target frames. The -1 label is recoded to 2 because XGBoost doesn't like the negative
# class label; the recode is applied to every column of each target frame.
y_train, y_test, y_val = (
    pd.read_excel(path).replace(-1, 2)
    for path in ('y_train.xlsx', 'y_test.xlsx', 'y_val.xlsx')
)

# Series ending in 0 are the targets for predicting post-ADSC retention;
# series ending in 1 are the targets for predicting retention until retirement.
y_train0, y_train1 = (
    y_train[column] for column in ('intention_beyond_commitment', 'intention_toward_retirement')
)
y_val0, y_val1 = (
    y_val[column] for column in ('intention_beyond_commitment', 'intention_toward_retirement')
)
y_test0, y_test1 = (
    y_test[column] for column in ('intention_beyond_commitment', 'intention_toward_retirement')
)

# Number of features shown in the feature-importance bar charts.
top_n = 20

In [None]:
# Standardise every split using statistics learned from the training split only,
# then wrap the resulting arrays back into DataFrames so the column names survive.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    pd.DataFrame(scaler.transform(split), columns=split.columns)
    for split in (X_train, X_val, X_test)
)

In [None]:
#Training an XGBoost model for post-ADSC Retention
average_type = 'macro'  # Can be 'micro', 'macro', or 'weighted'; the retirement cell reads it too
ADSC_N_SELECTED = 100  # Number of top features kept; chosen by eyeing the cumulative-importance chart


def _evaluate_adsc(model, features, labels, split_name):
    """Score a fitted post-ADSC model on one split and plot its confusion matrix.

    Prints the accuracy, the ``average_type``-averaged precision / recall / F1 and
    the per-class classification report, then shows the confusion matrix as a heatmap.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    features : feature frame passed to ``model.predict``.
    labels : true labels for the same rows.
    split_name : short description of the split, used in the printout and plot title.

    Returns
    -------
    tuple
        ``(predictions, accuracy, confusion_matrix)``.
    """
    # Pin the matrix and report to one entry per class name so the tick labels stay
    # aligned even when a class is missing from this split.
    # Assumes labels are encoded 0..len(class_names)-1 (the -1 -> 2 recode in the
    # loading cell is meant to guarantee this) -- TODO confirm for the two-class setup.
    label_ids = list(range(len(class_names)))

    predictions = model.predict(features)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=average_type)

    # Accuracy is not an averaged metric, so it is reported without the averaging mode.
    print(f"\n[{split_name}] Accuracy: {acc * 100:.2f}%")
    print(f"Overall Precision ({average_type}): ", precision)
    print(f"Overall Recall ({average_type}): ", recall)
    print(f"Overall F1 Score ({average_type}): ", f1)
    print(classification_report(labels, predictions, labels=label_ids, target_names=class_names))

    matrix = confusion_matrix(labels, predictions, labels=label_ids)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Greens",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'Intention Toward Staying After Commitment ({split_name})')
    ax.set_ylabel('Actual Label')
    ax.set_xlabel('Predicted Label')
    plt.show()
    return predictions, acc, matrix


# Baseline model trained on every feature, scored on the validation split
mod0 = xgb.XGBClassifier(n_estimators = 20, max_depth = 15)
mod0.fit(X_train, y_train0)
y_pred, accuracy, cm = _evaluate_adsc(mod0, X_val, y_val0, 'validation, all features')

# Rank the features of the baseline model by importance, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': mod0.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

# Cumulative importance over the ranked features; used to pick how many features to keep
cumulative_importance = np.cumsum(feature_importance_df['Importance'].to_numpy())
fig, ax = plt.subplots()
ax.plot(range(1, len(cumulative_importance) + 1), cumulative_importance)
ax.set_xlabel('Number of Features')
ax.set_ylabel('Cumulative Importance')
ax.set_title('Cumulative Feature Importance')
ax.grid(True)
plt.show()

# Keep only the top-ranked features in every split
selected_features = feature_importance_df['Feature'].head(ADSC_N_SELECTED).tolist()
X_train0 = X_train[selected_features]
X_val0 = X_val[selected_features]
X_test0 = X_test[selected_features]

#New XGBoost model using only the features selected in the first model on validation data
mod0fs = xgb.XGBClassifier(n_estimators = 20, max_depth = 15)
mod0fs.fit(X_train0, y_train0)
y_pred, accuracy, cm = _evaluate_adsc(mod0fs, X_val0, y_val0, 'validation, selected features')

#Using our model on the test set as well
y_pred, accuracy, cm = _evaluate_adsc(mod0fs, X_test0, y_test0, 'test, selected features')

# Most important features of the reduced model (at most top_n of them)
feature_importances = mod0fs.feature_importances_
feature_names = X_train0.columns.tolist()
sorted_idx = feature_importances.argsort()[::-1]
top_features = [feature_names[i] for i in sorted_idx[:top_n]]
top_importances = [feature_importances[i] for i in sorted_idx[:top_n]]

# Create the bar plot; bar count and title follow the number of features actually
# available, so changing top_n (or having fewer features) cannot break the chart.
bar_positions = list(range(len(top_features)))
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, top_importances, align="center")
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title(f'Top {len(top_features)} Feature Importances in Predicting Post-ADSC \nRetention Utilising XGBoost')
fig.tight_layout()
fig.savefig('ADSC_XGB_with_und.png')

In [None]:
#Training an XGBoost model for retention until retirement
RETIREMENT_N_SELECTED = 110  # Number of top features kept; chosen by eyeing the cumulative-importance chart


def _evaluate_retirement(model, features, labels, split_name):
    """Score a fitted retirement-retention model on one split and plot its confusion matrix.

    Prints the accuracy, the ``average_type``-averaged precision / recall / F1 and
    the per-class classification report, then shows the confusion matrix as a heatmap.
    ``average_type`` and ``class_names`` are read from the notebook's earlier cells.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    features : feature frame passed to ``model.predict``.
    labels : true labels for the same rows.
    split_name : short description of the split, used in the printout and plot title.

    Returns
    -------
    tuple
        ``(predictions, accuracy, confusion_matrix)``.
    """
    # Pin the matrix and report to one entry per class name so the tick labels stay
    # aligned even when a class is missing from this split.
    # Assumes labels are encoded 0..len(class_names)-1 (the -1 -> 2 recode in the
    # loading cell is meant to guarantee this) -- TODO confirm for the two-class setup.
    label_ids = list(range(len(class_names)))

    predictions = model.predict(features)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=average_type)

    print(f"\n[{split_name}] Accuracy: {acc * 100:.2f}%")
    print(f"Overall Precision ({average_type}): ", precision)
    print(f"Overall Recall ({average_type}): ", recall)
    print(f"Overall F1 Score ({average_type}): ", f1)
    print(classification_report(labels, predictions, labels=label_ids, target_names=class_names))

    matrix = confusion_matrix(labels, predictions, labels=label_ids)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Greens",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    # This cell models the retirement target, so the title names that target rather
    # than the post-commitment one.
    ax.set_title(f'Intention Toward Staying Until Retirement ({split_name})')
    ax.set_ylabel('Actual Label')
    ax.set_xlabel('Predicted Label')
    plt.show()
    return predictions, acc, matrix


# Baseline model trained on every feature, scored on the validation split
mod1 = xgb.XGBClassifier(n_estimators = 20, max_depth = 20)
mod1.fit(X_train, y_train1)
y_pred, accuracy, cm = _evaluate_retirement(mod1, X_val, y_val1, 'validation, all features')

# Rank the features of the baseline model by importance, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': mod1.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

# Cumulative importance over the ranked features; used to pick how many features to keep
cumulative_importance = np.cumsum(feature_importance_df['Importance'].to_numpy())
fig, ax = plt.subplots()
ax.plot(range(1, len(cumulative_importance) + 1), cumulative_importance)
ax.set_xlabel('Number of Features')
ax.set_ylabel('Cumulative Importance')
ax.set_title('Cumulative Feature Importance')
ax.grid(True)
plt.show()

# Keep only the top-ranked features in every split
selected_features = feature_importance_df['Feature'].head(RETIREMENT_N_SELECTED).tolist()
X_train1 = X_train[selected_features]
X_val1 = X_val[selected_features]
X_test1 = X_test[selected_features]

#New XGBoost model using only the features selected in the first model on validation data
mod1fs = xgb.XGBClassifier(n_estimators = 20, max_depth = 20)
mod1fs.fit(X_train1, y_train1)
y_pred, accuracy, cm = _evaluate_retirement(mod1fs, X_val1, y_val1, 'validation, selected features')

#Using our model on the test set as well
y_pred, accuracy, cm = _evaluate_retirement(mod1fs, X_test1, y_test1, 'test, selected features')

# Most important features of the reduced model (at most top_n of them)
feature_importances = mod1fs.feature_importances_
feature_names = X_train1.columns.tolist()
sorted_idx = feature_importances.argsort()[::-1]
top_features = [feature_names[i] for i in sorted_idx[:top_n]]
top_importances = [feature_importances[i] for i in sorted_idx[:top_n]]

# Create the bar plot; bar count and title follow the number of features actually
# available, so changing top_n (or having fewer features) cannot break the chart.
bar_positions = list(range(len(top_features)))
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, top_importances, align="center")
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title(f'Top {len(top_features)} Feature Importances in Predicting Retention \nUntil Retirement Utilising XGBoost')
fig.tight_layout()
fig.savefig('retirement_XGB_with_und.png')