In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sns
from sklearn.metrics import mean_squared_error
from Method.GradientBoost import XGBoostRegressor as xgb
import pickle

In [8]:
train_data = pd.read_csv("Processed Data/train.csv")
validation_data = pd.read_csv("Processed Data/validation.csv")

In [9]:
X_train = train_data.drop(columns=['Attrition_rate']).values
y_train = train_data['Attrition_rate'].values

In [10]:
X_val = validation_data.drop(columns=['Attrition_rate']).values
y_val = validation_data['Attrition_rate'].values

In [11]:
best_params = {
    'subsample_cols': 0.706,
    'min_child_weight': 2,
    'depth': 4,
    'min_leaf': 7,
    'learning_rate': 0.056,
    'boosting_rounds': 15,
    'lambda_': 1.168,
    'gamma': 0.243,
    'eps': 0.1
}

In [12]:
final_model = xgb()
final_model.fit(X_train, y_train, **best_params)

KeyboardInterrupt: 

In [None]:
def save_model(model, filename):
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
save_model(final_model, 'final_model.pkl')

In [13]:
def load_model(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)
loaded_model = load_model('final_model.pkl')

ModuleNotFoundError: No module named 'GradientBoost'

In [None]:
y_pred_val = loaded_model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred_val, squared=False)
print(f"RMSE on validation set (loaded model): {rmse}")

Feature Importance

In [None]:
feature_names = list(train_data.columns[:-1])
feature_names

In [None]:
test_data = pd.read_csv("Processed Data/test.csv")
X_test = test_data.values
y_pred_test = loaded_model.predict(X_test)
test_data['Attrition_rate'] = y_pred_test
test_data.to_csv('test_predictions_final.csv', index=False)

In [None]:
feature_importances_weight = loaded_model.get_feature_importance(importance_type='weight')
feature_importances_gain = loaded_model.get_feature_importance(importance_type='gain')

feature_importances_weight_sorted = dict(sorted(feature_importances_weight.items(), key=lambda item: item[1], reverse=True))
feature_importances_gain_sorted = dict(sorted(feature_importances_gain.items(), key=lambda item: item[1], reverse=True))

Plotting Feature Importances (Weight)

In [None]:
features_weight = list(feature_importances_weight_sorted.keys())
importances_weight = list(feature_importances_weight_sorted.values())
plt.figure(figsize=(10, 6))
plt.barh(range(len(features_weight)), importances_weight, align='center')
plt.yticks(np.arange(len(features_weight)), [feature_names[i] for i in features_weight])
plt.xlabel('Feature Importance (Weight)')
plt.title('Feature Importance (Weight)')
plt.tight_layout()
plt.show()

Plotting Feature Importances (Gain)

In [None]:
features_gain = list(feature_importances_gain_sorted.keys())
importances_gain = list(feature_importances_gain_sorted.values())

plt.figure(figsize=(10, 6))
plt.barh(range(len(features_gain)), importances_gain, align='center')
plt.yticks(np.arange(len(features_gain)), [feature_names[i] for i in features_gain])
plt.xlabel('Feature Importance (Gain)')
plt.title('Feature Importance (Gain)')
plt.tight_layout()
plt.show()

Biểu đồ phân phối lỗi (Residual Distribution)

In [None]:
y_pred_val = loaded_model.predict(X_val)
residuals_val = y_val - y_pred_val

plt.figure(figsize=(8, 6))
count, bins, _ = plt.hist(residuals_val, bins=30, alpha=0.7, color='blue', density=True, label='Histogram')

from scipy.stats import gaussian_kde
kde = gaussian_kde(residuals_val)
x_vals = np.linspace(min(bins), max(bins), 1000)
plt.plot(x_vals, kde(x_vals), color='red', label='KDE')


plt.title('Residual Distribution (Validation Data)')
plt.xlabel('Residuals')
plt.ylabel('Density')
plt.legend()
plt.show()


Biểu đồ thực tế vs dự đoán

In [None]:
from sklearn.preprocessing import LabelEncoder

for col in train_data.select_dtypes(include=['object']).columns:
    train_data[col] = LabelEncoder().fit_transform(train_data[col])

corr_matrix = train_data.corr()

plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (with Encoded Features)')
plt.show()


Correlation Heatmap

In [None]:
corr_matrix = train_data.corr()
plt.figure(figsize=(10, 8))
seaborn.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

Biểu đồ xu hướng nghỉ việc (Đang lỗi)

In [None]:
test_predictions = pd.read_csv("test_predictions_final.csv")

In [None]:
time_variables = ['Time_of_service', 'Time_since_promotion', 'growth_rate']


for time_var in time_variables:
    plt.figure(figsize=(10, 6))


    plt.plot(test_predictions[time_var], test_predictions['Attrition_rate'], label='Test Data (Predicted)')

    plt.xlabel(time_var)
    plt.ylabel('Attrition Rate (Predicted)')
    plt.title(f'Attrition Rate Trend vs. {time_var} (Test Data)')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()