In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
import pickle
from math import e
from collections import Counter
from Method.GradientBoost import XGBoostRegressor as xgb
from Method.DecisionTree import DecisionTree

In [2]:
train_data = pd.read_csv("Processed Data/train.csv")
validation_data = pd.read_csv("Processed Data/validation.csv")

In [3]:
X_train = train_data.drop(columns=['Attrition_rate']).values
y_train = train_data['Attrition_rate'].values

In [None]:
model = xgb()
model.fit(X_train, y_train)

In [None]:
def save_model(model, filename):
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
save_model(model, 'trained_xgb_model.pkl')

In [5]:
def load_model(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)
loaded_model = load_model('final_model.pkl')

ModuleNotFoundError: No module named 'GradientBoost'

In [4]:
X_val = validation_data.drop(columns=['Attrition_rate']).values
y_val = validation_data['Attrition_rate'].values

In [None]:
y_pred_val = loaded_model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred_val, squared=False)
print(f"RMSE on validation set (loaded model): {rmse}")

Cross Validation

In [None]:
best_params = {
    'subsample_cols': 0.706,
    'min_child_weight': 2,
    'depth': 4,
    'min_leaf': 7,
    'learning_rate': 0.056,
    'boosting_rounds': 15,
    'lambda_': 1.168,
    'gamma': 0.243,
    'eps': 0.1
}

In [None]:
def cross_validation(X_train, y_train, X_val, y_val, n_splits=5, **kwargs):

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_rmse_scores = []
    val_rmse_scores = []

    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        print(f"Fold {i+1}:")
        X_fold_train, X_fold_test = X_train[train_index], X_train[test_index]
        y_fold_train, y_fold_test = y_train[train_index], y_train[test_index]

        model = xgb()
        model.fit(X_fold_train, y_fold_train, **kwargs, early_stopping_rounds=5, X_fold_test, y_fold_test)


        y_pred_fold_test = model.predict(X_fold_test)
        cv_rmse = mean_squared_error(y_fold_test, y_pred_fold_test, squared=False)
        cv_rmse_scores.append(cv_rmse)

        y_pred_val = model.predict(X_val)
        val_rmse = mean_squared_error(y_val, y_pred_val, squared=False)
        val_rmse_scores.append(val_rmse)



        print(f"  Training RMSE: {mean_squared_error(y_fold_train, model.predict(X_fold_train), squared=False):.3f}")
        print(f"  Cross-Validation RMSE: {cv_rmse:.3f}")
        print(f"  Validation RMSE: {val_rmse:.3f}\n")
    avg_cv_rmse = np.mean(cv_rmse_scores)
    avg_val_rmse = np.mean(val_rmse_scores)

    print(f"Average Cross-Validation RMSE: {avg_cv_rmse:.3f}")
    print(f"Average Validation RMSE: {avg_val_rmse:.3f}")


    return avg_val_rmse, avg_cv_rmse

In [None]:
avg_val_rmse, avg_cv_rmse = cross_validation(X_train, y_train, X_val, y_val, n_splits=5, **best_params)

In [None]:
test_data = pd.read_csv("Processed Data/test.csv")
X_test = test_data.values
y_pred_test = model.predict(X_test)
test_data['Attrition_rate'] = y_pred_test
test_data.to_csv('test_predictions_final.csv', index=False)