In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
import pickle
from math import e
from collections import Counter
from Method.GradientBoost import XGBoostRegressor as xgb
from Method.DecisionTree import DecisionTree

In [2]:
# Read the pre-processed training and validation splits from disk.
train_csv_path = "Processed Data/train.csv"
validation_csv_path = "Processed Data/validation.csv"

train_data = pd.read_csv(train_csv_path)
validation_data = pd.read_csv(validation_csv_path)

In [3]:
# Separate the training frame into a feature matrix and the target vector.
train_target_column = 'Attrition_rate'
y_train = train_data[train_target_column].values
X_train = train_data.drop(columns=[train_target_column]).values

In [6]:
# Baseline model: the project's XGBoostRegressor (imported as `xgb`) built with
# no constructor arguments and fitted on the full training split without any
# extra fit keyword arguments.
model = xgb()
model.fit(X_train, y_train)

In [8]:
def save_model(model, filename):
    """Pickle ``model`` and write it to ``filename``.

    Args:
        model: Any picklable object (here, a fitted regressor).
        filename: Destination path; an existing file is overwritten.

    Raises:
        pickle.PicklingError (or another exception raised by pickle): if
            ``model`` cannot be serialised. In that case the destination file
            is left untouched.
        OSError: if the destination cannot be opened for writing.
    """
    # Serialise *before* opening the file: open(..., 'wb') truncates the
    # target immediately, so pickling straight into the handle would destroy
    # a previously saved model whenever serialisation fails part-way.
    payload = pickle.dumps(model)
    with open(filename, 'wb') as f:
        f.write(payload)
# Persist the fitted baseline model; the relative path resolves against the
# current working directory.
save_model(model, 'trained_xgb_model.pkl')

In [11]:
def load_model(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only use this on files that were
    produced by a trusted source.
    """
    with open(filename, 'rb') as source:
        raw_bytes = source.read()
    return pickle.loads(raw_bytes)
# Round-trip check: reload the model that was pickled to
# 'trained_xgb_model.pkl' so the evaluation below runs on the persisted copy.
# NOTE: load_model unpickles the file; only load pickles you created yourself.
loaded_model = load_model('trained_xgb_model.pkl')

In [4]:
# Separate the validation frame into a feature matrix and the target vector.
val_target_column = 'Attrition_rate'
y_val = validation_data[val_target_column].values
X_val = validation_data.drop(columns=[val_target_column]).values

In [12]:
# Score the reloaded model on the hold-out validation split.
y_pred_val = loaded_model.predict(X_val)
# RMSE is computed as sqrt(MSE): the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_val, y_pred_val)))
print(f"RMSE on validation set (loaded model): {rmse}")

RMSE on validation set (loaded model): 0.1807476509770369




In [13]:
# Read the pre-processed test split and expose its values as a plain array.
test_csv_path = "Processed Data/test.csv"
test_data = pd.read_csv(test_csv_path)
X_test = test_data.values

In [14]:
# Predict the target for the test rows. This uses the in-memory `model`
# rather than `loaded_model`, the copy reloaded from the pickle file.
y_pred_test = model.predict(X_test)

In [15]:
# Attach the predictions to the test frame (in place) and export it without
# the pandas index column.
prediction_column = 'Attrition_rate'
predictions_csv_path = 'test_predictions2.csv'

test_data[prediction_column] = y_pred_test
test_data.to_csv(predictions_csv_path, index=False)

Cross Validation

In [17]:
import optuna
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Computed as sqrt(MSE) rather than ``mean_squared_error(..., squared=False)``
    because the ``squared`` flag was deprecated in scikit-learn 1.4 and removed
    in 1.6. The two forms agree for a single-output target, which is how this
    notebook uses it.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


def cross_validation(X_train, y_train, X_val, y_val, n_splits=5, **kwargs):
    """Run K-fold cross-validation and score every fold model on a hold-out set.

    A fresh ``xgb`` model is fitted on each training fold. Each fitted model is
    scored on its own held-out fold and on the fixed ``(X_val, y_val)`` split.

    Args:
        X_train: Training features, indexable by integer position after
            conversion with ``np.asarray``.
        y_train: Training targets aligned with ``X_train``.
        X_val: Hold-out validation features.
        y_val: Hold-out validation targets.
        n_splits: Number of folds passed to ``KFold``.
        **kwargs: Forwarded unchanged to ``model.fit`` for every fold.

    Returns:
        tuple: ``(avg_val_rmse, best_model, avg_cv_rmse)`` where ``best_model``
        is the fold model with the lowest hold-out validation RMSE.
    """
    # KFold yields positional indices; coercing to ndarray keeps the fancy
    # indexing below valid even if a DataFrame/Series or list is passed in.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # Fixed seed so the fold assignment is identical across calls (and trials).
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_rmse_scores = []
    val_rmse_scores = []
    trained_models = []

    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        print(f"Fold {i+1}:")
        X_fold_train, X_fold_test = X_train[train_index], X_train[test_index]
        y_fold_train, y_fold_test = y_train[train_index], y_train[test_index]
        print("End task input")

        fold_model = xgb()
        fold_model.fit(X_fold_train, y_fold_train, **kwargs)
        print("End task fit")

        # Error on the fold that was held out from this fit.
        cv_rmse = _rmse(y_fold_test, fold_model.predict(X_fold_test))
        cv_rmse_scores.append(cv_rmse)

        # Error on the fixed validation split, used later to pick the best model.
        val_rmse = _rmse(y_val, fold_model.predict(X_val))
        val_rmse_scores.append(val_rmse)
        trained_models.append(fold_model)

        print(f"  Training RMSE: {_rmse(y_fold_train, fold_model.predict(X_fold_train))}")
        print(f"  Cross-Validation RMSE: {cv_rmse}")
        print(f"  Validation RMSE: {val_rmse}\n")

    avg_cv_rmse = np.mean(cv_rmse_scores)
    avg_val_rmse = np.mean(val_rmse_scores)

    print(f"Average Cross-Validation RMSE: {avg_cv_rmse}")
    print(f"Average Validation RMSE: {avg_val_rmse}")

    best_model_index = np.argmin(val_rmse_scores)
    best_model = trained_models[best_model_index]
    return avg_val_rmse, best_model, avg_cv_rmse

def objective(trial):
    """Optuna objective: average hold-out validation RMSE of a 5-fold CV run.

    Hyper-parameters are sampled from ``trial`` and forwarded as keyword
    arguments to ``cross_validation`` (which passes them on to ``model.fit``).

    Args:
        trial: The ``optuna.trial.Trial`` supplied by ``study.optimize``.

    Returns:
        float: The average validation RMSE, i.e. the single value the
        study minimises.
    """
    params = {
        'subsample_cols': trial.suggest_float('subsample_cols', 0.6, 0.8),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'depth': trial.suggest_int('depth', 3, 5),
        'min_leaf': trial.suggest_int('min_leaf', 2, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
        'boosting_rounds': trial.suggest_int('boosting_rounds', 10, 30),
        'lambda_': trial.suggest_float('lambda_', 1, 2.5),
        'gamma': trial.suggest_float('gamma', 0, 1.3),
        'eps': trial.suggest_float('eps', 0.08, 0.1)
    }
    print("Current Parameters:", params)

    avg_val_rmse, _fold_model, avg_cv_rmse = cross_validation(
        X_train, y_train, X_val, y_val, n_splits=5, **params
    )

    # Keep the secondary metric with the trial so it can still be inspected
    # afterwards (study.trials[i].user_attrs['avg_cv_rmse']).
    trial.set_user_attr('avg_cv_rmse', float(avg_cv_rmse))

    # A single-objective study requires exactly one float. Returning the
    # (rmse, model, rmse) tuple made Optuna mark every trial as failed, which
    # left the study without any best trial.
    return float(avg_val_rmse)


In [18]:
# Minimise the validation RMSE returned by `objective`.
# TPE is Optuna's default sampler; it is created explicitly here only to pin
# its seed, so the sequence of suggested hyper-parameters is reproducible
# across kernel restarts.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

[I 2024-12-04 14:44:55,218] A new study created in memory with name: no-name-cd5c6ae5-1581-41d4-8798-014327cc95e3


Fold 1:
End task input


[W 2024-12-04 15:09:50,550] Trial 0 failed with parameters: {'subsample_cols': 0.7380539246355304, 'min_child_weight': 6, 'depth': 8, 'min_leaf': 4, 'learning_rate': 0.07223563737960809, 'boosting_rounds': 36, 'lambda_': 2.3200643177388707, 'gamma': 0.7789896537593365, 'eps': 0.08191601305731473} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\HP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\HP\AppData\Local\Temp\ipykernel_2260\3017538929.py", line 52, in objective
    avg_val_rmse, best_model, avg_cv_rmse =  cross_validation(X_train, y_train, X_val, y_val, n_splits=5, **params)
                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\AppDat

KeyboardInterrupt: 

In [None]:
# Summarise the best completed trial of the study.
best_trial = study.best_trial
print(f"Best Parameters: {best_trial.params}")
print(f"Best Validation RMSE: {best_trial.value}")

In [None]:
# `best_model` only ever existed as a local variable inside the functions
# above, so referencing it here raised NameError. Rebuild the final model by
# refitting on the full training split with the best hyper-parameters found
# by the study (the same keyword arguments the trials passed to `fit`).
best_model = xgb()
best_model.fit(X_train, y_train, **study.best_params)

# `save_model` is defined earlier in this notebook; it is reused here instead
# of being redefined.
save_model(best_model, 'best_xgb_model.pkl')