In [23]:
import sys
import os
import pandas as pd
# Add your project folder path to sys.path
sys.path.append(os.path.abspath("F:/Customer-Retention-Prediction"))
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV ,StratifiedKFold
from utils.utils import save_model_or_data,load_data
from sklearn.metrics import accuracy_score


In [24]:
xgb_top = XGBClassifier(
    colsample_bytree=0.8,     # use 80% of features per tree
    reg_alpha=0.1,            # L1 regularization
    reg_lambda=1.0,          # L2 regularization
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
param_grid = {
    'n_estimators': [200, 400, 600],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}


In [25]:
X_train=pd.read_csv("F:\\Customer-Retention-Prediction\\data\\Processed_data\\X_train_scaled.csv")
y_train=pd.read_csv("F:\\Customer-Retention-Prediction\\data\\Processed_data\\y_train.csv")
X_val=pd.read_csv("F:\\Customer-Retention-Prediction\\data\\Processed_data\\X_val_scaled.csv")
y_val=pd.read_csv("F:\\Customer-Retention-Prediction\\data\\Processed_data\\y_val.csv")  

##### Feature selection

In [26]:
# Train initial model
xgb_temp = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_temp.fit(X_train, y_train)

# Get feature importances
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_temp.feature_importances_
}).sort_values(by='importance', ascending=False)

print(importances)

Parameters: { "use_label_encoder" } are not used.



                       feature  importance
3                  Total_Spend    0.092746
2                Annual_Income    0.087605
8      Num_of_Support_Contacts    0.087033
10      Last_Purchase_Days_Ago    0.085353
1                       Gender    0.083886
0                          Age    0.080224
5             Num_of_Purchases    0.078081
6   Average_Transaction_Amount    0.077283
12          Promotion_Response    0.076784
9           Satisfaction_Score    0.076527
7               Num_of_Returns    0.073268
4            Years_as_Customer    0.068351
11                Email_Opt_In    0.032860


In [27]:
# Keep top N features
top_features = importances['feature'].iloc[:4]
X_train_top = X_train[top_features]
X_val_top = X_val[top_features]

In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(xgb_top, param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
grid.fit(X_train_top, y_train, eval_set=[(X_val_top, y_val)], verbose=True)

print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[0]	validation_0-logloss:0.69477
[1]	validation_0-logloss:0.69617
[2]	validation_0-logloss:0.69743
[3]	validation_0-logloss:0.69930
[4]	validation_0-logloss:0.70040
[5]	validation_0-logloss:0.70221
[6]	validation_0-logloss:0.70315
[7]	validation_0-logloss:0.70476
[8]	validation_0-logloss:0.70699
[9]	validation_0-logloss:0.70924
[10]	validation_0-logloss:0.71136
[11]	validation_0-logloss:0.71271
[12]	validation_0-logloss:0.71360
[13]	validation_0-logloss:0.71397
[14]	validation_0-logloss:0.71566
[15]	validation_0-logloss:0.71795
[16]	validation_0-logloss:0.71895
[17]	validation_0-logloss:0.71885
[18]	validation_0-logloss:0.71856
[19]	validation_0-logloss:0.72019
[20]	validation_0-logloss:0.72022
[21]	validation_0-logloss:0.72009
[22]	validation_0-logloss:0.72151
[23]	validation_0-logloss:0.72175
[24]	validation_0-logloss:0.72322
[25]	validation_0-logloss:0.72330
[26]	validation_0-logloss:0.72316
[27]	validation_0-logloss:0.72

Parameters: { "use_label_encoder" } are not used.



[32]	validation_0-logloss:0.72920
[33]	validation_0-logloss:0.72953
[34]	validation_0-logloss:0.73011
[35]	validation_0-logloss:0.73002
[36]	validation_0-logloss:0.72982
[37]	validation_0-logloss:0.73001
[38]	validation_0-logloss:0.73081
[39]	validation_0-logloss:0.73196
[40]	validation_0-logloss:0.73176
[41]	validation_0-logloss:0.73313
[42]	validation_0-logloss:0.73400
[43]	validation_0-logloss:0.73389
[44]	validation_0-logloss:0.73611
[45]	validation_0-logloss:0.73811
[46]	validation_0-logloss:0.73928
[47]	validation_0-logloss:0.73925
[48]	validation_0-logloss:0.73869
[49]	validation_0-logloss:0.73889
[50]	validation_0-logloss:0.73918
[51]	validation_0-logloss:0.73984
[52]	validation_0-logloss:0.74005
[53]	validation_0-logloss:0.73991
[54]	validation_0-logloss:0.74024
[55]	validation_0-logloss:0.73990
[56]	validation_0-logloss:0.73967
[57]	validation_0-logloss:0.74040
[58]	validation_0-logloss:0.74026
[59]	validation_0-logloss:0.74114
[60]	validation_0-logloss:0.74209
[61]	validatio

In [29]:
print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

best_xgb = grid.best_estimator_

y_val_pred = best_xgb.predict(X_val_top)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 600, 'subsample': 1.0}
Best CV Score: 0.534375
Validation Accuracy: 0.5


In [30]:
y_train_pred = best_xgb.predict(X_train_top)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))

Training Accuracy: 0.990625
