In [7]:
import optuna
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from xgboost import plot_importance
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

Data Preparation

In [8]:
db = pd.read_excel("Retail-Supply-Chain-Sales-Dataset.xlsx")
target = db.iloc[:,[9,4,6,7,11,12] + list(range(16,22))].copy()
target['Duration'] =db['Ship Date']-db['Order Date']
target['Ship Day'] = db['Ship Date'].dt.dayofweek
target['Ship Month']= db['Ship Date'].dt.month
target['Order Month'] = db['Order Date'].dt.month
target['Returned']=db['Returned'].map({'Yes':1,'Not':0})

def extract_brand(product_name):
    if isinstance(product_name,str):
        return product_name.split()[0].strip(",").title()
    return "Unknown"
target['Brand']=target['Product Name'].apply(extract_brand)
target['Duration']=target['Duration'].dt.days
encoding_cols = ['Ship Mode','Segment','City','Sub-Category','Brand','Region']

for cols in encoding_cols:
    for cols in encoding_cols:
        le = LabelEncoder()
        target[cols] = le.fit_transform(target[cols])
target.drop(columns=['Product Name','Customer Name','Sales'],inplace= True)

X = target.drop(columns=['Returned'])
y = target['Returned']
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
x_est,x_final,y_est,y_final = train_test_split(x_train,y_train,test_size=0.2,random_state=42,stratify =y_train )


Optuna Tuning For XGBoost

In [10]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 250, 350]),
        'max_depth': trial.suggest_categorical('max_depth', [5, 15, 25]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.1, 0.3]),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'eval_metric': 'logloss',
        'random_state': 42
    }
    model = XGBClassifier(**params)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    return accuracy_score(y_test, preds)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)
print("\n🔧 Best XGBoost Parameters:")
print(study.best_params)

[I 2025-06-27 22:36:21,362] A new study created in memory with name: no-name-bca55835-2782-4ddd-ac86-c584358b2edc
[I 2025-06-27 22:36:21,692] Trial 0 finished with value: 0.919959979989995 and parameters: {'n_estimators': 250, 'max_depth': 25, 'learning_rate': 0.01, 'subsample': 0.5572866586572861, 'colsample_bytree': 0.6816710794916206, 'gamma': 4.47876309545138, 'reg_alpha': 1.850725997796826, 'reg_lambda': 4.760376422921324}. Best is trial 0 with value: 0.919959979989995.
[I 2025-06-27 22:36:21,969] Trial 1 finished with value: 0.919959979989995 and parameters: {'n_estimators': 100, 'max_depth': 15, 'learning_rate': 0.01, 'subsample': 0.8288675217618944, 'colsample_bytree': 0.5937501904651324, 'gamma': 0.8532033061017918, 'reg_alpha': 1.7364792372494575, 'reg_lambda': 2.047570811489779}. Best is trial 0 with value: 0.919959979989995.
[I 2025-06-27 22:36:22,263] Trial 2 finished with value: 0.9224612306153076 and parameters: {'n_estimators': 250, 'max_depth': 5, 'learning_rate': 0.3,


🔧 Best XGBoost Parameters:
{'n_estimators': 250, 'max_depth': 5, 'learning_rate': 0.3, 'subsample': 0.8735719385283629, 'colsample_bytree': 0.9083390944557697, 'gamma': 3.2559685737668214, 'reg_alpha': 4.373852947175868, 'reg_lambda': 0.9519661108223093}


GridSearchCV : LightGBM

In [11]:
lgbm_param={
    'n_estimators': [100, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [4, 6, 8],
    'num_leaves': [15, 31, 63],
    'min_child_samples': [10, 20],
    'subsample': [0.8, 1.0]
}

LGBM_model=LGBMClassifier(random_seed=42)
lgbm_grid=GridSearchCV(
    estimator=LGBM_model,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    param_grid=lgbm_param
)
lgbm_grid.fit(x_train,y_train)
print('Best Params: ',lgbm_grid.best_params_)

[LightGBM] [Info] Number of positive: 640, number of negative: 7355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000317 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 860
[LightGBM] [Info] Number of data points in the train set: 7995, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080050 -> initscore=-2.441667
[LightGBM] [Info] Start training from score -2.441667
Best Params:  {'learning_rate': 0.1, 'max_depth': 8, 'min_child_samples': 10, 'n_estimators': 300, 'num_leaves': 63, 'subsample': 0.8}


GridSearchCV : Random Forest

In [None]:
rf_params={
    'n_estimators':[100,300],
    'max_depth':[10,20,None],
    'min_samples_split':[2,5],
    'min_samples_leaf':[1,2],
    'bootstrap':[True,False]
}
rf_search=RandomForestClassifier(random_state=42)
rf_best=GridSearchCV(
    estimator= rf_search,
    param_grid= rf_params,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)
rf_best.fit(x_train,y_train)
print("Random Forest: ",rf_best.best_params_)

GridSearchCV : CATBoost

In [None]:
cat_param_grid = {
    'iterations':[100,300],
    'depth':[4,6,8],
    'learning_rate':[0.01,0.1],
    'border_count':[32,64],
    'verbose':[0]
}
CAT_model = CatBoostClassifier(random_seed=42)
CAT_grid=GridSearchCV(
    estimator=CAT_model,
    param_grid=cat_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1
)
CAT_grid.fit(x_train,y_train)
print("Best Case: ",CAT_grid.best_params_)