In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import optuna
from catboost import CatBoostClassifier
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [35]:
# Load the raw sales workbook into a DataFrame.
db = pd.read_excel("Retail-Supply-Chain-Sales-Dataset.xlsx")

# Columns are picked by position: 9, 4, 6, 7, 11, 12, then 16 through 21
# (the range end is exclusive).
# NOTE(review): positional selection silently picks the wrong columns if the
# workbook layout changes — confirm the positions against the sheet header.
selected_positions = [9, 4, 6, 7, 11, 12, *range(16, 22)]
target = db.iloc[:, selected_positions].copy()

ship_dates = db['Ship Date']
order_dates = db['Order Date']

# Date-derived features; the `.dt` accessor requires both columns to be
# datetime-typed. 'Duration' is still a timedelta at this point.
target['Duration'] = ship_dates - order_dates
target['Ship Day'] = ship_dates.dt.dayofweek
target['Ship Month'] = ship_dates.dt.month
target['Order Month'] = order_dates.dt.month

# Binary label: 'Yes' -> 1, 'Not' -> 0. Any other raw value becomes NaN.
# NOTE(review): confirm the negative class is really spelled 'Not' in the data.
target['Returned'] = db['Returned'].map({'Yes':1,'Not':0})

In [36]:
def extract_brand(product_name):
    """Guess a product's brand from its name.

    The brand is taken to be the first whitespace-separated word of the
    name, with leading/trailing commas removed and title-cased.

    Args:
        product_name: The product name. Anything that is not a ``str``
            (e.g. ``None`` or a NaN float from a missing cell) is accepted.

    Returns:
        The title-cased first word, or ``"Unknown"`` when the input is not a
        string, is empty/whitespace-only, or its first word consists only
        of commas.
    """
    if not isinstance(product_name, str):
        return "Unknown"
    words = product_name.split()
    # An empty or whitespace-only name has no first word; indexing [0]
    # would raise IndexError, so fall back to the sentinel instead.
    if not words:
        return "Unknown"
    brand = words[0].strip(",").title()
    # A first token made only of commas strips down to "", which is not a brand.
    return brand or "Unknown"
# Collapse the timedelta 'Duration' column to a whole number of days.
# Not re-runnable as-is: once the column holds integers, `.dt` no longer applies.
target['Duration'] = target['Duration'].dt.days

# Derive a 'Brand' feature from each product name via extract_brand.
target['Brand'] = target['Product Name'].apply(extract_brand)

In [37]:
# Categorical columns to be replaced by integer codes.
encoding_cols = ['Ship Mode','Segment','City','Sub-Category','Brand','Region']

# Each column gets its own freshly fitted LabelEncoder; only the encoder of
# the last column stays bound to `le` once the loop ends.
# NOTE(review): the encoders are fitted on the full frame, before any
# train/test split is made.
for col in encoding_cols:
    le = LabelEncoder()
    encoded_values = le.fit_transform(target[col])
    target[col] = encoded_values

# Remove the columns that are not passed on to the models. The drop is done
# in place, so re-running this cell raises KeyError once they are gone.
dropped_cols = ['Product Name','Customer Name','Sales']
target.drop(columns=dropped_cols, inplace=True)

In [38]:
# Separate the label from the feature matrix.
y = target['Returned']
X = target.drop(columns=['Returned'])

# 80/20 hold-out split with a fixed seed, stratified on the label so both
# parts keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Hyperparameters for the standalone XGBoost model.
xgb_params = dict(
    n_estimators=250,
    max_depth=25,
    learning_rate=0.3,
    eval_metric='logloss',
    random_state=42,
)
XGB_final = XGBClassifier(**xgb_params)

# The hold-out set is passed as eval_set with logging silenced.
# NOTE(review): no early stopping is configured in this call, so the eval
# set appears to be used for metric tracking only — confirm it is needed.
XGB_final.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    verbose=False,
)

# Hold-out accuracy of the fitted model.
y_pred = XGB_final.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy: ",acc)

Accuracy:  0.9514757378689345


In [57]:
# Hyperparameters for the standalone LightGBM model.
# NOTE(review): per the LightGBM parameter docs, row subsampling is only
# applied when subsample_freq (bagging_freq) is non-zero; it is not set here,
# so subsample=0.7 may have no effect — confirm.
lgbm_params = {
    'verbose': -1,
    'random_seed': 42,
    'n_estimators': 370,
    'learning_rate': 0.21,
    'max_depth': 11,
    'num_leaves': 60,
    'min_child_samples': 10,
    'subsample': 0.7,
}
LGBM = LGBMClassifier(**lgbm_params)
LGBM.fit(x_train, y_train)

# Hold-out accuracy of the fitted model.
y_pred_Lgbm = LGBM.predict(x_test)
acc_LGBM = accuracy_score(y_test, y_pred_Lgbm)
print("Accuracy:",acc_LGBM)

Accuracy: 0.9609804902451226


In [11]:
# Standalone random forest: 100 gini trees with unlimited depth, each grown
# on the full training set (bootstrap disabled).
RF_model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=3,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42,
)
RF_model.fit(x_train, y_train)

# Hold-out accuracy of the fitted forest.
y_pred_RF = RF_model.predict(x_test)
acc_RF = accuracy_score(y_test, y_pred_RF)
print("Accuracy",acc_RF)

Accuracy 0.951975987993997


In [10]:
# Standalone CatBoost model, trained silently (verbose=0) with a fixed seed.
CATB = CatBoostClassifier(
    iterations=270,
    learning_rate=0.73,
    depth=9,
    border_count=32,
    random_seed=42,
    verbose=0,
)
CATB.fit(x_train, y_train)

# Note: this rebinds the shared name `y_pred` to the CatBoost predictions.
y_pred = CATB.predict(x_test)

# accuracy_score takes (y_true, y_pred); pass the ground truth first, as the
# other model cells do. Accuracy is symmetric, so the reported number is the
# same as before.
acc_CAT = accuracy_score(y_test, y_pred)
print('CATBoost Accuracy: ',acc_CAT)

CATBoost Accuracy:  0.9564782391195598


In [46]:
# Level-0 estimators for the stacking ensemble. Each one is built as a fresh,
# unfitted instance; the models fitted in the standalone cells are not reused.

# Random forest: same settings as the standalone RF_model cell.
rf_base = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=3,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42,
)

# XGBoost.
# NOTE(review): eval_metric is 'error' here but 'logloss' in the standalone
# XGB_final cell — confirm the difference is intentional.
xgb_base = XGBClassifier(
    booster='gbtree',
    n_estimators=250,
    max_depth=25,
    learning_rate=0.3,
    eval_metric='error',
    random_state=42,
)

# CatBoost.
# NOTE(review): iterations is 300 here but 270 in the standalone CATB cell —
# confirm the difference is intentional.
cb_base = CatBoostClassifier(
    iterations=300,
    learning_rate=0.73,
    depth=9,
    border_count=32,
    random_seed=42,
    verbose=0,
)

# LightGBM: same settings as the standalone LGBM cell.
lgbm_base = LGBMClassifier(
    n_estimators=370,
    learning_rate=0.21,
    max_depth=11,
    num_leaves=60,
    min_child_samples=10,
    subsample=0.7,
    random_seed=42,
    verbose=-1,
)

# (name, estimator) pairs in the form StackingClassifier expects.
base_learners = [
    ('RF', rf_base),
    ('XGB', xgb_base),
    ('CB', cb_base),
    ('LGBM', lgbm_base),
]
# Final (level-1) estimator for the stack: a small, shallow LightGBM model
# (depth 3, 12 leaves, 150 trees).
meta_params = {
    'random_seed': 42,
    'n_estimators': 150,
    'learning_rate': 0.07,
    'max_depth': 3,
    'num_leaves': 12,
    'min_child_samples': 8,
    'subsample': 0.7,
}
meta_learner = LGBMClassifier(**meta_params)

# Stack the base learners under the meta learner. The base-learner
# predictions fed to the meta learner come from 3-fold cross-validation, and
# passthrough=True also hands it the original feature columns.
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    passthrough=True,
    cv=3,
    n_jobs=-1,
)
stack_model.fit(x_train, y_train)

# Hold-out accuracy of the ensemble (rebinds the shared name `y_pred`).
y_pred = stack_model.predict(x_test)
acc_stack = accuracy_score(y_test, y_pred)
print("Accuracy Stack: ",acc_stack)

Accuracy Stack:  0.9629814907453726


In [54]:
# 5-fold cross-validated accuracy of the stacked model on the full data set
# (X, y), i.e. including the rows held out as the test split above.
# NOTE(review): the recorded CV accuracy is well below the hold-out accuracy
# printed for the stack; with an integer cv the folds are not shuffled, so the
# gap may come from row ordering or related rows landing on both sides of the
# random split — worth investigating.
cross_val = cross_val_score(
    stack_model,
    X,
    y,
    scoring='accuracy',
    cv=5,
)
print(f"Accuracy: {cross_val.mean():.4f} ± {cross_val.std():.4f}")

Accuracy: 0.9187 ± 0.0015
