## ***CatBoost Classifier***

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the raw Amazon sales export. Set the AMAZON_SALES_CSV environment
# variable to point at the file on another machine; when it is unset the
# original author's local path is used, so existing runs behave as before.
DATA_PATH = os.environ.get(
    "AMAZON_SALES_CSV",
    "C:\\Users\\default.DESKTOP-GAN0M7C\\my_projects\\data\\amazon data\\Amazon Sale Report.csv",
)
data = pd.read_csv(DATA_PATH)

  data = pd.read_csv("C:\\Users\\default.DESKTOP-GAN0M7C\\my_projects\\data\\amazon data\\Amazon Sale Report.csv")


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128975 entries, 0 to 128974
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   index               128975 non-null  int64  
 1   Order ID            128975 non-null  object 
 2   Date                128975 non-null  object 
 3   Status              128975 non-null  object 
 4   Fulfilment          128975 non-null  object 
 5   Sales Channel       128975 non-null  object 
 6   ship-service-level  128975 non-null  object 
 7   Style               128975 non-null  object 
 8   SKU                 128975 non-null  object 
 9   Category            128975 non-null  object 
 10  Size                128975 non-null  object 
 11  ASIN                128975 non-null  object 
 12  Courier Status      122103 non-null  object 
 13  Qty                 128975 non-null  int64  
 14  currency            121180 non-null  object 
 15  Amount              121180 non-nul

### Data Cleaning & Transformations

In [4]:
def preprocess_orders(data: pd.DataFrame, date_format: "str | None" = None) -> pd.DataFrame:
    """
    Preprocess the orders dataset without modifying the caller's frame.

    Steps:
    - Drop identifier, sparse and high-cardinality columns (only those present)
    - Rename the 'Sales Channel ' header (trailing space) to 'Sales Channel'
    - Parse 'Date' and derive order_day, order_month, order_weekday, is_weekend
    - Drop the original 'Date' column

    Parameters
    ----------
    data : pd.DataFrame
        Raw orders frame. Every column is optional; a step whose column is
        absent is skipped.
    date_format : str, optional
        Explicit strptime format for 'Date', e.g. "%m-%d-%y". When None
        (the default) the format is inferred with ``dayfirst=True``, which is
        the original behaviour. Pass the real format to avoid ambiguous
        day/month guessing and the warning pandas emits for it.

    Returns
    -------
    pd.DataFrame
        A new frame. Dates that cannot be parsed become NaT, so their
        order_day / order_month / order_weekday are missing and is_weekend is 0.
    """
    drop_cols = [
        "index",
        "Order ID",           # pure identifier
        "SKU",                # high cardinality ID
        "promotion-ids",      # sparse / leakage-prone
        "Unnamed: 22",
        "ASIN",
        "fulfilled-by",
        "ship-postal-code",
        "currency",
        "Size",
    ]
    present = [col for col in drop_cols if col in data.columns]

    # drop() and rename() both return new frames, so the input is never
    # mutated; rename() silently ignores a header that is not there.
    data = data.drop(columns=present).rename(
        columns={"Sales Channel ": "Sales Channel"}
    )

    if "Date" not in data.columns:
        return data

    # errors="coerce" turns unparseable values into NaT instead of raising.
    if date_format is not None:
        order_date = pd.to_datetime(data["Date"], format=date_format, errors="coerce")
    else:
        order_date = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")

    weekday = order_date.dt.weekday  # Monday=0 ... Sunday=6

    # New columns are appended in the same order as before, after 'Date' is removed.
    return data.drop(columns=["Date"]).assign(
        order_day=order_date.dt.day_name(),
        order_month=order_date.dt.month_name(),
        order_weekday=weekday,
        is_weekend=weekday.isin([5, 6]).astype(int),
    )


In [5]:
data = preprocess_orders(data)
# Leave the frame as the cell's last expression so the notebook renders it as
# a table; print() would flatten it to wrapped plain text.
data.head()

                         Status Fulfilment Sales Channel ship-service-level  \
0                     Cancelled   Merchant     Amazon.in           Standard   
1  Shipped - Delivered to Buyer   Merchant     Amazon.in           Standard   
2                       Shipped     Amazon     Amazon.in          Expedited   
3                     Cancelled   Merchant     Amazon.in           Standard   
4                       Shipped     Amazon     Amazon.in          Expedited   

     Style       Category Courier Status  Qty  Amount    ship-city  \
0   SET389            Set            NaN    0  647.62       MUMBAI   
1  JNE3781          kurta        Shipped    1  406.00    BENGALURU   
2  JNE3371          kurta        Shipped    1  329.00  NAVI MUMBAI   
3    J0341  Western Dress            NaN    0  753.33   PUDUCHERRY   
4  JNE3671            Top        Shipped    1  574.00      CHENNAI   

    ship-state ship-country    B2B order_day order_month  order_weekday  \
0  MAHARASHTRA           IN  

  data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")


In [6]:
data.Amount.max()

5584.0

In [7]:
data.Amount.min()

0.0

In [8]:
# Bin Amount into terciles so the three target classes come out roughly equal in size.
data["sales_class"] = pd.qcut(data["Amount"], q=3, labels=["Low", "Medium", "High"])

# Number of rows per class.
data["sales_class"].value_counts()


sales_class
Medium    41555
Low       40498
High      39127
Name: count, dtype: int64

Quantile binning gives roughly balanced classes, which avoids class imbalance.

This is a standard way of turning a regression target into a classification target.

### Feature Engineering

In [13]:
# Split the column names by dtype: numeric columns vs. everything else.
numerical_cols = data.select_dtypes(include="number").columns
categorical_cols = data.select_dtypes(exclude="number").columns

print("numerical_cols:\n", numerical_cols)
print()
print("catgeorical_cols:\n", categorical_cols)

numerical_cols:
 Index(['Qty', 'Amount', 'order_weekday', 'is_weekend'], dtype='object')

catgeorical_cols:
 Index(['Status', 'Fulfilment', 'Sales Channel', 'ship-service-level', 'Style',
       'Category', 'Courier Status', 'ship-city', 'ship-state', 'ship-country',
       'B2B', 'order_day', 'order_month', 'sales_class'],
      dtype='object')


In [14]:
# Amount defines the target, so it must not be used as a feature.
# errors="ignore" keeps this cell re-runnable: a second run, when Amount is
# already gone, would otherwise raise KeyError.
numerical_cols = numerical_cols.drop(["Amount"], errors="ignore")
numerical_cols

Index(['Qty', 'order_weekday', 'is_weekend'], dtype='object')

In [15]:
# The target is not a feature, so remove it from the categorical column list.
# errors="ignore" keeps this cell re-runnable: a second run, when sales_class
# is already gone, would otherwise raise KeyError.
categorical_cols = categorical_cols.drop(["sales_class"], errors="ignore")
categorical_cols

Index(['Status', 'Fulfilment', 'Sales Channel', 'ship-service-level', 'Style',
       'Category', 'Courier Status', 'ship-city', 'ship-state', 'ship-country',
       'B2B', 'order_day', 'order_month'],
      dtype='object')

In [12]:
TARGET = "sales_class"

# Rows without a label cannot be used for supervised training, so keep only labelled rows.
data = data[data[TARGET].notna()]

# Label first, then the features: everything except the label and Amount,
# which the label was derived from.
y = data[TARGET]
X = data.drop(columns=[TARGET, "Amount"])


In [16]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the class labels; le.classes_ keeps the label behind each code.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)


In [17]:
X.shape

(121180, 16)

In [18]:
X.isna().sum()

Status                   0
Fulfilment               0
Sales Channel            0
ship-service-level       0
Style                    0
Category                 0
Courier Status        5136
Qty                      0
ship-city               31
ship-state              31
ship-country            31
B2B                      0
order_day                0
order_month              0
order_weekday            0
is_weekend               0
dtype: int64

In [19]:
y.shape

(121180,)

In [21]:
# as all null values in target are dropped
y.isna().sum()


np.int64(0)

In [23]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the encoded label so both parts keep the
# same class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)



In [24]:
# minimum preprocessing for CatBoost
# Both imputers learn their fill values from the training split only and are
# then applied unchanged to the test split.

from sklearn.impute import SimpleImputer

# Numeric features: fill gaps with the training median.
num_imputer = SimpleImputer(strategy="median").fit(X_train[numerical_cols])
X_train[numerical_cols] = num_imputer.transform(X_train[numerical_cols])
X_test[numerical_cols] = num_imputer.transform(X_test[numerical_cols])

# Categorical features: fill gaps with the most frequent training value.
cat_imputer = SimpleImputer(strategy="most_frequent").fit(X_train[categorical_cols])
X_train[categorical_cols] = cat_imputer.transform(X_train[categorical_cols])
X_test[categorical_cols] = cat_imputer.transform(X_test[categorical_cols])

# Positions of the categorical columns in X_train, passed to the model as cat_features.
cat_feature_indices = list(map(X_train.columns.get_loc, categorical_cols))


In [25]:
# sampling -> optional
# Stratified 25% subsample of the training split (75% is discarded).
# Note: the training cell in this notebook fits on the full X_train / y_train,
# not on X_train_s / y_train_s.
X_train_s, _, y_train_s, _ = train_test_split(
    X_train, y_train, test_size=0.75, random_state=42, stratify=y_train
)



In [27]:
X_test.isna().sum()

Status                0
Fulfilment            0
Sales Channel         0
ship-service-level    0
Style                 0
Category              0
Courier Status        0
Qty                   0
ship-city             0
ship-state            0
ship-country          0
B2B                   0
order_day             0
order_month           0
order_weekday         0
is_weekend            0
dtype: int64

In [28]:
X_train.shape

(96944, 16)

### Model Training 

In [29]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Multiclass CatBoost model; progress is logged every 100 iterations.
cat_clf = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="TotalF1",
    iterations=500,
    learning_rate=0.1,
    depth=8,
    random_seed=42,
    verbose=100,
)

# Categorical columns are identified by their position in X_train.
cat_clf.fit(X_train, y_train, cat_features=cat_feature_indices)

# Evaluate on the held-out split; le.classes_ supplies the label names for the report.
y_pred = cat_clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


0:	learn: 0.9121683	total: 680ms	remaining: 5m 39s
100:	learn: 0.9505182	total: 1m 7s	remaining: 4m 25s
200:	learn: 0.9527747	total: 2m 16s	remaining: 3m 22s
300:	learn: 0.9542899	total: 3m 24s	remaining: 2m 15s
400:	learn: 0.9557399	total: 4m 34s	remaining: 1m 7s
499:	learn: 0.9572333	total: 5m 42s	remaining: 0us
              precision    recall  f1-score   support

        High       0.97      0.97      0.97      7825
         Low       0.98      0.95      0.97      8100
      Medium       0.94      0.97      0.95      8311

    accuracy                           0.96     24236
   macro avg       0.96      0.96      0.96     24236
weighted avg       0.96      0.96      0.96     24236



This model proved to be the best of all the models tried.

By searching for the best hyperparameters, we can probably improve it even further.