In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw export. Kept in one constant so the notebook can be pointed at
# another machine's copy by editing a single line.
DATA_PATH = r"C:\Users\user\Downloads\Amazon Sale Report.csv\Amazon Sale Report.csv"

# low_memory=False makes pandas read the file in one pass and infer each column's dtype
# from all rows instead of chunk by chunk.
# NOTE(review): the truncated warning recorded under this cell looks like a mixed-type
# DtypeWarning — confirm it disappears on re-run.
data = pd.read_csv(DATA_PATH, low_memory=False)

  data = pd.read_csv("C:\\Users\\user\\Downloads\\Amazon Sale Report.csv\\Amazon Sale Report.csv")


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128975 entries, 0 to 128974
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   index               128975 non-null  int64  
 1   Order ID            128975 non-null  object 
 2   Date                128975 non-null  object 
 3   Status              128975 non-null  object 
 4   Fulfilment          128975 non-null  object 
 5   Sales Channel       128975 non-null  object 
 6   ship-service-level  128975 non-null  object 
 7   Style               128975 non-null  object 
 8   SKU                 128975 non-null  object 
 9   Category            128975 non-null  object 
 10  Size                128975 non-null  object 
 11  ASIN                128975 non-null  object 
 12  Courier Status      122103 non-null  object 
 13  Qty                 128975 non-null  int64  
 14  currency            121180 non-null  object 
 15  Amount              121180 non-nul

### Data Cleaning & Transformation

In [4]:

def preprocess_orders(data: pd.DataFrame, date_format=None, dayfirst: bool = True) -> pd.DataFrame:
    """
    Clean the raw orders frame and derive calendar features from 'Date'.

    Steps:
    - Drop identifier-like / unwanted columns (only those actually present)
    - Rename the 'Sales Channel ' header (trailing space) to 'Sales Channel'
    - Parse 'Date' and derive order_day, order_month, order_weekday, is_weekend
    - Drop the original 'Date' column

    Parameters
    ----------
    data : pd.DataFrame
        Raw orders. The frame passed in is not modified; a new frame is returned.
    date_format : str, optional
        Explicit strptime format for the 'Date' column, e.g. "%m-%d-%y".
        When given it is used instead of format inference, which avoids the
        parsing warning and any day/month ambiguity. Default None keeps the
        previous inference-based behaviour.
    dayfirst : bool, default True
        Forwarded to ``pd.to_datetime`` when ``date_format`` is None.

    Returns
    -------
    pd.DataFrame
        Cleaned frame. If 'Date' was present it is replaced by:
        order_day (weekday name), order_month (month name),
        order_weekday (0 = Monday ... 6 = Sunday) and is_weekend (0/1).
        Dates that fail to parse become NaT, so their derived features are
        missing and is_weekend is 0.
    """

    # 1. Drop unwanted columns if they exist
    DROP_COLS = [
        "index",
        "Order ID",           # pure identifier
        "SKU",                # high cardinality ID
        "promotion-ids",      # sparse / leakage-prone
        "Unnamed: 22",
        "ASIN",
        "fulfilled-by",
        "ship-postal-code",
        "currency",
        "Size"
    ]
    present = [c for c in DROP_COLS if c in data.columns]
    data = data.drop(columns=present)

    # 2. Normalise the header with a trailing space.
    #    rename() ignores missing keys, so no existence check is needed, and
    #    returning a new frame avoids in-place mutation.
    data = data.rename(columns={'Sales Channel ': 'Sales Channel'})

    # 3. Parse 'Date' and derive calendar features
    if 'Date' in data.columns:
        if date_format is not None:
            parsed = pd.to_datetime(data["Date"], format=date_format, errors="coerce")
        else:
            parsed = pd.to_datetime(data["Date"], dayfirst=dayfirst, errors="coerce")

        weekday = parsed.dt.weekday
        data = data.assign(
            order_day=parsed.dt.day_name(),
            order_month=parsed.dt.month_name(),
            order_weekday=weekday,
            # NaN weekdays (unparseable dates) are not in [5, 6] and map to 0
            is_weekend=weekday.isin([5, 6]).astype(int),
        ).drop(columns=["Date"])

    return data


In [5]:
# Run the cleaning step. This rebinds `data`, so the raw frame loaded above is no longer
# reachable under this name afterwards.
data = preprocess_orders(data)
# Plain-text preview of the first rows of the cleaned frame.
print(data.head())

  data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")


                         Status Fulfilment Sales Channel ship-service-level  \
0                     Cancelled   Merchant     Amazon.in           Standard   
1  Shipped - Delivered to Buyer   Merchant     Amazon.in           Standard   
2                       Shipped     Amazon     Amazon.in          Expedited   
3                     Cancelled   Merchant     Amazon.in           Standard   
4                       Shipped     Amazon     Amazon.in          Expedited   

     Style       Category Courier Status  Qty  Amount    ship-city  \
0   SET389            Set            NaN    0  647.62       MUMBAI   
1  JNE3781          kurta        Shipped    1  406.00    BENGALURU   
2  JNE3371          kurta        Shipped    1  329.00  NAVI MUMBAI   
3    J0341  Western Dress            NaN    0  753.33   PUDUCHERRY   
4  JNE3671            Top        Shipped    1  574.00      CHENNAI   

    ship-state ship-country    B2B order_day order_month  order_weekday  \
0  MAHARASHTRA           IN  

### Feature Engineering

In [6]:
# Separate the column names by dtype: numeric columns vs. everything else
numerical_cols = data.select_dtypes(include="number").columns
categorical_cols = data.select_dtypes(exclude="number").columns

print("numerical_cols:\n", numerical_cols)
print()
print("catgeorical_cols:\n", categorical_cols)

numerical_cols:
 Index(['Qty', 'Amount', 'order_weekday', 'is_weekend'], dtype='object')

catgeorical_cols:
 Index(['Status', 'Fulfilment', 'Sales Channel', 'ship-service-level', 'Style',
       'Category', 'Courier Status', 'ship-city', 'ship-state', 'ship-country',
       'B2B', 'order_day', 'order_month'],
      dtype='object')


In [7]:
# 'Amount' is the regression target, so it must not be passed to the model as a feature.
# errors="ignore" keeps this cell re-runnable: without it a second run raises KeyError
# because 'Amount' has already been removed from the index.
numerical_cols = numerical_cols.drop(['Amount'], errors="ignore")
numerical_cols

Index(['Qty', 'order_weekday', 'is_weekend'], dtype='object')

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from category_encoders.count import CountEncoder

def build_tree_preprocessor(num_cols, cat_cols):
    """
    Build an unfitted ColumnTransformer for tree-based models.

    Numeric columns are median-imputed only (no scaling step). Categorical
    columns are imputed with the most frequent value and then passed through
    CountEncoder(normalize=True).

    Parameters
    ----------
    num_cols : columns routed to the numeric pipeline
    cat_cols : columns routed to the categorical pipeline

    Returns
    -------
    ColumnTransformer with the transformers named "num" and "cat".
    """
    # Trees do not need feature scaling, so imputation is the only numeric step
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
    ]

    # Impute first, then replace each category with its normalised count (frequency encoding)
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", CountEncoder(normalize=True)),
    ]

    transformers = [
        ("num", Pipeline(numeric_steps), num_cols),
        ("cat", Pipeline(categorical_steps), cat_cols),
    ]
    return ColumnTransformer(transformers)


In [9]:
# Tree-oriented preprocessor: imputation + count encoding only, no feature scaling
preprocessor = build_tree_preprocessor(num_cols=numerical_cols, cat_cols=categorical_cols)

In [10]:
preprocessor

In [12]:

# Split the cleaned frame into the regression target and the input features
TARGET = "Amount"

y = data[TARGET]
X = data.drop(columns=TARGET)


In [13]:
# Fill missing target values with the median of 'Amount'.
# The result is assigned rather than using inplace=True: `y` was taken straight out of
# `data`, and an in-place fill on it can write through to (or warn about) the parent frame.
# NOTE(review): the median is computed before the train/test split and the filled rows get
# a fabricated target — consider dropping rows with a missing 'Amount' instead.
y = y.fillna(value=data['Amount'].median())


In [14]:
X.shape

(128975, 16)

In [15]:
y.shape

(128975,)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
X_train.shape

(103180, 16)

In [18]:
y_train.shape

(103180,)

In [19]:
# apply preprocessor and take the sample records

def process_and_sample(X_train, X_test, y_train, sample_frac=0.1, n_bins=10, preprocessor=None, random_state=42):
    """
    Fit the preprocessor on the training features, transform train and test,
    and draw a target-stratified sub-sample of the processed training data.

    Parameters
    ----------
    X_train, X_test : feature sets; the preprocessor is fitted on X_train only
        and then applied to X_test.
    y_train : training target, binned into quantiles for stratification.
    sample_frac : float, default 0.1
        Fraction of the training rows to keep in the sample; must satisfy
        0 < sample_frac < 1.
    n_bins : int, default 10
        Number of quantile bins requested from pd.qcut. Duplicate bin edges
        are dropped, so fewer bins may result.
    preprocessor : transformer exposing fit_transform / transform. Required,
        despite the None default kept for signature compatibility.
    random_state : int, default 42
        Seed forwarded to train_test_split.

    Returns
    -------
    tuple
        (X_train_processed, X_test_processed, X_train_sampled, y_train_sampled)

    Raises
    ------
    ValueError
        If preprocessor is None or sample_frac is outside the open interval (0, 1).
    """
    # Fail fast with a clear message instead of an AttributeError on None or an
    # error raised deep inside train_test_split.
    if not 0 < sample_frac < 1:
        raise ValueError(f"sample_frac must be strictly between 0 and 1, got {sample_frac!r}")
    if preprocessor is None:
        raise ValueError("preprocessor is required: pass a transformer with fit_transform/transform")

    # 1. Fit preprocessor on full X_train, then apply the fitted transform to X_test
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # 2. Create quantile bins of the continuous target for stratified sampling
    y_bins = pd.qcut(y_train, q=n_bins, duplicates="drop")

    # 3. Stratified sampling: the "train" side of this split is the kept sample
    X_train_sampled, _, y_train_sampled, _ = train_test_split(
        X_train_processed,
        y_train,
        test_size=1-sample_frac,   # discard the complement of sample_frac
        stratify=y_bins,
        random_state=random_state
    )

    return X_train_processed, X_test_processed, X_train_sampled, y_train_sampled


In [20]:
# Preprocess the train/test features and keep a 10% target-stratified sample for fast fitting
(
    X_train_processed,
    X_test_processed,
    X_train_sampled,
    y_train_sampled,
) = process_and_sample(
    X_train,
    X_test,
    y_train,
    sample_frac=0.1,
    preprocessor=preprocessor,
)

print("Full X_train_processed shape:", X_train_processed.shape)
print("Sampled X_train_sampled shape:", X_train_sampled.shape)


Full X_train_processed shape: (103180, 16)
Sampled X_train_sampled shape: (10318, 16)


### Model Training

In [22]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [23]:
# AdaBoost baseline fitted on the stratified sample of the processed training data
ada = AdaBoostRegressor(n_estimators=300, learning_rate=0.05, random_state=42)

ada.fit(X_train_sampled, y_train_sampled)


In [24]:
# Score the held-out test set with the AdaBoost model fitted on the sampled training data
y_pred = ada.predict(X_test_processed)


In [25]:

# Regression metrics on the test set for the model trained on the sample
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)   # RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred)

for label, value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse), ("R2:", r2)):
    print(label, value)


MAE: 156.2479183469991
MSE: 46856.94582374051
RMSE: 216.46465259653945
R2: 0.3811679035248383


So, with an R² of only about 0.38, ***AdaBoost is not suitable here!***

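We can try other boosting algorithms to improve the model performance. Before doing so, let's retrain the same AdaBoost model on the full (unsampled) training set to check whether the 10% sample is what limits its performance.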

In [26]:
# Same AdaBoost configuration, this time fitted on the full processed training set.
# This rebinds `ada`, replacing the model that was fitted on the sample.
ada = AdaBoostRegressor(n_estimators=300, learning_rate=0.05, random_state=42)

ada.fit(X_train_processed, y_train)


In [27]:
# Score the held-out test set with the AdaBoost model fitted on the full training data
y_pred = ada.predict(X_test_processed)


In [28]:

# Regression metrics on the test set for the model trained on the full training data
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)   # RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred)

for label, value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse), ("R2:", r2)):
    print(label, value)

MAE: 163.85376781994506
MSE: 48590.33205087038
RMSE: 220.43214840596727
R2: 0.358275352290898


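From the comparison of metrics between sampled and full training data (R² ≈ 0.38 and MAE ≈ 156 on the 10% sample vs. R² ≈ 0.36 and MAE ≈ 164 on the full training set), we can observe that training on a representative sample provides reasonably close performance — here even marginally better — while significantly reducing training time.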

This makes sampling a practical and effective approach for rapid prototyping, experimentation, and learning, allowing faster iteration without a major loss in model quality.

For final production models where maximum accuracy is required, training on the full dataset is preferable, but for early-stage development and analysis, sampled training is both efficient and reliable.
