In [1]:
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv("C:\\Users\\user\\Downloads\\Amazon Sale Report.csv\\Amazon Sale Report.csv")

  data = pd.read_csv("C:\\Users\\user\\Downloads\\Amazon Sale Report.csv\\Amazon Sale Report.csv")


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128975 entries, 0 to 128974
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   index               128975 non-null  int64  
 1   Order ID            128975 non-null  object 
 2   Date                128975 non-null  object 
 3   Status              128975 non-null  object 
 4   Fulfilment          128975 non-null  object 
 5   Sales Channel       128975 non-null  object 
 6   ship-service-level  128975 non-null  object 
 7   Style               128975 non-null  object 
 8   SKU                 128975 non-null  object 
 9   Category            128975 non-null  object 
 10  Size                128975 non-null  object 
 11  ASIN                128975 non-null  object 
 12  Courier Status      122103 non-null  object 
 13  Qty                 128975 non-null  int64  
 14  currency            121180 non-null  object 
 15  Amount              121180 non-nul

## Data Transformation

In [4]:
# first drop the most unwanted columns
DROP_COLS = [
    "index",
    "Order ID",           # pure identifier
    "SKU",                # high cardinality ID
    "promotion-ids",      # sparse + leakage prone
    "Unnamed: 22",
    "ASIN",
    "fulfilled-by",
    "ship-postal-code",
    "currency",
    "Size"
]

data = data.drop(columns=[c for c in DROP_COLS if c in data.columns])


In [5]:
# rename some columns
data.rename(columns={'Sales Channel ':'Sales Channel'}, inplace=True)

In [6]:
data.columns

Index(['Date', 'Status', 'Fulfilment', 'Sales Channel', 'ship-service-level',
       'Style', 'Category', 'Courier Status', 'Qty', 'Amount', 'ship-city',
       'ship-state', 'ship-country', 'B2B'],
      dtype='object')

In [7]:
# extract some important columsn from existing columns
data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")

data["order_day"] = data["Date"].dt.day_name()
data["order_month"] = data["Date"].dt.month_name()
data["order_weekday"] = data["Date"].dt.weekday
data["is_weekend"] = data["order_weekday"].isin([5,6]).astype(int)


# again drop the irrelevant columns
data = data.drop(columns=["Date"])


  data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")


## Feature Engineering

In [8]:
# seperating numerical and categorical columns
numerical_cols = data.select_dtypes(include="number").columns
print("numerical_cols:\n", numerical_cols)
print()

categorical_cols = data.select_dtypes(exclude="number").columns
print("catgeorical_cols:\n", categorical_cols)

numerical_cols:
 Index(['Qty', 'Amount', 'order_weekday', 'is_weekend'], dtype='object')

catgeorical_cols:
 Index(['Status', 'Fulfilment', 'Sales Channel', 'ship-service-level', 'Style',
       'Category', 'Courier Status', 'ship-city', 'ship-state', 'ship-country',
       'B2B', 'order_day', 'order_month'],
      dtype='object')


In [9]:
numerical_cols = numerical_cols.drop(['Amount'])
numerical_cols

Index(['Qty', 'order_weekday', 'is_weekend'], dtype='object')

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [11]:

# build a function -> general purpose -> to handle ml preprocessings
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

def build_preprocessor(num_cols, cat_cols, scale_numeric=True):
    
    num_steps = [
        ("imputer", SimpleImputer(strategy="median"))
    ]
    
    if scale_numeric:
        num_steps.append(("scaler", StandardScaler()))
    
    num_pipeline = Pipeline(num_steps)

    cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    return ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols)
    ])


In [12]:
# for linear models -> scaling is on!
preprocessor = build_preprocessor(
    num_cols=numerical_cols,
    cat_cols=categorical_cols,
    scale_numeric=True
)

In [13]:

TARGET = "Amount"   # regression target

X = data.drop(columns=[TARGET])
y = data[TARGET]


In [14]:
# fill target with median values
y.fillna(value=data['Amount'].median(), inplace=True)


In [15]:
X.shape

(128975, 16)

In [16]:
y.shape

(128975,)

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


In [18]:
X_train.shape

(103180, 16)

In [19]:
y_train.shape

(103180,)

In [20]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [21]:
X_train_processed.shape

(103180, 9470)

as the original training data is too heavy, so for prototyping and learning purpose, we can take samples to get actual insightsusing short data .

so, sample data -> train model -> real insights

In [22]:
# create bins for stratification (quantiles work best)
y_bins = pd.qcut(y_train, q=10, duplicates="drop")


In [23]:
X_train_sampled, _, y_train_sampled, _ = train_test_split(
    X_train_processed,
    y_train,
    test_size=0.9,          # keep 10% for fast training
    stratify=y_bins,
    random_state=42
)


In [24]:
X_train_sampled.shape

(10318, 9470)

## Model Training

In [26]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [27]:
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # RBF kernel is default
svr.fit(X_train_sampled, y_train_sampled)


In [30]:
y_pred = svr.predict(X_test_processed)

In [31]:

mae = mean_squared_error(y_test, y_pred)
print("MAE: ", mae)

mse = mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(y_test, y_pred)
print("R2:", r2)


MAE:  62239.51384761582
MSE:  62239.51384761582
RMSE: 249.47848373680608
R2: 0.17801281835999305


In [32]:
y_pred = svr.predict(X_test_processed)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("R2:", r2)
print("RMSE:", rmse)


R2: 0.17801281835999305
RMSE: 249.47848373680608


High-dimensional, sparse OHE features -> SVR struggles

SVR works better on smaller, dense, numeric datasets

For this data, Ridge/Lasso or tree-based models are usually better.