## ***Random Forest Classifier***

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("C:\\Users\\default.DESKTOP-GAN0M7C\\my_projects\\data\\amazon data\\Amazon Sale Report.csv")

  data = pd.read_csv("C:\\Users\\default.DESKTOP-GAN0M7C\\my_projects\\data\\amazon data\\Amazon Sale Report.csv")


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128975 entries, 0 to 128974
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   index               128975 non-null  int64  
 1   Order ID            128975 non-null  object 
 2   Date                128975 non-null  object 
 3   Status              128975 non-null  object 
 4   Fulfilment          128975 non-null  object 
 5   Sales Channel       128975 non-null  object 
 6   ship-service-level  128975 non-null  object 
 7   Style               128975 non-null  object 
 8   SKU                 128975 non-null  object 
 9   Category            128975 non-null  object 
 10  Size                128975 non-null  object 
 11  ASIN                128975 non-null  object 
 12  Courier Status      122103 non-null  object 
 13  Qty                 128975 non-null  int64  
 14  currency            121180 non-null  object 
 15  Amount              121180 non-nul

### Data Cleaning & Transformations

In [4]:
def preprocess_orders(data: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the orders dataset:
    - Drop irrelevant or high-cardinality columns
    - Rename columns if needed
    - Extract useful features from 'Date'
    - Create derived features like day, month, weekday, weekend flag
    """
    
    # 1. Drop unwanted columns if they exist
    DROP_COLS = [
        "index",
        "Order ID",           # pure identifier
        "SKU",                # high cardinality ID
        "promotion-ids",      # sparse / leakage-prone
        "Unnamed: 22",
        "ASIN",
        "fulfilled-by",
        "ship-postal-code",
        "currency",
        "Size"
    ]
    
    data = data.drop(columns=[c for c in DROP_COLS if c in data.columns])
    
    # 2. Rename columns if necessary
    if 'Sales Channel ' in data.columns:
        data.rename(columns={'Sales Channel ':'Sales Channel'}, inplace=True)
    
    # 3. Convert 'Date' to datetime safely
    if 'Date' in data.columns:
        data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")
        
        # Extract date-related features
        data["order_day"] = data["Date"].dt.day_name()
        data["order_month"] = data["Date"].dt.month_name()
        data["order_weekday"] = data["Date"].dt.weekday
        data["is_weekend"] = data["order_weekday"].isin([5,6]).astype(int)
        
        # Drop original Date column
        data = data.drop(columns=["Date"])
    
    return data


In [5]:
data = preprocess_orders(data)
print(data.head())

                         Status Fulfilment Sales Channel ship-service-level  \
0                     Cancelled   Merchant     Amazon.in           Standard   
1  Shipped - Delivered to Buyer   Merchant     Amazon.in           Standard   
2                       Shipped     Amazon     Amazon.in          Expedited   
3                     Cancelled   Merchant     Amazon.in           Standard   
4                       Shipped     Amazon     Amazon.in          Expedited   

     Style       Category Courier Status  Qty  Amount    ship-city  \
0   SET389            Set            NaN    0  647.62       MUMBAI   
1  JNE3781          kurta        Shipped    1  406.00    BENGALURU   
2  JNE3371          kurta        Shipped    1  329.00  NAVI MUMBAI   
3    J0341  Western Dress            NaN    0  753.33   PUDUCHERRY   
4  JNE3671            Top        Shipped    1  574.00      CHENNAI   

    ship-state ship-country    B2B order_day order_month  order_weekday  \
0  MAHARASHTRA           IN  

  data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")


In [6]:
data.Amount.max()

5584.0

In [7]:
data.Amount.min()

0.0

In [8]:
# create 3 classes using quantiles (balanced classes)
data["sales_class"] = pd.qcut(
    data["Amount"],
    q=3,
    labels=["Low", "Medium", "High"]
)

# check distribution
data["sales_class"].value_counts()


sales_class
Medium    41555
Low       40498
High      39127
Name: count, dtype: int64

Quantiles gives -> balanced classes, which avoids the avoids class imbalance

Standard practice for turning regression -> classification

### Feature Engineering

In [9]:
# seperating numerical and categorical columns
numerical_cols = data.select_dtypes(include="number").columns
print("numerical_cols:\n", numerical_cols)
print()

categorical_cols = data.select_dtypes(exclude="number").columns
print("catgeorical_cols:\n", categorical_cols)

numerical_cols:
 Index(['Qty', 'Amount', 'order_weekday', 'is_weekend'], dtype='object')

catgeorical_cols:
 Index(['Status', 'Fulfilment', 'Sales Channel', 'ship-service-level', 'Style',
       'Category', 'Courier Status', 'ship-city', 'ship-state', 'ship-country',
       'B2B', 'order_day', 'order_month', 'sales_class'],
      dtype='object')


In [10]:
# drop Amount, as it is directly influencing the target
numerical_cols = numerical_cols.drop(['Amount'])
numerical_cols

Index(['Qty', 'order_weekday', 'is_weekend'], dtype='object')

In [11]:
# dropping the target variable
categorical_cols = categorical_cols.drop(['sales_class'])
categorical_cols

Index(['Status', 'Fulfilment', 'Sales Channel', 'ship-service-level', 'Style',
       'Category', 'Courier Status', 'ship-city', 'ship-state', 'ship-country',
       'B2B', 'order_day', 'order_month'],
      dtype='object')

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from category_encoders.count import CountEncoder

def build_tree_preprocessor(num_cols, cat_cols):
    """
    Preprocessor for:
    - DecisionTree
    - RandomForest
    - AdaBoost
    - GradientBoosting
    """
    
    num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median"))
    ])
    
    cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", CountEncoder(normalize=True))
    ])
    
    return ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols)
    ])


In [13]:
preprocessor = build_tree_preprocessor(numerical_cols, categorical_cols)
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,min_group_size,
,combine_min_nan_groups,True
,min_group_name,
,normalize,True


In [14]:
TARGET = "sales_class"

# drop rows with missing target (best practice)
data = data.dropna(subset=[TARGET])

# seperating the input and target variables
X = data.drop(columns=[TARGET, 'Amount'])
y = data[TARGET]


In [15]:
X.shape

(121180, 16)

In [16]:
X.isna().sum()

Status                   0
Fulfilment               0
Sales Channel            0
ship-service-level       0
Style                    0
Category                 0
Courier Status        5136
Qty                      0
ship-city               31
ship-state              31
ship-country            31
B2B                      0
order_day                0
order_month              0
order_weekday            0
is_weekend               0
dtype: int64

In [17]:
y.shape

(121180,)

In [18]:
# as all null values in target are dropped
y.isna().sum()


np.int64(0)

In [19]:
# converting the target labels into the real numbers 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_enc = le.fit_transform(y)


In [20]:
# splitting data into training and testing data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=42
)



In [21]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Train shape:", X_train_processed.shape)
print("Test shape:", X_test_processed.shape)


Train shape: (96944, 16)
Test shape: (24236, 16)


In [22]:
# sampling -> optional

X_train_s, _, y_train_s, _ = train_test_split(
    X_train_processed, y_train,
    test_size=0.75,
    stratify=y_train,
    random_state=42
)



In [23]:
X_train.shape

(96944, 16)

In [24]:
np.isnan(X_train_processed).sum()


np.int64(0)

### Model Training 

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Build Random Forest classifier
rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1
)

# 2. Train on sampled processed data (fast prototyping)
rf_clf.fit(X_train_processed, y_train)

# 3. Predict on test set
y_pred = rf_clf.predict(X_test_processed)

# 4. Evaluate
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[6353  416 1056]
 [ 379 6740  981]
 [1659 1598 5054]]

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.81      0.78      7825
           1       0.77      0.83      0.80      8100
           2       0.71      0.61      0.66      8311

    accuracy                           0.75     24236
   macro avg       0.75      0.75      0.75     24236
weighted avg       0.75      0.75      0.75     24236

