In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score,root_mean_squared_error
from joblib import dump
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

pd.set_option('display.max_columns', None)

## Load the Data

In [2]:
df_train=pd.read_csv('../../data/train_cleaned_data.csv')
df_test=pd.read_csv('../../data/test_cleaned_data.csv')
df_train.head(5)

Unnamed: 0,Day_Count,Day,StoreID,ProductName,specials,Amt,Inflation_Percentage,Unemployment_Percentage
0,1,Tuesday,S001,Veg Burger,0,13125.0,4.7,1.7
1,1,Tuesday,S001,Fries,0,11250.0,4.7,1.7
2,1,Tuesday,S001,Coca-Cola,0,17680.0,4.7,1.7
3,1,Tuesday,S002,Cheese Burger,0,7800.0,4.7,1.7
4,1,Tuesday,S002,Fries,0,16275.0,4.7,1.7


### Group the Data According to Day_Cout and ProductName

Checking a theory as I want to sum the special in the grouped data, but summing specials means how many stores are offering Specials, not how many specials are sold

In [5]:
theory_check=df_train.groupby(['Day_Count', 'ProductName','StoreID']).agg({
    'specials': 'sum',
    'Amt': 'sum'
})
theory_check[theory_check['specials']>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,specials,Amt
Day_Count,ProductName,StoreID,Unnamed: 3_level_1,Unnamed: 4_level_1


In [52]:

def aggregate_date(df):
    product_daily_sales = df.groupby(['Day_Count', 'ProductName']).agg({
        'Day': 'first',
        'specials': 'sum',  
        'Inflation_Percentage': 'first',
        'Unemployment_Percentage': 'first',
        'Amt': 'sum' 
    }).reset_index()

    
    product_daily_sales['Weekend'] = product_daily_sales['Day'].isin(['Friday','Saturday', 'Sunday']).astype(int)
    
    # product_daily_sales = pd.get_dummies(product_daily_sales, columns=['ProductName', 'Day'])
    return product_daily_sales

train_product_daily_sales=aggregate_date(df_train)
test_product_daily_sales=aggregate_date(df_test)
train_product_daily_sales.head(10)


Unnamed: 0,Day_Count,ProductName,Day,specials,Inflation_Percentage,Unemployment_Percentage,Amt,Weekend
0,1,Cheese Burger,Tuesday,0,4.7,1.7,28600.0,0
1,1,Chicken Burger,Tuesday,0,4.7,1.7,35530.0,0
2,1,Coca-Cola,Tuesday,0,4.7,1.7,69785.0,0
3,1,Fries,Tuesday,0,4.7,1.7,41850.0,0
4,1,Veg Burger,Tuesday,0,4.7,1.7,65175.15,0
5,2,Cheese Burger,Wednesday,0,4.7,1.7,21200.0,0
6,2,Chicken Burger,Wednesday,0,4.7,1.7,24990.0,0
7,2,Coca-Cola,Wednesday,0,4.7,1.7,49215.0,0
8,2,Custom,Wednesday,0,4.7,1.7,0.0,0
9,2,Fries,Wednesday,0,4.7,1.7,30600.0,0


#### Creating ColumnTransformer with OneHotEncoder to create a preprocessor to encode Categorical Data

In [53]:
encodable_cols = ['ProductName', 'Day']

preprocessor=ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), encodable_cols)
    ],
    remainder='passthrough'
)

train_product_daily_sales=preprocessor.fit_transform(train_product_daily_sales)
test_product_daily_sales=preprocessor.transform(test_product_daily_sales)

encoded_col_names = preprocessor.named_transformers_['cat'].get_feature_names_out(encodable_cols)

final_cols=list(encoded_col_names)+['Day_Count','specials', 'Inflation_Percentage', 'Unemployment_Percentage','Amt', 'Weekend']

train_product_daily_sales=pd.DataFrame(train_product_daily_sales, columns=final_cols)
test_product_daily_sales=pd.DataFrame(test_product_daily_sales, columns=final_cols)

train_product_daily_sales.head(10)

Unnamed: 0,ProductName_Cheese Burger,ProductName_Chicken Burger,ProductName_Coca-Cola,ProductName_Custom,ProductName_Falafel Burger,ProductName_Fries,ProductName_Veg Burger,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,Day_Count,specials,Inflation_Percentage,Unemployment_Percentage,Amt,Weekend
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.7,1.7,28600.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.7,1.7,35530.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.7,1.7,69785.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.7,1.7,41850.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.7,1.7,65175.15,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.7,1.7,21200.0,0.0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.7,1.7,24990.0,0.0
7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.7,1.7,49215.0,0.0
8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.7,1.7,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.7,1.7,30600.0,0.0


In [54]:
# train_product_daily_sales['specials'].unique()

In [55]:
# train_product_daily_sales['Store_Count'].unique()

In [56]:
# train_product_daily_sales[train_product_daily_sales['Store_Count']>3]

In [57]:
# avg_product_sales = df.groupby('ProductName')['Amt'].mean().to_dict()
# product_daily_sales['avg_product_price'] = product_daily_sales['ProductName'].map(avg_product_sales)

### Splitting the features and target

In [58]:
feature_cols = [col for col in train_product_daily_sales.columns if col not in ['Day_Count', 'Amt']]
x_train = train_product_daily_sales[feature_cols]
y_train = train_product_daily_sales['Amt']

x_test = test_product_daily_sales[feature_cols]
y_test = test_product_daily_sales['Amt']


### Model Training

In [59]:

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42
)

rf_model.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Model Evaluation

In [60]:
y_pred = rf_model.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Model Performance:")
print(f"MAE: ${mae:.2f}")
print(f"R2 Score: {r2:.3f}")

Model Performance:
MAE: $1628.87
R2 Score: 0.990


In [61]:
# model would have learned that zero sales are coming from custom category of productname
custom_mask = y_test == 0  
non_zero_mae = mean_absolute_error(y_test[~custom_mask], y_pred[~custom_mask])
non_zero_r2 = r2_score(y_test[~custom_mask], y_pred[~custom_mask])

print(f"Performance excluding zero sales:") 
print(f"MAE: ${non_zero_mae:,.2f}")
print(f"R2: {non_zero_r2:.3f}")
print(f"Zero sales: {custom_mask.sum()} out of {len(y_test)}")

Performance excluding zero sales:
MAE: $1,776.55
R2: 0.985
Zero sales: 103 out of 1239


In [62]:
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: ${rmse:,.2f}")
print(f"RMSE/MAE ratio: {rmse/1788.93:.2f}") 

RMSE: $2,306.20
RMSE/MAE ratio: 1.29


The ratio is under 1.5 so it is acceptable.

## Saving the Model and Preprocessor

In [63]:
dump(rf_model, 'rf_model.pkl')

['rf_model.pkl']

In [64]:
dump(preprocessor, 'preprocessor.pkl')

['preprocessor.pkl']