In [6]:
import pandas as pd
import numpy as np

# Visualization (optional but useful)
import matplotlib.pyplot as plt
import seaborn as sns

# Model selection & evaluation
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Baseline & ML models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Pipeline & preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Model saving
import joblib


In [7]:
# Load engineered dataset
df = pd.read_csv("engineered_retail_data.csv")

# Quick check
df.head()


Unnamed: 0,Order_ID,Order_Date,Quantity,Price,Discount,Sales,Store_ID_S2,Store_ID_S3,Product_Category_Electronics,Product_Category_Furniture
0,1,2023-01-01,2,20000,0.1,36000,0,0,1,0
1,2,2023-01-02,3,1500,0.05,4275,1,0,0,0
2,3,2023-01-03,1,12000,0.15,10200,0,1,0,1
3,4,2023-01-04,4,1800,0.0,7200,0,0,0,0
4,5,2023-01-05,2,22000,0.2,35200,1,0,1,0


In [8]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Order_ID                      10 non-null     int64  
 1   Order_Date                    10 non-null     object 
 2   Quantity                      10 non-null     int64  
 3   Price                         10 non-null     int64  
 4   Discount                      10 non-null     float64
 5   Sales                         10 non-null     int64  
 6   Store_ID_S2                   10 non-null     int64  
 7   Store_ID_S3                   10 non-null     int64  
 8   Product_Category_Electronics  10 non-null     int64  
 9   Product_Category_Furniture    10 non-null     int64  
dtypes: float64(1), int64(8), object(1)
memory usage: 932.0+ bytes


In [9]:
X = df.drop(columns=["Sales", "Order_Date"])
y = df["Sales"]


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((8, 8), (2, 8))

In [11]:
baseline = DummyRegressor(strategy="mean")
baseline.fit(X_train, y_train)

baseline_preds = baseline.predict(X_test)

baseline_mae = mean_absolute_error(y_test, baseline_preds)
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_preds))
baseline_mape = np.mean(np.abs((y_test - baseline_preds) / y_test)) * 100

print("Baseline MAE :", baseline_mae)
print("Baseline RMSE:", baseline_rmse)
print("Baseline MAPE:", baseline_mape)


Baseline MAE : 16046.875
Baseline RMSE: 16113.382745892464
Baseline MAPE: 306.06839364035085


In [12]:
lr = LinearRegression()
lr.fit(X_train, y_train)

lr_preds = lr.predict(X_test)

lr_mae = mean_absolute_error(y_test, lr_preds)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))
lr_mape = np.mean(np.abs((y_test - lr_preds) / y_test)) * 100

print("Linear Regression MAE :", lr_mae)
print("Linear Regression RMSE:", lr_rmse)
print("Linear Regression MAPE:", lr_mape)


Linear Regression MAE : 1.1514202924445271e-09
Linear Regression RMSE: 1.1552988684138897e-09
Linear Regression MAPE: 2.1013449589866186e-11


In [16]:
joblib.dump(lr, "final_sales_model.pkl")
print("✅ Model saved successfully")


✅ Model saved successfully
