In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import joblib


In [2]:
df = pd.read_csv("engineered_retail_data.csv")
df.head()


Unnamed: 0,Order_ID,Order_Date,Quantity,Price,Discount,Sales,Store_S2,Store_S3,Category_Electronics,Category_Furniture
0,1,2023-01-01,2,20000,0.1,36000,0,0,1,0
1,2,2023-01-02,3,1500,0.05,4275,1,0,0,0
2,3,2023-01-03,1,12000,0.15,10200,0,1,0,1
3,4,2023-01-04,4,1800,0.0,7200,0,0,0,0
4,5,2023-01-05,2,22000,0.2,35200,1,0,1,0


In [3]:
X = df.drop(columns=["Sales", "Order_Date"])
y = df["Sales"]


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [5]:
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse


In [6]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Decision Tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}


In [7]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    mae, rmse = evaluate(y_test, preds)
    results.append([name, mae, rmse])


In [8]:
results_df = pd.DataFrame(
    results, columns=["Model", "MAE", "RMSE"]
).sort_values("RMSE")

results_df


Unnamed: 0,Model,MAE,RMSE
3,Decision Tree,0.0,0.0
0,Linear Regression,1.250555e-09,1.38598e-09
5,Gradient Boosting,0.4107056,0.4125153
2,Lasso Regression,7.291482,7.328536
1,Ridge Regression,1619.644,1658.541
4,Random Forest,1405.875,1675.709


In [9]:
rf = models["Random Forest"]

train_rmse = np.sqrt(mean_squared_error(y_train, rf.predict(X_train)))
test_rmse = np.sqrt(mean_squared_error(y_test, rf.predict(X_test)))

print("Train RMSE:", train_rmse)
print("Test RMSE :", test_rmse)


Train RMSE: 1606.5055803474509
Test RMSE : 1675.7089637672766


In [10]:
feature_importance = pd.Series(
    rf.feature_importances_, index=X.columns
).sort_values(ascending=False)

feature_importance


Price                   0.494128
Category_Electronics    0.363582
Discount                0.062476
Quantity                0.030987
Store_S3                0.022644
Order_ID                0.020647
Category_Furniture      0.003640
Store_S2                0.001897
dtype: float64

In [12]:
os.makedirs("../models", exist_ok=True)

joblib.dump(rf, "../models/best_sales_model.pkl")

print("✅ Model saved successfully")


✅ Model saved successfully
