In [None]:
#import required libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

import warnings
# Suppress every warning for the rest of the session. Note that this blanket
# filter also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
#Load the training data from the working directory
df = pd.read_csv('train.csv')

#Basic Inspection: first rows, last rows, then summary statistics
display(df.head(), df.tail())
df.describe()


In [None]:
#feature engineering
#split the "date" string on "-" and store the pieces as integer columns
parts = df["date"].str.split("-", n = 3, expand = True)
for position, column in enumerate(("year", "month", "day")):
    df[column] = parts[position].astype('int')
df.head()

In [None]:
#weekend or a weekday
from datetime import datetime

def weekend_or_weekday(year, month, day):
    """Return 1 when the given calendar date is a Saturday or Sunday, else 0."""
    # datetime.weekday() numbers Monday as 0, so 5 and 6 are the weekend.
    day_index = datetime(year, month, day).weekday()
    return int(day_index >= 5)

# Iterate over the three date-part columns directly instead of using a
# row-wise df.apply(axis=1): that builds a Series for every row (slow on a
# large frame) and upcasts the integers to float whenever the frame has no
# non-numeric column, which makes datetime() raise.
df['weekend'] = [
    weekend_or_weekday(year, month, day)
    for year, month, day in zip(df['year'], df['month'], df['day'])
]
df.head()

In [None]:
#holiday or not
from datetime import date
import holidays

#calendar object used to look up each date string
india_holidays = holidays.country_holidays('IN')

#1 when the lookup returns a truthy value for the date, otherwise 0
holiday_flags = df['date'].map(lambda day_str: int(bool(india_holidays.get(day_str))))
df['holidays'] = holiday_flags
df.head()

In [None]:
#cyclical features
#map the month number onto an angle of a 12-step circle, then keep sin and cos
month_angle = df['month'] * (2 * np.pi / 12)
df['m1'] = np.sin(month_angle)
df['m2'] = np.cos(month_angle)
df.head()

In [None]:
#which day of the week it is
def which_day(year, month, day):
    """Return the day of the week for the given date, with Monday as 0 and Sunday as 6."""
    # isoweekday() counts Monday as 1, so shift it down to the 0-based form.
    return datetime(year, month, day).isoweekday() - 1

# Iterate over the three date-part columns directly instead of using a
# row-wise df.apply(axis=1): that builds a Series for every row (slow on a
# large frame) and upcasts the integers to float whenever the frame has no
# non-numeric column, which makes datetime() raise.
df['weekday'] = [
    which_day(year, month, day)
    for year, month, day in zip(df['year'], df['month'], df['day'])
]
df.head()

In [8]:
#the raw date string is no longer needed once its parts have been extracted
df = df.drop(columns=['date'])

In [None]:
#count the distinct stores and the distinct items
store_count = df['store'].nunique()
item_count = df['item'].nunique()
store_count, item_count #10 unique stores and they sell 50 different products.

In [None]:
#Line plot for the average count of stock required on the respective days of the month
plt.figure(figsize=(10,5))
# Select 'sales' before aggregating: averaging every column only to keep one is
# wasted work, and it fails if the frame still holds a non-numeric column.
df.groupby('day')['sales'].mean().plot()
plt.ylabel('mean sales')
plt.show()

In [None]:
#Distribution plot and Box plot for the target column
# Create both axes in a single call; calling plt.subplots() and then
# plt.subplot() leaves an extra full-figure axes underneath the two panels.
fig, (dist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

# histplot with a KDE overlay on a density scale replaces the deprecated
# distplot; bins=50 mirrors the cap distplot applied to its automatic binning.
sb.histplot(df['sales'], kde=True, stat='density', bins=50, ax=dist_ax)
dist_ax.set_title('Distribution of sales')

# Pass the series as x= explicitly: the meaning of the first positional
# argument of boxplot differs between seaborn versions.
sb.boxplot(x=df['sales'], ax=box_ax)
box_ax.set_title('Box plot of sales')

plt.show()

In [None]:
#Heatmap to detect the highly correlated features
#boolean matrix: True where the pairwise correlation is above 0.8
high_corr = df.corr().gt(0.8)

plt.figure(figsize=(10, 10))
sb.heatmap(high_corr, annot=True, cbar=False)
plt.show()

In [13]:
#remove outliers: keep only the rows whose sales value is below 140
below_cutoff = df['sales'].lt(140)
df = df[below_cutoff]

In [None]:
# 1. Feature–Target Split
# 'sales' is the value to predict; 'year' is also left out of the inputs.
features = df.drop(columns=['sales', 'year'])
target = df['sales'].to_numpy()

# 2. Train / Validation / Test Split (70/15/15)

# Step one: hold back 30% of the rows, leaving 70% for training
X_train, X_temp, Y_train, Y_temp = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=22,
)

# Step two: cut the held-back rows in half -> 15% validation, 15% test
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp,
    Y_temp,
    test_size=0.50,
    random_state=22,
)

for label, split in (("Train:", X_train), ("Validation:", X_val), ("Test:", X_test)):
    print(label, split.shape)

# ----------------------------------------
# 3. Scaling (fit on train only!)
# ----------------------------------------
# The scaler learns its statistics from the training rows and is then
# applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)



In [15]:
def _print_scores(header, y_true, y_pred):
    """Print MAE, RMSE and R² for one data split and return them.

    Parameters
    ----------
    header : str
        Line printed before the three scores (e.g. "Train:").
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predictions for the same rows.

    Returns
    -------
    tuple
        (mae, rmse, r2) in that order.
    """
    mae = MAE(y_true, y_pred)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and rejected by newer scikit-learn.
    rmse = np.sqrt(MSE(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(header)
    print("  MAE :", mae)
    print("  RMSE:", rmse)
    print("  R²  :", r2)
    return mae, rmse, r2


models = [
    LinearRegression(),
    XGBRegressor(),
    Lasso(),
    Ridge()
]

model_names = ["Linear Regression", "XGBoost", "Lasso", "Ridge"]

for model, name in zip(models, model_names):

    print(f"\n========== {name} ==========")

    # Train the model
    model.fit(X_train, Y_train)

    # ---- TRAIN ----
    train_preds = model.predict(X_train)
    train_mae, train_rmse, train_r2 = _print_scores("Train:", Y_train, train_preds)

    # ---- VALIDATION ----
    val_preds = model.predict(X_val)
    val_mae, val_rmse, val_r2 = _print_scores("\nValidation:", Y_val, val_preds)

    # ---- TEST ----
    test_preds = model.predict(X_test)
    test_mae, test_rmse, test_r2 = _print_scores("\nTest:", Y_test, test_preds)



Train:
  MAE : 20.898077939319
  RMSE: 25.63071858951156
  R²  : 0.13941780100688606

Validation:
  MAE : 20.905011322015643
  RMSE: 25.626009321317227
  R²  : 0.13578008023421506

Test:
  MAE : 20.95223371158272
  RMSE: 25.680274592601567
  R²  : 0.13654176756470116

Train:
  MAE : 6.905643045329074
  RMSE: 9.012696560168344
  R²  : 0.8935905268916036

Validation:
  MAE : 6.943699867482314
  RMSE: 9.064515927901594
  R²  : 0.8918686697895872

Test:
  MAE : 6.955353434240584
  RMSE: 9.075479760714744
  R²  : 0.8921596736948244

Train:
  MAE : 21.01214278563312
  RMSE: 25.719824232862678
  R²  : 0.1334237416004559

Validation:
  MAE : 21.001632401573218
  RMSE: 25.704175852489758
  R²  : 0.13049981202074656

Test:
  MAE : 21.053733216474427
  RMSE: 25.763214727104728
  R²  : 0.13095530154141033

Train:
  MAE : 20.89807813604901
  RMSE: 25.630718589520576
  R²  : 0.13941780100628054

Validation:
  MAE : 20.905011428352932
  RMSE: 25.62600927415123
  R²  : 0.13578008341549797

Test:
  MA