In [None]:
#import required libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above, so API removals will surface as hard errors without notice.
warnings.filterwarnings('ignore')

In [23]:
# Load the training data from the working directory
df = pd.read_csv('train.csv')

# Basic inspection: first rows and last rows, followed by summary statistics
display(df.head(), df.tail())
df.describe()


Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


Unnamed: 0,date,store,item,sales
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62
912999,2017-12-31,10,50,82


Unnamed: 0,store,item,sales
count,913000.0,913000.0,913000.0
mean,5.5,25.5,52.250287
std,2.872283,14.430878,28.801144
min,1.0,1.0,0.0
25%,3.0,13.0,30.0
50%,5.5,25.5,47.0
75%,8.0,38.0,70.0
max,10.0,50.0,231.0


In [24]:
# Feature engineering: break the 'YYYY-MM-DD' date string into integer columns
parts = df["date"].str.split("-", n=3, expand=True)
for position, column in enumerate(("year", "month", "day")):
    # piece 0 -> year, piece 1 -> month, piece 2 -> day
    df[column] = parts[position].astype('int')
df.head()

Unnamed: 0,date,store,item,sales,year,month,day
0,2013-01-01,1,1,13,2013,1,1
1,2013-01-02,1,1,11,2013,1,2
2,2013-01-03,1,1,14,2013,1,3
3,2013-01-04,1,1,13,2013,1,4
4,2013-01-05,1,1,10,2013,1,5


In [26]:
#weekend or a weekday
from datetime import datetime

def weekend_or_weekday(year, month, day):
    """Return 1 if the given calendar date is a Saturday or Sunday, else 0."""
    # datetime.weekday() numbers Monday as 0, so Saturday/Sunday are 5 and 6.
    is_weekend = datetime(year, month, day).weekday() >= 5
    return int(is_weekend)

# Build the weekend flag in one vectorized pass instead of a Python-level
# call per row: pd.to_datetime assembles timestamps from the year/month/day
# columns, and .dt.weekday numbers Monday as 0, so values above 4 are
# Saturday/Sunday.
df['weekend'] = (
    pd.to_datetime(df[['year', 'month', 'day']]).dt.weekday > 4
).astype('int64')
df.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekend
0,2013-01-01,1,1,13,2013,1,1,0
1,2013-01-02,1,1,11,2013,1,2,0
2,2013-01-03,1,1,14,2013,1,3,0
3,2013-01-04,1,1,13,2013,1,4,0
4,2013-01-05,1,1,10,2013,1,5,1


In [27]:
#holiday or not
from datetime import date
import holidays

india_holidays = holidays.country_holidays('IN')

# Look each distinct date up once and broadcast the 0/1 flag back to the rows,
# rather than querying the holiday calendar separately for every row.
holiday_flag_by_date = {
    day_string: 1 if india_holidays.get(day_string) else 0
    for day_string in df['date'].unique()
}
df['holidays'] = df['date'].map(holiday_flag_by_date)
df.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekend,holidays
0,2013-01-01,1,1,13,2013,1,1,0,0
1,2013-01-02,1,1,11,2013,1,2,0,0
2,2013-01-03,1,1,14,2013,1,3,0,0
3,2013-01-04,1,1,13,2013,1,4,0,0
4,2013-01-05,1,1,10,2013,1,5,1,0


In [6]:
# Cyclical features: place the month on a 12-step circle and keep its
# sine (m1) and cosine (m2) coordinates
month_angle = df['month'] * (2 * np.pi / 12)
df['m1'] = np.sin(month_angle)
df['m2'] = np.cos(month_angle)
df.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekend,holidays,m1,m2
0,2013-01-01,1,1,13,2013,1,1,0,0,0.5,0.866025
1,2013-01-02,1,1,11,2013,1,2,0,0,0.5,0.866025
2,2013-01-03,1,1,14,2013,1,3,0,0,0.5,0.866025
3,2013-01-04,1,1,13,2013,1,4,0,0,0.5,0.866025
4,2013-01-05,1,1,10,2013,1,5,1,0,0.5,0.866025


In [28]:
#which day of the week it is
def which_day(year, month, day):
    """Return the weekday index of the given calendar date (Monday=0 ... Sunday=6)."""
    # isoweekday() counts Monday as 1, so shift down by one for a 0-based index.
    return datetime(year, month, day).isoweekday() - 1

# Compute the day-of-week index (Monday=0 ... Sunday=6) for all rows at once
# instead of one Python-level call per row: pd.to_datetime assembles
# timestamps from the year/month/day columns.
df['weekday'] = (
    pd.to_datetime(df[['year', 'month', 'day']]).dt.weekday.astype('int64')
)
df.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekend,holidays,weekday
0,2013-01-01,1,1,13,2013,1,1,0,0,1
1,2013-01-02,1,1,11,2013,1,2,0,0,2
2,2013-01-03,1,1,14,2013,1,3,0,0,3
3,2013-01-04,1,1,13,2013,1,4,0,0,4
4,2013-01-05,1,1,10,2013,1,5,1,0,5


In [8]:
# Columns which are not useful: drop the raw date string.
# Re-binding (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run after the column is already gone.
df = df.drop(columns=['date'], errors='ignore')

In [9]:
# Check the unique values: 10 unique stores and they sell 50 different products.
tuple(df[column].nunique() for column in ('store', 'item'))

(10, 50)

In [None]:
# Line plot for the average count of stock required on the respective days of the month
fig, ax = plt.subplots(figsize=(10, 5))
# Select 'sales' before aggregating: only the mean that is plotted gets
# computed, and non-numeric columns can no longer break the aggregation.
df.groupby('day')['sales'].mean().plot(ax=ax)
ax.set(title='Average sales by day of month',
       xlabel='day of month',
       ylabel='mean sales')
plt.show()

In [None]:
# Distribution plot and Box plot for the target column
# Create both panels up front; mixing plt.subplots() with plt.subplot() can
# leave an extra full-figure axes underneath the two panels.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

# histplot with a KDE overlay on a density scale replaces the deprecated distplot.
sb.histplot(df['sales'], kde=True, stat='density', ax=ax_dist)
ax_dist.set_title('Sales distribution')

# Pass the series as x explicitly so the orientation does not depend on how
# the installed seaborn version interprets a positional argument.
sb.boxplot(x=df['sales'], ax=ax_box)
ax_box.set_title('Sales box plot')
plt.show()

In [None]:
# Heatmap to detect the highly correlated features
CORR_THRESHOLD = 0.8  # absolute correlation above which a pair is flagged

plt.figure(figsize=(10, 10))
# Compare the absolute correlation so strongly negative relationships are
# flagged as well, not only strongly positive ones.
sb.heatmap(df.corr().abs() > CORR_THRESHOLD,
           annot=True,
           cbar=False)
plt.show()

In [13]:
# Remove outliers: keep only the rows whose sales are strictly below 140
below_cutoff = df['sales'].lt(140)
df = df.loc[below_cutoff]

In [None]:
# 1. Feature–Target Split
# 'sales' is the target; 'year' is left out of the model inputs as well.
target = df['sales'].to_numpy()
features = df.drop(columns=['sales', 'year'])

# 2. Train / Validation / Test Split (70/15/15)

# Stage one keeps 70% for training and sets 30% aside.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    features, target, test_size=0.30, random_state=22
)

# Stage two halves the held-out 30% into validation and test (15% each).
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp, test_size=0.50, random_state=22
)

for label, split in (("Train:", X_train), ("Validation:", X_val), ("Test:", X_test)):
    print(label, split.shape)

# ----------------------------------------
# 3. Scaling (fit on train only!)
# ----------------------------------------
# The scaler learns its statistics from the training split and the same
# transformation is then applied to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [None]:
def _score_split(model, X, y_true):
    """Return (MAE, RMSE, R²) for `model`'s predictions on one data split.

    Parameters: a fitted estimator exposing `predict`, the feature matrix of
    the split, and the matching true target values.
    """
    preds = model.predict(X)
    # RMSE is computed as sqrt(MSE) rather than MSE(..., squared=False): the
    # `squared` keyword was deprecated and later removed from scikit-learn,
    # whereas the square root works on every version.
    return MAE(y_true, preds), np.sqrt(MSE(y_true, preds)), r2_score(y_true, preds)


models = [
    LinearRegression(),
    XGBRegressor(),
    Lasso(),
    Ridge()
]

model_names = ["Linear Regression", "XGBoost", "Lasso", "Ridge"]

# Header text printed for each split, paired with that split's data.
# The leading newline separates the later sections exactly as before.
evaluation_splits = [
    ("Train:", X_train, Y_train),
    ("\nValidation:", X_val, Y_val),
    ("\nTest:", X_test, Y_test),
]

for model, name in zip(models, model_names):

    print(f"\n========== {name} ==========")

    # Train the model on the training split only
    model.fit(X_train, Y_train)

    # Report the same three metrics for train, validation and test
    for header, X_split, Y_split in evaluation_splits:
        mae, rmse, r2 = _score_split(model, X_split, Y_split)
        print(header)
        print("  MAE :", mae)
        print("  RMSE:", rmse)
        print("  R²  :", r2)