In [5]:
#%pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb

In [11]:
df = pd.read_csv('cleaned_sales_data.csv', parse_dates=['date'])
df.head()


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,is_holiday,month,dayofweek,year
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14,1,1,1,2013
1,1,2013-01-01,1,BABY CARE,0.0,0,93.14,1,1,1,2013
2,2,2013-01-01,1,BEAUTY,0.0,0,93.14,1,1,1,2013
3,3,2013-01-01,1,BEVERAGES,0.0,0,93.14,1,1,1,2013
4,4,2013-01-01,1,BOOKS,0.0,0,93.14,1,1,1,2013


In [13]:
#Feature Engineering

#Make date more specific by seperating 
df['day'] = df['date'].dt.day
df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
df['quarter'] = df['date'].dt.quarter

#sort store by family an data
df = df.sort_values(['store_nbr', 'family', 'date'])

#lag features: explains past
df['lag_7'] = df.groupby(['store_nbr', 'family'])['sales'].shift(7)
df['lag_30'] = df.groupby(['store_nbr', 'family'])['sales'].shift(30)

#rolling window statistics:
df['rolling_mean_7'] = df.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(7).mean() #average of sales 7 days
df['rolling_std_7'] = df.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(7).std() #variation of sales 7 days

#Encode categorical variables
from sklearn.preprocessing import LabelEncoder
for col in ['store_nbr', 'family']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# 6. Fill NaNs created by lag/rolling features
df = df.fillna(0)

print("feature engineering finished")
print(df.columns.tolist())


feature engineering finished
['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'dcoilwtico', 'is_holiday', 'month', 'dayofweek', 'year', 'day', 'weekofyear', 'quarter', 'lag_7', 'lag_30', 'rolling_mean_7', 'rolling_std_7']


In [15]:
#split data into before 2017(train) and after(test)
cutoff_date = '2017-01-01'

train_df = df[df['date'] < cutoff_date]
test_df = df[df['date'] >= cutoff_date]

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")


Train shape: (2642706, 18)
Test shape: (411642, 18)


In [19]:
#choose features and targets

target = 'sales'
features = [
    'store_nbr', 'family', 'onpromotion', 'dcoilwtico',
    'is_holiday', 'month', 'dayofweek', 'year',
    'lag_7', 'lag_30', 'rolling_mean_7', 'rolling_std_7'
]

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

In [21]:
# linear regression baseline model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import numpy as np

lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions
y_pred = lr.predict(X_test)

# Clip to avoid negative sales
y_pred = np.clip(y_pred, 0, None)

# Evaluate
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSLE: {rmsle:.4f}")
print(f"RMSE: {rmse:.2f}")

RMSLE: 1.3241
RMSE: 386.33
