In [1]:
!pip install lightgbm
!pip install scikit-learn



In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import lightgbm as lgb

In [2]:
df = pd.read_csv('cleaned_sales_data.csv', parse_dates=['date'])
df.head(20)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,is_holiday,month,dayofweek,year
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14,1,1,1,2013
1,1,2013-01-01,1,BABY CARE,0.0,0,93.14,1,1,1,2013
2,2,2013-01-01,1,BEAUTY,0.0,0,93.14,1,1,1,2013
3,3,2013-01-01,1,BEVERAGES,0.0,0,93.14,1,1,1,2013
4,4,2013-01-01,1,BOOKS,0.0,0,93.14,1,1,1,2013
5,5,2013-01-01,1,BREAD/BAKERY,0.0,0,93.14,1,1,1,2013
6,6,2013-01-01,1,CELEBRATION,0.0,0,93.14,1,1,1,2013
7,7,2013-01-01,1,CLEANING,0.0,0,93.14,1,1,1,2013
8,8,2013-01-01,1,DAIRY,0.0,0,93.14,1,1,1,2013
9,9,2013-01-01,1,DELI,0.0,0,93.14,1,1,1,2013


In [12]:
#Feature Engineering

#Make date more specific by seperating (feature)
df['day'] = df['date'].dt.day
df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
df['quarter'] = df['date'].dt.quarter

#sort store by family an data (feature)
df = df.sort_values(['store_nbr', 'family', 'date'])

#lag features: explains past (feature)
df['lag_7'] = df.groupby(['store_nbr', 'family'])['sales'].shift(7)
df['lag_30'] = df.groupby(['store_nbr', 'family'])['sales'].shift(30)

#rolling window statistics: (feature)
df['rolling_mean_7'] = df.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(7).mean() #average of sales 7 days
df['rolling_std_7'] = df.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(7).std() #variation of sales 7 days

# is weekend: 1 if sat/sun, 0 if not weekend (feature)
df['is_weekend'] = df['dayofweek'] >= 5

# whether it is the start of the month or end of month:
df['is_month_start'] = df['date'].dt.is_month_start
df['is_month_end'] = df['date'].dt.is_month_end

#changing categorical variables to integers (edit)
from sklearn.preprocessing import LabelEncoder
for col in ['store_nbr', 'family']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

#fill NaNs (edit)
df = df.fillna(0)

print("feature engineering finished")
print(df.columns.tolist())

feature engineering finished
['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'dcoilwtico', 'is_holiday', 'month', 'dayofweek', 'year', 'day', 'weekofyear', 'quarter', 'lag_7', 'lag_30', 'rolling_mean_7', 'rolling_std_7', 'is_weekend', 'is_month_start', 'is_month_end']
0        True
1782    False
3564    False
5346    False
7128    False
Name: is_month_start, dtype: bool


In [14]:
#split data into before 2017(train) and after(test)
cutoff_date = '2017-01-01'

train_df = df[df['date'] < cutoff_date]
test_df = df[df['date'] >= cutoff_date]

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")


Train shape: (2642706, 21)
Test shape: (411642, 21)


In [15]:
#choose features and targets

target = 'sales'
features = [
    'store_nbr', 'family', 'onpromotion', 'dcoilwtico',
    'is_holiday', 'month', 'dayofweek', 'year',
    'lag_7', 'lag_30', 'rolling_mean_7', 'rolling_std_7',
    'is_weekend', 'is_month_start', 'is_month_end'
]

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

In [16]:
# linear regression baseline model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import numpy as np

lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions
y_pred = lr.predict(X_test)

# Avoid negative sales
y_pred = np.clip(y_pred, 0, None)

# Evaluate
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSLE: {rmsle:.4f}")
print(f"RMSE: {rmse:.2f}")

RMSLE: 1.6122
RMSE: 384.76


In [17]:
#Train LightGBM with same features

X_valid = test_df[features]
y_valid = test_df[target]

#Tell LightGBM which columns are categorical
categorical_feats = ['store_nbr', 'family']  

#build datasets
lgb_train = lgb.Dataset(X_train, label=y_train, feature_name=features, categorical_feature=categorical_feats)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, feature_name=features, categorical_feature=categorical_feats)

#parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbosity': -1
}

#train model
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    num_boost_round=2000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)

#calculating scores
pred = model.predict(X_valid, num_iteration=model.best_iteration)
pred = np.clip(pred, 0, None)#making sure sales are not negative

rmse  = np.sqrt(mean_squared_error(y_valid, pred))
rmsle = np.sqrt(mean_squared_log_error(y_valid, pred))
print(f"LightGBM RMSE : {rmse:.2f}")
print(f"LightGBM RMSLE: {rmsle:.4f}")


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 310.916
[200]	valid_0's rmse: 302.309
[300]	valid_0's rmse: 298.649
[400]	valid_0's rmse: 296.869
[500]	valid_0's rmse: 298.274
Early stopping, best iteration is:
[426]	valid_0's rmse: 296.413
LightGBM RMSE : 296.40
LightGBM RMSLE: 0.7040


In [None]:
#Add more, better features


In [None]:
#Train LightGBM with new features
