In [2]:
!pip install xgboost lightgbm



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Mount Google Drive at /content/drive so the CSV files under "My Drive" can be read.
# Requires the Colab runtime (google.colab); the call may prompt for authorisation.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Folder that holds every raw input table; change it here (once) when the data moves.
DATA_DIR = '/content/drive/My Drive/shared data'

# Raw inputs: sales rows, per-store/date features, store metadata and the customer table.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
features = pd.read_csv(f'{DATA_DIR}/features.csv')
stores = pd.read_csv(f'{DATA_DIR}/stores.csv')
customers_train = pd.read_csv(f'{DATA_DIR}/customer_train.csv')

In [6]:
# Parse 'Date' in both tables so the later merge on Date compares datetimes on each side.
for frame in (train, features):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [7]:
# Left-join the feature table onto every sales row; rows match on store, date and holiday flag.
train_merged = pd.merge(
    train,
    features,
    how='left',
    on=['Store', 'Date', 'IsHoliday'],
)

In [8]:
# Attach the store metadata to every row (left join on the store id).
train_merged = pd.merge(train_merged, stores, how='left', on='Store')

In [9]:
# Report the row counts of both tables before they are combined.
print(f"ðŸ”¹ Ø·ÙˆÙ„ train_merged: {len(train_merged)}")
print(f"ðŸ”¹ Ø·ÙˆÙ„ customer_train: {len(customers_train)}")

# Cut both tables to the shorter length and renumber their rows from 0 so that the
# column-wise concat below pairs row i of one table with row i of the other.
# NOTE(review): the pairing is purely positional (no key is compared) — confirm that
# customer_train.csv is stored in the same row order as the merged sales table.
min_len = min(len(train_merged), len(customers_train))
train_trimmed = train_merged.head(min_len).reset_index(drop=True)
customers_trimmed = customers_train.head(min_len).reset_index(drop=True)

train_full = pd.concat(objs=[train_trimmed, customers_trimmed], axis=1)

ðŸ”¹ Ø·ÙˆÙ„ train_merged: 421570
ðŸ”¹ Ø·ÙˆÙ„ customer_train: 421570


In [10]:
# train_full['Month'] = train_full['Date'].dt.strftime('%b %Y')
# train_full['Date'].dt.strftime('%b')

In [11]:
# Bucket Fuel_Price into 10 equal-width intervals; the result is a categorical column.
fuel_price = train_full['Fuel_Price']
train_full['Fuel_Category'] = pd.cut(fuel_price, bins=10)


In [12]:
# Flag rows whose five markdown amounts add up to a positive total
# (missing markdowns are skipped by the row-wise sum).
markdown_total = train_full[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']].sum(axis=1)
train_full['HasPromotion'] = markdown_total.gt(0)

In [13]:
# Drop exact duplicate rows before any statistics are computed.
train_full = train_full.drop_duplicates()

# A missing markdown is treated as "no markdown" (0).
markdown_cols = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
train_full[markdown_cols] = train_full[markdown_cols].fillna(0)

# Columns whose extreme values are clipped to the 1.5 * IQR fences.
# NOTE(review): this list includes the target (Weekly_Sales) and the fences are computed
# on the full table before any train/test split — confirm that both are intended.
cols = [
    'Weekly_Sales', 'Temperature', 'MarkDown1', 'MarkDown2', 'MarkDown3',
    'MarkDown4', 'MarkDown5', 'Unemployment',
    'Num_Customers', 'Avg_Spend_per_Customer', 'Loyalty_Avg'
]


for col in cols:
    q1 = train_full[col].quantile(0.25)
    q3 = train_full[col].quantile(0.75)
    iqr = q3 - q1

    # With a zero IQR (e.g. a markdown column that is mostly 0 after the fill above)
    # both fences equal the same value, so clipping would overwrite every other value
    # and flatten the column to a constant. Leave such columns untouched.
    if iqr == 0:
        print(f"{col}: IQR is 0, skipping outlier clipping")
        continue

    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    # Count the values outside the fences, then pull them onto the nearest fence.
    before_outliers = ((train_full[col] < lower) | (train_full[col] > upper)).sum()
    train_full[col] = train_full[col].clip(lower, upper)

    print(f"ðŸ”¹ {col}: {before_outliers} outliers were clipped to within bounds")

print("ØªÙ…Øª Ø§Ù„Ù…Ø¹Ø§Ù„Ø¬Ø© train_full")

ðŸ”¹ Weekly_Sales: 35521 outliers were clipped to within bounds
ðŸ”¹ Temperature: 69 outliers were clipped to within bounds
ðŸ”¹ MarkDown1: 55789 outliers were clipped to within bounds
ðŸ”¹ MarkDown2: 103148 outliers were clipped to within bounds
ðŸ”¹ MarkDown3: 84674 outliers were clipped to within bounds
ðŸ”¹ MarkDown4: 79134 outliers were clipped to within bounds
ðŸ”¹ MarkDown5: 40458 outliers were clipped to within bounds
ðŸ”¹ Unemployment: 32114 outliers were clipped to within bounds
ðŸ”¹ Num_Customers: 36494 outliers were clipped to within bounds
ðŸ”¹ Avg_Spend_per_Customer: 69644 outliers were clipped to within bounds
ðŸ”¹ Loyalty_Avg: 18225 outliers were clipped to within bounds
ØªÙ…Øª Ø§Ù„Ù…Ø¹Ø§Ù„Ø¬Ø© train_full


In [14]:
# Derive calendar features from the parsed Date column.
date_parts = train_full['Date'].dt
train_full['Year'] = date_parts.year
train_full['Month'] = date_parts.month
train_full['Week'] = date_parts.isocalendar().week  # ISO week number
train_full['Day'] = date_parts.day
train_full['DayOfWeek'] = date_parts.dayofweek

In [15]:
# Order rows by store, department and date so a shift inside each group moves back in time.
train_full = train_full.sort_values(by=['Store', 'Dept', 'Date'])

# Sales value from 4 rows earlier within the same (Store, Dept) series; the first 4 rows
# of every series get NaN.
# NOTE(review): this equals "4 weeks ago" only if each series has one row per consecutive
# week — confirm there are no gaps.
sales_by_series = train_full.groupby(['Store', 'Dept'])['Weekly_Sales']
train_full['Sales_Lag_4Weeks'] = sales_by_series.shift(periods=4)

In [16]:
# Rows without a 4-step history have no lag value; use 0 for them.
train_full['Sales_Lag_4Weeks'] = train_full['Sales_Lag_4Weeks'].fillna(0)

# Store the two flags and the lag as integers (the cast truncates the lag's fractional part).
for int_col in ('IsHoliday', 'HasPromotion', 'Sales_Lag_4Weeks'):
    train_full[int_col] = train_full[int_col].astype(int)

# The raw timestamp is removed now that the calendar features have been derived from it.
# Done in place, so re-running this cell without re-creating train_full raises a KeyError.
train_full.drop(columns=['Date'], inplace=True)

In [17]:
# One-hot encode the store type and the fuel-price bucket; all other columns are kept as is.
dummy_source_cols = ['Type', 'Fuel_Category']
data = pd.get_dummies(train_full, columns=dummy_source_cols)

In [18]:
# Preview the first five rows of the one-hot encoded frame.
data.head()

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,...,"Fuel_Category_(2.47, 2.672]","Fuel_Category_(2.672, 2.871]","Fuel_Category_(2.871, 3.071]","Fuel_Category_(3.071, 3.27]","Fuel_Category_(3.27, 3.47]","Fuel_Category_(3.47, 3.67]","Fuel_Category_(3.67, 3.869]","Fuel_Category_(3.869, 4.069]","Fuel_Category_(4.069, 4.268]","Fuel_Category_(4.268, 4.468]"
0,1,1,24924.5,0,42.31,2.572,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
1,1,1,46039.49,1,38.51,2.548,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
2,1,1,41595.55,0,39.93,2.514,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
3,1,1,19403.54,0,46.63,2.561,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
4,1,1,21827.9,0,46.5,2.625,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False


In [19]:
# Replace every run of characters other than letters, digits and '_' in the column names
# with a single '_' (the interval-based dummy names contain brackets, commas and spaces).
data.columns = data.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# Target and predictors.
y = data['Weekly_Sales']
X = data.drop(columns='Weekly_Sales')

# Positional 80/20 split: the first 80% of rows train, the remaining 20% test.
# NOTE(review): rows were sorted by Store, Dept, Date earlier, so this split follows that
# ordering rather than calendar time — confirm this is the intended hold-out.
split_index = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Min-max scale the predictors; the scaler is fitted on the training rows only.
scaler_X = MinMaxScaler()
X_train_scale = scaler_X.fit_transform(X_train)
X_test_scale = scaler_X.transform(X_test)

# Same for the target, which must be reshaped to a single column for the scaler.
scaler_y = MinMaxScaler()
y_train_scale = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scale = scaler_y.transform(y_test.values.reshape(-1, 1))

# Wrap the scaled arrays back into pandas objects that keep the original labels.
X_train_scaled = pd.DataFrame(X_train_scale, index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scale, index=X_test.index, columns=X_test.columns)
y_train_scaled = pd.Series(y_train_scale.flatten(), name='Weekly_Sales', index=y_train.index)
y_test_scaled = pd.Series(y_test_scale.flatten(), name='Weekly_Sales', index=y_test.index)

# Seeded 10% random subsample of the training rows, used to keep model fitting fast.
# NOTE(review): the sampled index comes back in shuffled order — relevant if an
# order-sensitive splitter is applied to these frames.
sample_idx = y_train_scaled.sample(frac=0.1, random_state=42).index
df_xsample_train = X_train_scaled.loc[sample_idx]
df_ysample_train = y_train_scaled.loc[sample_idx]

In [20]:
# One seed for every estimator, so that repeated runs of the hyper-parameter search
# (bootstrap samples, row/column subsampling) produce the same fitted models.
SEED = 42

# XGBoost regressor and its search space.
xgb_model = XGBRegressor(objective='reg:squarederror', eval_metric='rmse', random_state=SEED)
xgb_params = {
    'n_estimators': [200, 500, 800],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}


# LightGBM regressor and its search space.
# NOTE(review): in LightGBM `subsample` presumably only takes effect when
# `subsample_freq` > 0 — confirm whether this search dimension does anything.
lgb_model = LGBMRegressor(random_state=SEED)
lgb_params = {
    'n_estimators': [200, 500, 800],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [15, 31, 63],
    'max_depth': [-1, 5, 10],
    'subsample': [0.7, 0.9, 1.0],
    'colsample_bytree': [0.7, 1.0]
}


# Random forest and its search space.
rf_model = RandomForestRegressor(random_state=SEED)
rf_params = {
    'n_estimators': [200, 500, 800],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


# scikit-learn gradient boosting and its search space.
gbr_model = GradientBoostingRegressor(random_state=SEED)
gbr_params = {
    'n_estimators': [200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 5]
}


# Parallel lists: position i of each list describes the same candidate.
models = [xgb_model, rf_model, gbr_model, lgb_model]
parameters = [xgb_params, rf_params, gbr_params, lgb_params]
model_names = ['xgb', 'rf', 'gbr', 'lgb']

# Filled with the best estimator of each search by the tuning loop.
all_models = []

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``; the result is the
    mean of the terms multiplied by 100.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against ``y_true``.

    Returns:
        The SMAPE as a float between 0 and 200 (NaN for empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # A zero denominator means truth and prediction are both 0, i.e. a perfect
    # prediction. Count it as 0 error instead of letting 0/0 turn the mean into NaN.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(terms) * 100

In [21]:
# Baseline: ordinary least squares fitted on the 10% training sample.
linear_model = LinearRegression().fit(df_xsample_train, df_ysample_train)
preds = linear_model.predict(X_test_scaled)

# Evaluation on the hold-out rows; all metrics are computed on the min-max scaled target.
y_true_scaled = y_test_scaled.values
mae = mean_absolute_error(y_true_scaled, preds)
rmse = np.sqrt(mean_squared_error(y_true_scaled, preds))
r2 = r2_score(y_true_scaled, preds)
mape = mean_absolute_percentage_error(y_true_scaled, preds)
smape_val = smape(y_true_scaled, preds)
accuracy = 100 - smape_val  # rough "accuracy": 100 minus the SMAPE percentage

for label, value in (
    ("MAE:", mae),
    ("RMSE:", rmse),
    ("R2:", r2),
    ("MAPE:", mape),
    ("SMAPE:", smape_val),
):
    print(label, value)
print("Approx Accuracy:", accuracy, "%")

MAE: 0.01357146862641912
RMSE: 0.022174664482516857
R2: 0.9933903571298478
MAPE: 0.05133459137800144
SMAPE: 5.23798288832382
Approx Accuracy: 94.76201711167619 %


In [None]:
# Tune every candidate with a randomized search, save the winning estimator to disk,
# score it on the hold-out rows and collect params + metrics in `results`.
import os
import joblib
import json

results = []
os.makedirs('best_models', exist_ok=True)

# The same 5-fold forward-chaining splitter is used for every candidate.
# NOTE(review): the training sample's rows are not in chronological order — confirm that
# a time-series split is meaningful on this frame.
tscv = TimeSeriesSplit(n_splits=5)

for model, params, name in zip(models, parameters, model_names):
  # 10 random draws from the candidate's grid, ranked by (negated) mean absolute error.
  random = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=10,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    n_jobs=4,
    verbose=2,
    random_state=42,
  )
  random.fit(df_xsample_train, df_ysample_train)

  best_params = random.best_params_
  best_model = random.best_estimator_
  all_models.append(best_model)

  # Persist the refitted best estimator.
  model_path = os.path.join('best_models', f'{name}_best_model.joblib')
  joblib.dump(best_model, model_path)

  # Hold-out predictions and metrics (all on the min-max scaled target).
  y_true_scaled = y_test_scaled.values
  preds = best_model.predict(X_test_scaled)
  mae = mean_absolute_error(y_true_scaled, preds)
  rmse = np.sqrt(mean_squared_error(y_true_scaled, preds))
  r2 = r2_score(y_true_scaled, preds)
  mape = mean_absolute_percentage_error(y_true_scaled, preds)
  smape_val = smape(y_true_scaled, preds)
  accuracy = 100 - smape_val

  print('For', name + ':')
  for label, value in (
    ("MAE:", mae),
    ("RMSE:", rmse),
    ("R2:", r2),
    ("MAPE:", mape),
    ("SMAPE:", smape_val),
  ):
    print(label, value)
  print("Approx Accuracy:", accuracy, "%")
  print()
  print(best_params)
  print()

  # Metrics are converted to native Python floats so they serialise cleanly.
  metric_values = {
    'mae': float(mae),
    'rmse': float(rmse),
    'r2': float(r2),
    'mape': float(mape),
    'smape': float(smape_val),
    'accuracy': float(accuracy),
  }
  results.append({
    'model_name': name,
    'model_path': model_path,
    'best_params': best_params,
    **metric_values,
  })

# Save summary to disk (JSON) - params may contain numpy types, so use default=str
summary_path = os.path.join('best_models','best_params_summary.json')
with open(summary_path,'w') as f:
  json.dump(results, f, default=str, indent=2)

import pandas as pd
metrics_df = pd.DataFrame(results)
print('\nSummary of models:')
print(metrics_df[[ 'model_name','mae','rmse','r2','mape','smape','accuracy']])

In [None]:
# Display the list of per-model result dicts (name, saved path, best params, metrics).
results

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

In [24]:
# Show the column dtypes and non-null counts of train_full.
train_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 26 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   Store                   421570 non-null  int64   
 1   Dept                    421570 non-null  int64   
 2   Weekly_Sales            421570 non-null  float64 
 3   IsHoliday               421570 non-null  int64   
 4   Temperature             421570 non-null  float64 
 5   Fuel_Price              421570 non-null  float64 
 6   MarkDown1               421570 non-null  float64 
 7   MarkDown2               421570 non-null  float64 
 8   MarkDown3               421570 non-null  float64 
 9   MarkDown4               421570 non-null  float64 
 10  MarkDown5               421570 non-null  float64 
 11  CPI                     421570 non-null  float64 
 12  Unemployment            421570 non-null  float64 
 13  Type                    421570 non-null  object  
 14  Size

In [25]:
# Target and predictors, taken from train_full (not from the one-hot encoded `data`).
# This rebinds X, y and the four split variables that were defined earlier.
y = train_full["Weekly_Sales"]
X = train_full.drop(columns=["Weekly_Sales"])

# Seeded random 80/20 hold-out split.
# NOTE(review): a random split mixes dates across train and test while a lagged-sales
# feature is present — confirm this evaluation setup is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Column groups for preprocessing, chosen by dtype.
# NOTE(review): only int64/float64 count as numeric here; a column with another numeric
# dtype (Week from isocalendar() is presumably UInt32) is passed through unscaled — confirm.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# One-hot encode the categorical columns, min-max scale the numeric ones and pass every
# remaining column through unchanged.
one_hot = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, cat_cols),
        ("num", MinMaxScaler(), num_cols),
    ],
    remainder='passthrough',
)

# LightGBM regressor with fixed hyper-parameters.
# NOTE(review): `subsample` presumably has no effect unless `subsample_freq` > 0 — confirm.
lgbm_reg = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=1.0,
)

# Preprocessing and regressor bundled into one estimator so both are fitted/saved together.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", lgbm_reg),
])
model.fit(X_train, y_train)

# Hold-out score on the unscaled target.
y_pred = model.predict(X_test)
r2 = r2_score(y_test.values, y_pred)
print("R2:", r2)

print("Training Completed. Sample Predictions:", y_pred[:5])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3158
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 33
[LightGBM] [Info] Start training from score 13655.762306




R2: 0.9980260934101743
Training Completed. Sample Predictions: [47510.09472285  3407.10284364 10423.94592596  3436.07195954
  6226.56628357]


In [32]:
# Persist the fitted pipeline (preprocessing + regressor) for deployment.
# NOTE(review): the file is named 'gbr_pipeline.pkl' although the pipeline's regressor step
# is an LGBMRegressor; the name is left as is because consumers may load it by this path.
import joblib

pipeline_filename = 'gbr_pipeline.pkl'
try:
    print("Saving pipeline!....")
    joblib.dump(model, pipeline_filename)
    print("Pipeline saved successfully!")
except Exception as err:
    # Best-effort save: report the failure instead of stopping the notebook.
    print(f"Unexpected {err=}, {type(err)=}")

Saving pipeline!....
Pipeline saved successfully!
