<a href="https://colab.research.google.com/github/SHShifat/CSE465_Spring2025_Group-13/blob/main/LSTM_%26_XGBoost_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/train.csv')
print("Dataset loaded. Columns:", df.columns)
df['Date'] = pd.to_datetime(df['Order Date'], dayfirst=True)
df = df.sort_values('Date').reset_index(drop=True)

daily_sales = df.groupby('Date')['Sales'].sum().reset_index()

daily_sales.set_index('Date', inplace=True)
daily_sales = daily_sales.asfreq('D')

# Median
daily_sales['Sales'] = daily_sales['Sales'].fillna(daily_sales['Sales'].median())

daily_sales['Sales'] = np.log1p(daily_sales['Sales'])

daily_sales.reset_index(inplace=True)

# Feature Engineering
daily_sales['dayofweek'] = daily_sales['Date'].dt.dayofweek
daily_sales['day']       = daily_sales['Date'].dt.day
daily_sales['month']     = daily_sales['Date'].dt.month
daily_sales['quarter']   = daily_sales['Date'].dt.quarter
daily_sales['year']      = daily_sales['Date'].dt.year
daily_sales['is_weekend'] = daily_sales['dayofweek'].isin([5, 6]).astype(int)

print(" Part 1 complete — Preprocessing and feature engineering done.")
print(daily_sales.head())

Dataset loaded. Columns: Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales'],
      dtype='object')
 Part 1 complete — Preprocessing and feature engineering done.
        Date     Sales  dayofweek  day  month  quarter  year  is_weekend
0 2015-01-03  2.859225          5    3      1        1  2015           1
1 2015-01-04  5.666634          6    4      1        1  2015           1
2 2015-01-05  3.022179          0    5      1        1  2015           0
3 2015-01-06  8.391199          1    6      1        1  2015           0
4 2015-01-07  4.479131          2    7      1        1  2015           0


In [4]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def create_sequences(data, window, horizon):
    X, y = [], []
    for i in range(len(data) - window - horizon + 1):
        X.append(data[i:i+window])
        y.append(data[i+window+horizon-1][0])
    return np.array(X), np.array(y)

# Features
features = ['Sales', 'dayofweek', 'day', 'month', 'quarter', 'year', 'is_weekend']
data = daily_sales[features].copy()


scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

window_size = 30
horizon = 14

sales_scaled = scaled_data[:, 0].reshape(-1, 1)
X_lstm, y_lstm = create_sequences(scaled_data, window_size, horizon)
X_lstm = X_lstm[:, :, 0].reshape((X_lstm.shape[0], X_lstm.shape[1], 1))  # use only Sales for LSTM

# LSTM model
model_lstm = Sequential()
model_lstm.add(LSTM(64, activation='relu', input_shape=(window_size, 1)))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer='adam', loss='mse')
model_lstm.fit(X_lstm, y_lstm, epochs=20, verbose=0)

# LSTM predictions
lstm_preds = model_lstm.predict(X_lstm).flatten()

# XGBoost with full multivariate features
X_xgb, y_xgb = create_sequences(scaled_data, window_size, horizon)
X_xgb = X_xgb.reshape((X_xgb.shape[0], X_xgb.shape[1] * X_xgb.shape[2]))

# XGBoost model
model_xgb = XGBRegressor(n_estimators=100, learning_rate=0.1)
model_xgb.fit(X_xgb, y_xgb)

# XGBoost predictions
xgb_preds = model_xgb.predict(X_xgb)

hybrid_preds = (lstm_preds + xgb_preds) / 2

def inverse_sales(scaled_sales):
    dummy = np.zeros((len(scaled_sales), scaled_data.shape[1]))
    dummy[:, 0] = scaled_sales
    return scaler.inverse_transform(dummy)[:, 0]

true_values = y_lstm
true_inverse = inverse_sales(true_values)
hybrid_inverse = inverse_sales(hybrid_preds)

mae = mean_absolute_error(true_inverse, hybrid_inverse)
rmse = np.sqrt(mean_squared_error(true_inverse, hybrid_inverse))
r2 = r2_score(true_inverse, hybrid_inverse)
smape = 100 * np.mean(2 * np.abs(hybrid_inverse - true_inverse) / (np.abs(hybrid_inverse) + np.abs(true_inverse)))

# Results
print("Final Forecast - Evaluation Metrics:")
print(f"# MAE   : {mae:.2f}")
print(f"# RMSE  : {rmse:.2f}")
print(f"# R²    : {r2:.4f}")
print(f"# SMAPE : {smape:.2f}%")

  super().__init__(**kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
Final Forecast - Evaluation Metrics:
# MAE   : 0.63
# RMSE  : 0.84
# R²    : 0.6236
# SMAPE : 10.34%
