In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the input tables. Every table except `stores` is read with its
# 'date' column parsed as dates.
date_cols = ['date']
stores = pd.read_csv('stores.csv')
train = pd.read_csv('train.csv', parse_dates=date_cols)
test = pd.read_csv('test.csv', parse_dates=date_cols)
oil = pd.read_csv('oil.csv', parse_dates=date_cols)
holidays = pd.read_csv('holidays_events.csv', parse_dates=date_cols)
transactions = pd.read_csv('transactions.csv', parse_dates=date_cols)

In [None]:
# Report the first and last date present in each split.
train_start, train_end = train['date'].min(), train['date'].max()
test_start, test_end = test['date'].min(), test['date'].max()
print(f"Training Range: {train_start} to {train_end}")
print(f"Testing Range:  {test_start} to {test_end}")

In [None]:
# Sum sales over all rows that share a date, giving one total per day.
daily_sales = train.groupby('date')['sales'].sum()

# Draw the daily totals on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(daily_sales)
ax.set_title("Total Sales History (Notice the spikes in December!)")
plt.show()

In [None]:
# --- 1. PREPARE THE DATA FRAME ---
# Stack train on top of test so every feature is engineered once for both splits.
# `train_len` records how many leading rows came from train.
train_len = train.shape[0]
all_data = pd.concat([train, test], sort=False, ignore_index=True)
all_data

In [None]:
# --- 2. DATE FEATURES (The "Calendar") ---
# Break each date into numeric calendar parts that a tree model can split on.
all_data['date'] = pd.to_datetime(all_data['date'])
date_parts = all_data['date'].dt
all_data['year'] = date_parts.year
all_data['month'] = date_parts.month
all_data['day'] = date_parts.day
all_data['dayofweek'] = date_parts.dayofweek  # Monday=0 ... Sunday=6
all_data['weekend'] = all_data['dayofweek'].isin([5, 6]).astype(int)  # Sat/Sun -> 1

In [None]:
# Attach the oil table to every row by date.
# The series is first placed on a daily grid; days without a value are then
# filled by interpolation, extended in both directions to cover the edges.
oil_indexed = oil.set_index('date')
oil_daily = oil_indexed.resample('D').mean()
oil = oil_daily.interpolate(limit_direction='both').reset_index()
all_data = all_data.merge(oil, on='date', how='left')

In [None]:
# B. Holidays (Simplify)
# Binary holiday feature: focuses on effective days off rather than holiday names.
# NOTE(review): the flag is joined on date alone, so a holiday row of any scope
# marks that date for every store - confirm this is intended.

# Filter out moved holidays and add the flag in one step. `.assign` writes the
# new column to a fresh frame instead of a slice of the original, which avoids
# pandas' SettingWithCopyWarning.
holidays = holidays.loc[holidays['transferred'] == False].assign(is_holiday=1)

# Several events can share a date (e.g., Local + National); keep one row per
# date so the left merge cannot duplicate rows of all_data.
holiday_map = holidays[['date', 'is_holiday']].drop_duplicates(subset='date')
all_data = pd.merge(all_data, holiday_map, on='date', how='left')

# Dates with no match come back as NaN. Fill them with 0 and store the flag as
# integers, consistent with the other binary flags (weekend, wages_day).
all_data['is_holiday'] = all_data['is_holiday'].fillna(0).astype(int)

In [None]:
# --- 4. WAGE-DAY FLAG ---
# Mark the 15th and the last day of each month with 1, every other day with 0.
is_mid_month = all_data['day'].eq(15)
is_last_day = all_data['day'].eq(all_data['date'].dt.days_in_month)
all_data['wages_day'] = (is_mid_month | is_last_day).astype(int)

print("Feature Engineering Complete. Shape:", all_data.shape)
all_data.sample(10)

In [None]:
!pip install lightgbm
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error

In [None]:
# Bring in the columns of `stores`, matched on store number (left join keeps every row).
all_data = all_data.merge(stores, on='store_nbr', how='left')

In [None]:
# --- 1. ENCODING (Text -> Numbers) ---
# Replace each text column with integer codes. One encoder is refit for every
# column, so afterwards `le` only holds the classes of the last one ('state').
le = LabelEncoder()
for text_col in ('family', 'type', 'city', 'state'):
    all_data[text_col] = le.fit_transform(all_data[text_col].astype(str))

In [None]:
# Split the engineered frame back into its training and test parts by date.
# The cutoff is read from the raw training table rather than hard-coded, so the
# split stays correct if the input files cover a different period.
train_end_date = train['date'].max()
train_data = all_data[all_data['date'] <= train_end_date]
test_data = all_data[all_data['date'] > train_end_date]

In [None]:
# Target on the log(1 + sales) scale; np.expm1 is the inverse of this transform.
y = np.log1p(train_data['sales'].to_numpy())

In [None]:
# Remove the row id, the raw date and the target from both feature matrices.
drop_cols = ['id', 'date', 'sales']
X = train_data.drop(columns=drop_cols)
X_test = test_data.drop(columns=drop_cols)

In [None]:
# --- 3. HYBRID PART A: THE TREND (Linear Model) ---
# time_step is the number of whole days since the first training date.
# A plain row counter would give rows that share a date different "time"
# values and would act as a unique row id; a day count gives every row of a
# date the same value. Test rows use the same origin, so their steps continue
# the training timeline.
trend_origin = train_data['date'].min()
# X / X_test were derived from train_data / test_data by dropping columns, so
# they share the same index and the assignment aligns row for row.
X['time_step'] = (train_data['date'] - trend_origin).dt.days
X_test['time_step'] = (test_data['date'] - trend_origin).dt.days

In [None]:
# Fit a Ridge (L2-regularised linear) regression of the log target on
# time_step alone; its fitted line is the trend component of the hybrid.
trend_features = X[['time_step']]
model_linear = Ridge(alpha=0.5)
model_linear.fit(trend_features, y)

In [None]:
# Evaluate the fitted trend on the training rows and on the test rows.
pred_linear_train, pred_linear_test = (
    model_linear.predict(frame[['time_step']]) for frame in (X, X_test)
)

In [None]:
# Residual = log target minus the fitted trend.
# The tree model is trained on these residuals, not on the raw sales.
y_residuals = np.subtract(y, pred_linear_train)

print("Trend Model Trained.")

In [None]:
# LightGBM training set: the full feature matrix against the trend residuals.
# The listed columns are declared as categorical features.
categorical_cols = ['store_nbr', 'family', 'city']
lgb_train = lgb.Dataset(X, label=y_residuals, categorical_feature=categorical_cols)

In [None]:
# LightGBM settings for the residual model, grouped by purpose.
params = {
    # Reproducibility and logging
    'seed': 42,
    'verbose': -1,
    # Task definition
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # Tree size and step size
    'num_leaves': 100,
    'learning_rate': 0.05,
    # Sub-sampling: 80% of the features, and 70% of the rows with bagging_freq=1
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
}

# Fixed number of boosting iterations.
boost_rounds = 1000

print("Training LightGBM (Seasonality)...")
model_lgb = lgb.train(params, lgb_train, num_boost_round=boost_rounds)

print("Hybrid Training Complete.")

In [None]:
# 1. Residual forecast for the test rows from the boosted model
#    (the trend forecast for the same rows is already in pred_linear_test).
lgb_pred = model_lgb.predict(data=X_test)

In [None]:
# 2. Combine (Additive)
# Log-scale forecast = trend prediction + predicted residual.
final_log_pred = np.add(pred_linear_test, lgb_pred)

In [None]:
# 3. Inverse Transform (Log -> Real Sales)
# Undo the log1p transform, then floor at zero because sales cannot be negative.
final_sales = np.maximum(np.expm1(final_log_pred), 0)

In [None]:
# 4. Create Submission File
# One row per test id with its predicted sales, written without the index column.
submission = test_data[['id']].assign(sales=final_sales)

submission.to_csv('submission_hybrid.csv', index=False)
print("Submission Ready! This uses a Detrended-Hybrid approach.")