In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from lightgbm.callback import early_stopping, log_evaluation
import holidays

# Load data
train = pd.read_csv("train_JDXlpm8/train/train.csv", parse_dates=["doj"])
test = pd.read_csv("test_8gqdJqH.csv", parse_dates=["doj"])
transactions = pd.read_csv("train_JDXlpm8/train/transactions.csv", parse_dates=["doj", "doi"])

# Filter and aggregate transaction features for dbd=15 only
def calc_tx_features(transactions, dbd):
    tx = transactions[transactions['dbd'] == dbd].copy()
    tx = tx.groupby(['srcid', 'destid', 'doj']).agg({
        'cumsum_seatcount': 'sum',
        'cumsum_searchcount': 'sum',
        'srcid_region': 'first',
        'destid_region': 'first',
        'srcid_tier': 'first',
        'destid_tier': 'first'
    }).reset_index()
    tx.rename(columns={
        'cumsum_seatcount': f'seatcount_{dbd}',
        'cumsum_searchcount': f'searchcount_{dbd}'
    }, inplace=True)
    return tx

tx_15 = calc_tx_features(transactions, 15)

# Merge features into train and test
train_merged = train.merge(tx_15, on=['srcid', 'destid', 'doj'], how='left')
test_merged = test.merge(tx_15, on=['srcid', 'destid', 'doj'], how='left')

# Route-level averages for dbd=15
agg_15 = transactions[transactions['dbd'] == 15].groupby(['srcid', 'destid'])[['cumsum_seatcount', 'cumsum_searchcount']].mean().reset_index()
agg_15.rename(columns={
    'cumsum_seatcount': 'avg_seatcount_15',
    'cumsum_searchcount': 'avg_searchcount_15'
}, inplace=True)
train_merged = train_merged.merge(agg_15, on=['srcid', 'destid'], how='left')
test_merged = test_merged.merge(agg_15, on=['srcid', 'destid'], how='left')

# Feature engineering
for df in [train_merged, test_merged]:
    df['day_of_week'] = df['doj'].dt.dayofweek
    df['month'] = df['doj'].dt.month
    df['week_of_year'] = df['doj'].dt.isocalendar().week.astype(int)
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['seat_to_search_15'] = df['seatcount_15'] / (df['searchcount_15'] + 1)
    df['is_wedding_season'] = df['month'].isin([4, 5, 6, 11, 12]).astype(int)
    df['is_school_vacation'] = df['month'].isin([4, 5, 12]).astype(int)
    df['is_exam_season'] = df['month'].isin([2, 3]).astype(int)

# Fill missing values
cols = ['seatcount_15', 'searchcount_15', 'seat_to_search_15', 'avg_seatcount_15', 'avg_searchcount_15']
train_merged[cols] = train_merged[cols].fillna(0)
test_merged[cols] = test_merged[cols].fillna(0)

# Add holiday feature
indian_holidays = holidays.India(years=[2023, 2024, 2025])
for df in [train_merged, test_merged]:
    df['is_holiday'] = df['doj'].apply(lambda x: 1 if x in indian_holidays else 0)

# Route average seatcount for grouping weight
route_avg = train.groupby(['srcid', 'destid'])['final_seatcount'].mean().to_dict()
weights = train['srcid'].astype(str) + "_" + train['destid'].astype(str)
weights = weights.map(lambda x: route_avg.get(x, 1))

# Final features
cat_cols = ['srcid', 'destid', 'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']
num_cols = ['seatcount_15', 'searchcount_15', 'seat_to_search_15', 'avg_seatcount_15', 'avg_searchcount_15',
            'day_of_week', 'month', 'week_of_year', 'is_weekend', 'is_holiday', 'is_wedding_season',
            'is_school_vacation', 'is_exam_season']
features = cat_cols + num_cols

# Convert categorical
for df in [train_merged, test_merged]:
    for col in cat_cols:
        df[col] = df[col].fillna('missing').astype('category')

# Prepare data
X = train_merged[features]
y = train_merged['final_seatcount']
X_test = test_merged[features]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_lgbm = np.zeros(len(X))
test_lgbm = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    model_lgbm = LGBMRegressor(
        n_estimators=6000,
        learning_rate=0.03,
        num_leaves=64,
        max_depth=8,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=1.2,
        reg_lambda=1.2,
        random_state=42
    )
    model_lgbm.fit(X.iloc[train_idx], y.iloc[train_idx],
                   eval_set=[(X.iloc[val_idx], y.iloc[val_idx])],
                   sample_weight=weights.iloc[train_idx],
                   callbacks=[early_stopping(300), log_evaluation(500)],
                   categorical_feature=cat_cols)
    oof_lgbm[val_idx] = model_lgbm.predict(X.iloc[val_idx])
    test_lgbm += model_lgbm.predict(X_test) / kf.n_splits

rmse = np.sqrt(mean_squared_error(y, oof_lgbm))
print(f"\n✅ Final Validation RMSE: {rmse:.4f}\n")

submission = test[['route_key']].copy()
submission['final_seatcount'] = np.clip(test_lgbm, 0, None)
submission.to_csv("submission.csv", index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1170
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 19
[LightGBM] [Info] Start training from score 2003.632533
Training until validation scores don't improve for 300 rounds
[500]	valid_0's l2: 144976
[1000]	valid_0's l2: 138160
[1500]	valid_0's l2: 135187
[2000]	valid_0's l2: 134062
[2500]	valid_0's l2: 133307
[3000]	valid_0's l2: 133027
[3500]	valid_0's l2: 132720
Early stopping, best iteration is:
[3513]	valid_0's l2: 132705
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1171
[LightGBM] 