In [None]:
# Importing libraries
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the three input CSVs from the working directory:
# training rows, rows to predict, and the transactions table.
train, test, trans = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'transactions.csv')
)

In [None]:
# Restrict the transactions to rows whose `dbd` value is exactly 15
# (per the original note: the data from 15 days before departure).
# `.copy()` detaches the slice so later column assignments do not touch `trans`.
trans_15 = trans.loc[trans['dbd'].eq(15)].copy()

In [None]:
# Calendar features derived from the `doj` column.
# `assign` evaluates its keyword arguments in order, so each lambda can read
# the columns produced by the ones before it.
trans_15 = trans_15.assign(
    doj=lambda snap: pd.to_datetime(snap['doj']),
    weekday=lambda snap: snap['doj'].dt.dayofweek,   # Monday=0 ... Sunday=6
    month=lambda snap: snap['doj'].dt.month,
    is_weekend=lambda snap: snap['weekday'].isin([5, 6]).astype(int),  # 1 for Sat/Sun
)

In [None]:
# Parse `doj` as datetime in both train and test so it has the same dtype
# as the `doj` column of trans_15 when the frames are merged on it.
for frame in (train, test):
    frame['doj'] = pd.to_datetime(frame['doj'])

In [None]:
# Attach the day-15 transaction columns to every train/test row.
# A left join keeps every train/test row; rows without a matching
# transaction row get NaN in the added columns.
# NOTE(review): if trans_15 holds more than one row per (doj, srcid, destid),
# this join repeats the corresponding train/test rows — confirm the key is unique.
train_full = pd.merge(train, trans_15, how='left', on=['doj', 'srcid', 'destid'])
test_full = pd.merge(test, trans_15, how='left', on=['doj', 'srcid', 'destid'])


In [None]:
# Mean final_seatcount per (srcid, destid) route, computed over all rows of `train`.
# NOTE(review): because every train row contributes, rows that are later held
# out for validation also feed this feature — consider computing it on the
# training fold only.
route_avg = (
    train.groupby(['srcid', 'destid'])['final_seatcount']
    .mean()
    .rename('route_avg_bookings')
    .reset_index()
)


In [None]:
# Add the per-route average as a feature column on both frames.
# Left join: routes absent from route_avg end up with NaN in route_avg_bookings.
train_full = pd.merge(train_full, route_avg, how='left', on=['srcid', 'destid'])
test_full = pd.merge(test_full, route_avg, how='left', on=['srcid', 'destid'])


In [None]:
# Build a "<srcid>_<destid>" string key identifying each route
# (it is label-encoded to integers in a later cell).
for frame in (train_full, test_full):
    frame['city_pair'] = frame['srcid'].astype(str) + '_' + frame['destid'].astype(str)


In [None]:
# Label-encode the city_pair strings to integer codes.
# The encoder is fitted on the union of train and test keys: LabelEncoder.transform
# raises ValueError on a label it did not see during fit, so fitting on train
# alone would fail as soon as test contains a route that is absent from train.
# When test has no new routes the resulting codes are the same as fitting on
# train only, because the set of classes is unchanged.
le = LabelEncoder()
le.fit(pd.concat([train_full['city_pair'], test_full['city_pair']], ignore_index=True))
train_full['city_pair'] = le.transform(train_full['city_pair'])
test_full['city_pair'] = le.transform(test_full['city_pair'])

In [None]:
# log(1 + x) transform of cumsum_seatcount, added as an extra feature column
# on both frames; log1p stays finite when the count is 0.
for frame in (train_full, test_full):
    frame['log_cumsum_seatcount'] = np.log1p(frame['cumsum_seatcount'])




In [None]:
# Model inputs, in the column order handed to the model, and the column to predict.
features = [
    'cumsum_seatcount',
    'cumsum_searchcount',
    'srcid_tier',
    'destid_tier',
    'srcid_region',
    'destid_region',
    'weekday',
    'month',
    'is_weekend',
    'city_pair',
    'route_avg_bookings',
    'log_cumsum_seatcount',
]
target = 'final_seatcount'


In [None]:
# Columns treated as categorical; cast them to pandas' category dtype
# in both frames.
cat_cols = ['srcid_tier', 'destid_tier', 'srcid_region', 'destid_region']
category_dtypes = dict.fromkeys(cat_cols, 'category')
train_full = train_full.astype(category_dtypes)
test_full = test_full.astype(category_dtypes)


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_full[features],
    train_full[target],
    test_size=0.2,
    random_state=42,
)

In [None]:
!pip install catboost



In [None]:
from catboost import CatBoostRegressor, Pool

# Wrap both splits in CatBoost Pools, flagging the categorical columns by
# their positional index within `features`.
cat_feature_idx = [features.index(name) for name in cat_cols]
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_feature_idx)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_feature_idx)

# Hyperparameters for the CatBoost regressor, collected in one place.
catboost_params = {
    'iterations': 4000,            # upper bound on boosting rounds
    'learning_rate': 0.03,         # small step size, paired with many iterations
    'depth': 7,                    # tree depth
    'loss_function': 'RMSE',       # objective (the leaderboard metric, per the original note)
    'eval_metric': 'RMSE',         # metric reported on the eval set
    'early_stopping_rounds': 100,  # stop when the eval metric has not improved for 100 rounds
    'random_seed': 42,
    'verbose': 100,                # log every 100th iteration
}
model = CatBoostRegressor(**catboost_params)

# Fit on the training pool; the validation pool is supplied as the eval set,
# which is what the early-stopping setting on the model monitors.
model.fit(X=train_pool, eval_set=val_pool)

# Score the held-out split: root of the mean squared error.
val_preds = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(val_mse)
print("Validation RMSE:", rmse)



0:	learn: 1169.2687862	test: 1175.4739770	best: 1175.4739770 (0)	total: 123ms	remaining: 8m 12s
100:	learn: 551.4201143	test: 551.2399597	best: 551.2399597 (100)	total: 7.98s	remaining: 5m 8s
200:	learn: 490.0359887	test: 492.9143404	best: 492.9143404 (200)	total: 13.8s	remaining: 4m 21s
300:	learn: 462.0723959	test: 468.3007851	best: 468.3007851 (300)	total: 21s	remaining: 4m 17s
400:	learn: 445.9313398	test: 454.7688937	best: 454.7688937 (400)	total: 26.9s	remaining: 4m 1s
500:	learn: 435.0447009	test: 446.4941980	best: 446.4941980 (500)	total: 34.4s	remaining: 4m
600:	learn: 427.3077315	test: 441.3718277	best: 441.3718277 (600)	total: 40s	remaining: 3m 46s
700:	learn: 421.4449868	test: 437.7561707	best: 437.7561707 (700)	total: 47.2s	remaining: 3m 41s
800:	learn: 417.4846931	test: 435.6844526	best: 435.6844526 (800)	total: 52.4s	remaining: 3m 29s
900:	learn: 413.6779221	test: 433.5699302	best: 433.5676220 (899)	total: 59.6s	remaining: 3m 24s
1000:	learn: 410.3957542	test: 432.233453

In [None]:
# Recompute the validation RMSE from the fitted model
# (same calculation as at the end of the training cell).
val_preds = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(val_mse)
print("RMSE on validation set:", rmse)


RMSE on validation set: 418.12531778561305


In [None]:
# Test prediction
# The predictions are written back to `test` by position, so the merged
# feature frame must still have exactly one row per test row. Fail early with
# a clear message instead of a length-mismatch error on assignment.
if len(test_full) != len(test):
    raise ValueError(
        f"test_full has {len(test_full)} rows but test has {len(test)}; "
        "the feature merges duplicated rows, so predictions cannot be aligned"
    )
test_preds = model.predict(test_full[features])
# A seat count cannot be negative, but an RMSE regressor is free to predict
# below zero; floor the predictions at 0 before writing them out.
test_preds = np.clip(test_preds, 0, None)
test['final_seatcount'] = test_preds


In [None]:
# Write the submission file: route_key and the predicted final_seatcount for
# each test row, without the DataFrame index.
test.to_csv("submission.csv", columns=['route_key', 'final_seatcount'], index=False)