# Setup Libraries and Data

## Import the Libraries

In [26]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor # 472
from xgboost import XGBRegressor #450
from lightgbm import LGBMRegressor #472

import optuna
import warnings
warnings.filterwarnings('ignore')

## Import the Data

In [2]:
# Raw inputs are kept untouched; the submission cell later reads `route_key`
# back from `test_data`.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
transactions_data = pd.read_csv("transactions.csv")

# Working copies that the feature-engineering cells modify.
# `route_key` is removed from the test working copy only.
train_df = train_data.copy()
test_df = test_data.drop(columns='route_key')
trans_df = transactions_data.copy()

# Load the holidays file: keep only rows typed 'Restricted Holiday', drop the
# descriptive/bookkeeping columns, and rename `date` to `doh`.
holiday_df = (
    pd.read_csv('indian_holidays_2023_2025.csv')
    .loc[lambda calendar: calendar['type'] == 'Restricted Holiday']
    .drop(columns=['type', 'description', 'country', 'created_at', 'updated_at'])
    .rename(columns={'date': 'doh'})
)

# Feature Engineering and Transforming

## Converting date columns to datetime dtype

In [3]:
# Convert the date columns (doj, doi, doh) to datetime in every frame that
# has them. Each frame only carries some of these columns, and a column that
# is already datetime64[ns] is left alone.
dfs = [train_df, test_df, trans_df, holiday_df]
date_cols = ['doj', 'doi', 'doh']

for frame in dfs:
    present_date_cols = [name for name in date_cols if name in frame.columns]
    for name in present_date_cols:
        if frame[name].dtype == 'datetime64[ns]':
            continue
        frame[name] = pd.to_datetime(frame[name])

## Adding new features to trans_df

In [4]:
# Holiday calendar in two forms:
#   * a set, for exact-date membership tests;
#   * a sorted array (duplicates kept, NaT dropped), so that range counts can
#     be done with binary search instead of a Python scan per transaction row.
holiday_dates = set(holiday_df['doh'])
holiday_sorted = np.sort(holiday_df['doh'].dropna().to_numpy())

doj_values = trans_df['doj'].to_numpy()
doi_values = trans_df['doi'].to_numpy()
doj_known = trans_df['doj'].notna().to_numpy()
doi_known = trans_df['doi'].notna().to_numpy()

# 1. Is DOJ a holiday?
trans_df['is_doj_holiday'] = trans_df['doj'].isin(holiday_dates).astype(int)

# 2. Is DOI a holiday?
trans_df['is_doi_holiday'] = trans_df['doi'].isin(holiday_dates).astype(int)

# 3. Number of holidays between DOI and DOJ (both ends inclusive)
#    count(h <= doj) - count(h < doi); clipped at 0 for rows where doi > doj,
#    and forced to 0 where either date is missing.
holidays_up_to_doj = np.searchsorted(holiday_sorted, doj_values, side='right')
holidays_before_doi = np.searchsorted(holiday_sorted, doi_values, side='left')
holidays_in_range = np.clip(holidays_up_to_doj - holidays_before_doi, 0, None)
trans_df['num_holidays_between'] = np.where(
    doj_known & doi_known, holidays_in_range, 0
).astype('int64')

# 4. Days between DOI and DOJ
# Not derived here: the existing `dbd` column presumably already holds
# (trans_df['doj'] - trans_df['doi']).dt.days -- TODO confirm.

# 5. Day of week
trans_df['doj_dayofweek'] = trans_df['doj'].dt.dayofweek  # Monday=0
trans_df['doi_dayofweek'] = trans_df['doi'].dt.dayofweek

# 6. Is DOJ on weekend?
trans_df['is_doj_weekend'] = trans_df['doj_dayofweek'].isin([5, 6]).astype(int)  # Saturday or Sunday

# 7. Is DOJ close to holiday (within ±1 day)?
#    The per-row rule is abs((doj - h).days) <= 1. Because `.days` floors, that
#    is the window doj - 2 days < h <= doj + 1 day; for midnight timestamps it
#    reduces to "a holiday on doj - 1, doj or doj + 1".
window_upper = np.searchsorted(
    holiday_sorted, doj_values + np.timedelta64(1, 'D'), side='right'
)
window_lower = np.searchsorted(
    holiday_sorted, doj_values - np.timedelta64(2, 'D'), side='right'
)
trans_df['doj_near_holiday'] = (
    doj_known & ((window_upper - window_lower) > 0)
).astype('int64')

# Preprocessing

## Transforming categorical columns

In [5]:
# Ordinal-encode every object-dtype column of trans_df, overwriting the
# original columns with their encoded values.
# Columns found: ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']
cat_columns_trans_df = [
    name for name, dtype in trans_df.dtypes.items() if dtype == 'O'
]

encoder = OrdinalEncoder()
encoder.fit(trans_df[cat_columns_trans_df])
trans_df[cat_columns_trans_df] = encoder.transform(trans_df[cat_columns_trans_df])

In [7]:
# Sanity check: column dtypes after adding the holiday features and
# ordinal-encoding the categorical columns.
trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266100 entries, 0 to 2266099
Data columns (total 18 columns):
 #   Column                Dtype         
---  ------                -----         
 0   doj                   datetime64[ns]
 1   doi                   datetime64[ns]
 2   srcid                 int64         
 3   destid                int64         
 4   srcid_region          float64       
 5   destid_region         float64       
 6   srcid_tier            float64       
 7   destid_tier           float64       
 8   cumsum_seatcount      float64       
 9   cumsum_searchcount    float64       
 10  dbd                   int64         
 11  is_doj_holiday        int64         
 12  is_doi_holiday        int64         
 13  num_holidays_between  int64         
 14  doj_dayofweek         int32         
 15  doi_dayofweek         int32         
 16  is_doj_weekend        int64         
 17  doj_near_holiday      int64         
dtypes: datetime64[ns](2), float64(6), int32(2)

## Merge trans_df to train_df & test_df

In [8]:
# Keep only the transaction rows with dbd == 15 (presumably the snapshot taken
# 15 days before departure -- confirm), then attach their features to the
# train and test rows that share the same journey date and route.
merge_keys = ['doj', 'srcid', 'destid']
feature_cols = merge_keys + [
    'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier',
    'cumsum_seatcount', 'cumsum_searchcount',
    'is_doj_holiday', 'is_doi_holiday', 'num_holidays_between',
    'doj_dayofweek', 'doi_dayofweek', 'is_doj_weekend', 'doj_near_holiday',
]

transaction_15 = trans_df.loc[trans_df['dbd'] == 15]
features = transaction_15.loc[:, feature_cols]

# Left joins: every train/test row is kept even when no matching snapshot exists.
train_df = pd.merge(train_df, features, how='left', on=merge_keys)
test_df = pd.merge(test_df, features, how='left', on=merge_keys)

## Deleting datetime columns

In [12]:
# `doj` has served its purpose as a merge key; drop the datetime column from
# both frames before they are used as model input.
train_df = train_df.drop(columns=['doj'])
test_df = test_df.drop(columns=['doj'])

# Data Preparation

In [15]:
# Target is `final_seatcount`; every other column of train_df is a feature.
y = train_df['final_seatcount']
X = train_df.drop(columns=['final_seatcount'])

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Baseline: XGBoost with default hyper-parameters, fitted on the training
# split and scored by RMSE on the hold-out split.
model = XGBRegressor().fit(X_train, y_train)
y_pred = model.predict(X_test)

holdout_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(holdout_mse)  # 450.77 on the recorded run
rmse

np.float64(450.7731434881524)

In [29]:
# Correlation of every column in train_df with the target, as a quick look at
# which features carry linear signal for `final_seatcount`.
train_df.corr()['final_seatcount']

srcid                   0.125972
destid                  0.129146
final_seatcount         1.000000
srcid_region            0.120402
destid_region           0.123752
srcid_tier             -0.073298
destid_tier            -0.069496
cumsum_seatcount        0.492531
cumsum_searchcount      0.412472
is_doj_holiday          0.020789
is_doi_holiday         -0.015712
num_holidays_between   -0.007209
doj_dayofweek           0.148771
doi_dayofweek           0.128933
is_doj_weekend          0.128512
doj_near_holiday        0.028642
Name: final_seatcount, dtype: float64

# Hyper-parameter Tuning

def xgb_objective(trial):
    """Optuna objective: hold-out RMSE of an XGBRegressor sampled from ``trial``.

    Fits on the notebook-level ``X_train``/``y_train`` and scores on
    ``X_test``/``y_test``. Those names are looked up when the function is
    called, so run the study before any cell that rebinds them.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        RMSE on the hold-out split; the study minimises this value.

    Notes
    -----
    The same hold-out split scores every trial, so the best value reported by
    the study is an optimistic estimate of generalisation error.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 300),
        # Fixed (not tuned) settings; they do not appear in trial.params.
        'tree_method': 'hist',
        'random_state': 42,
    }

    model = XGBRegressor(**params)
    # No eval_set: with no early stopping configured it cannot influence the
    # fitted model, and it costs an extra evaluation pass per boosting round.
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return rmse

    
# Create the study, or resume it from the SQLite file when one with this name
# already exists (load_if_exists=True), so each run of this cell adds 50 more
# trials to the stored history.
study = optuna.create_study(
    study_name='My_XGB_finetuning',
    direction='minimize',
    storage='sqlite:///optuna_study_xgb_FE.db',
    load_if_exists=True,
)


print('Starting optimizing for XGBRegressor')
study.optimize(xgb_objective, n_trials=50)

print('\n Optimization finished')
# Print the count; `study.trials` itself is the full list of trial objects.
print(f'Number of finished trials: {len(study.trials)}')

print('Best trial')

best_trial = study.best_trial
print(f"  Best RMSE: {best_trial.value:.4f}")
print("  Best parameter:")
for key, value in best_trial.params.items():
    print(f' {key}= {value}')

# Model training and testing

In [30]:
# 5-fold cross-validation of the tuned XGBoost configuration.
# NOTE(review): StratifiedKFold treats every distinct `final_seatcount` value
# as a class. It runs here, but plain KFold is the usual choice for a
# regression target -- confirm the stratification is intended.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

params = {
    'n_estimators': 1162,
    'max_depth': 7,
    'learning_rate': 0.0327176104791863,
    'subsample': 0.9050082641305782,
    'colsample_bytree': 0.8624043845899972,
    'gamma': 7.183492602123284,
    'reg_alpha': 0.00021571251148241148,
    'reg_lambda': 0.20903498293701747,
    'min_child_weight': 9.229715507233834,
}
model = XGBRegressor(**params)
scores = []
for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
    # Fold-local names: the hold-out split (X_train / X_test / y_train /
    # y_test) from the data-preparation cell must stay intact, because
    # xgb_objective reads it as notebook globals.
    X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[test_index]

    # fit() retrains from scratch on each call, so reusing `model` is safe;
    # after the loop it holds the last fold's fit.
    model.fit(X_fold_train, y_fold_train)
    fold_pred = model.predict(X_fold_val)

    fold_rmse = np.sqrt(mean_squared_error(y_fold_val, fold_pred))
    print(f'#### Fold {fold} ####')
    print(f' RMSE = {fold_rmse}')
    scores.append(fold_rmse)

print(f'Average RMSE score = {np.mean(scores):.2f}')

#### Fold 0 ####
 RMSE = 434.81932499254907
#### Fold 1 ####
 RMSE = 430.1161171446376
#### Fold 2 ####
 RMSE = 432.3394457853122
#### Fold 3 ####
 RMSE = 430.7588860857816
#### Fold 4 ####
 RMSE = 430.9163131994676
Average RMSE score = 431.79


In [31]:
# Gain-based feature importance of the most recently fitted `model`
# (after the cross-validation loop, that is the last fold's fit).
model.get_booster().get_score(importance_type='gain')

{'srcid': 11593849.0,
 'destid': 11507363.0,
 'srcid_region': 6473203.5,
 'destid_region': 9533611.0,
 'srcid_tier': 7645406.0,
 'destid_tier': 10358826.0,
 'cumsum_seatcount': 9301697.0,
 'cumsum_searchcount': 18303756.0,
 'is_doj_holiday': 1292288.0,
 'is_doi_holiday': 1640148.125,
 'num_holidays_between': 1948892.0,
 'doj_dayofweek': 2916126.0,
 'doi_dayofweek': 3866566.75,
 'is_doj_weekend': 2650176.75,
 'doj_near_holiday': 2585595.0}

# Submission

In [18]:
# NOTE(review): these hyper-parameters differ from the set evaluated in the
# cross-validation cell -- confirm this is the intended final configuration.
params = {'n_estimators': 1755,
          'max_depth': 11,
          'learning_rate': 0.03625453861957001,
          'subsample': 0.7790066451555812,
          'colsample_bytree': 0.6731412472295383,
          'gamma': 2.4418878678623215,
          'reg_alpha': 2.1597525096902664e-08,
          'reg_lambda': 2.4766274326222894e-05,
          'min_child_weight': 37.88062782224772}

# Final model: refit on every labelled row.
model = XGBRegressor(**params)
model.fit(X, y)

# Select the test columns in training order so features line up by name;
# a missing feature raises KeyError here instead of failing inside predict.
y_pred = model.predict(test_df[X.columns])

# One prediction per submission row is required. Assigning the array directly
# is positional, so a row-count mismatch is reported here rather than being
# silently filled with NaN or truncated by index alignment.
if len(y_pred) != len(test_data):
    raise ValueError(
        f'Got {len(y_pred)} predictions for {len(test_data)} test rows; '
        'the feature merge probably duplicated or dropped rows.'
    )

final_df = test_data[['route_key']].copy()
final_df['final_seatcount'] = y_pred

final_df.to_csv('submission.csv', index=False)