# Import the Libraries

In [1]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

import optuna
import warnings
warnings.filterwarnings('ignore')

# Import the Data

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
transactions_data = pd.read_csv("transactions.csv")
# ----------------------------------------------
train_df = train_data.copy()
test_df = test_data.copy()
test_df = test_df.drop('route_key', axis = 1)
trans_df = transactions_data.copy()
#-------------------------------------------------

# Data Transformation

## Merge Data

In [3]:
transaction_15 = trans_df[trans_df['dbd'] == 15]

features = transaction_15[['doj', 'srcid', 'destid', 'srcid_region', 'destid_region',
       'srcid_tier', 'destid_tier', 'cumsum_seatcount', 'cumsum_searchcount']]

train_df = train_df.merge(features, on = ['doj', 'srcid', 'destid'], how = 'left')
test_df = test_df.merge(features, on = ['doj', 'srcid', 'destid'], how = 'left')

In [4]:
## Converting Dataype of doj

In [5]:
dfs = [train_df, test_df]

for df in dfs:    
    df['doj'] = pd.to_datetime(df['doj'], format = '%Y-%m-%d')

# Feature Engineering

## Extracting new columns with date columns

In [6]:
dfs = [train_df, test_df]

for df in dfs:
    
    df['doj' + '_year'] = df['doj'].dt.year
    df['doj' + '_month'] = df['doj'].dt.month
    df['doj' + '_day'] = df['doj'].dt.day
    df['doj' + '_dayofweek'] = df['doj'].dt.dayofweek
    df['doj' + '_isweekend'] = df['doj'].dt.dayofweek.isin([5,6]).astype(int)

## encode doj_month with sin/cos encoding to help model cyclical patterns:

In [7]:
train_df['doj_month_sin'] = np.sin(2 * np.pi * train_df['doj_month'] / 12)
train_df['doj_month_cos'] = np.cos(2 * np.pi * train_df['doj_month'] / 12)

test_df['doj_month_sin'] = np.sin(2 * np.pi * test_df['doj_month'] / 12)
test_df['doj_month_cos'] = np.cos(2 * np.pi * test_df['doj_month'] / 12)

In [8]:
monthly_agg_train = train_df.groupby('doj_month').agg(
    avg_cumsum=('cumsum_seatcount', 'mean'),
    trips=('cumsum_seatcount', 'count')
).reset_index()

train_df = train_df.merge(monthly_agg_train, on='doj_month', how='left')

monthly_agg_test = test_df.groupby('doj_month').agg(
    avg_cumsum=('cumsum_seatcount', 'mean'),
    trips=('cumsum_seatcount', 'count')
).reset_index()

test_df = test_df.merge(monthly_agg_test, on='doj_month', how = 'left')

In [9]:
train_df.head()

Unnamed: 0,doj,srcid,destid,final_seatcount,srcid_region,destid_region,srcid_tier,destid_tier,cumsum_seatcount,cumsum_searchcount,doj_year,doj_month,doj_day,doj_dayofweek,doj_isweekend,doj_month_sin,doj_month_cos,avg_cumsum,trips
0,2023-03-01,45,46,2838.0,Karnataka,Tamil Nadu,Tier 1,Tier 1,16.0,480.0,2023,3,1,2,0,1.0,6.123234000000001e-17,36.772419,6200
1,2023-03-01,46,45,2298.0,Tamil Nadu,Karnataka,Tier 1,Tier 1,34.0,352.0,2023,3,1,2,0,1.0,6.123234000000001e-17,36.772419,6200
2,2023-03-01,45,47,2720.0,Karnataka,Andhra Pradesh,Tier 1,Tier 1,36.0,892.0,2023,3,1,2,0,1.0,6.123234000000001e-17,36.772419,6200
3,2023-03-01,47,45,2580.0,Andhra Pradesh,Karnataka,Tier 1,Tier 1,18.0,1130.0,2023,3,1,2,0,1.0,6.123234000000001e-17,36.772419,6200
4,2023-03-01,46,9,4185.0,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,48.0,1023.0,2023,3,1,2,0,1.0,6.123234000000001e-17,36.772419,6200


# Preprocessing

## Transforming Categorical Columns

In [10]:
categorical_columns = [col for col in train_df.columns if train_df[col].dtype == 'O']
# ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']

encoder = OrdinalEncoder(dtype = int)

train_df[categorical_columns] = encoder.fit_transform(train_df[categorical_columns])
test_df[categorical_columns] = encoder.fit_transform(test_df[categorical_columns])

## Delete doj column

In [11]:
train_df = train_df.drop(['doj'], axis =1)
test_df = test_df.drop(['doj'], axis = 1)

# Data Preparation

In [12]:
X = train_df.drop(['final_seatcount'], axis = 1)
y = train_df['final_seatcount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Hyper-parameter Tuning

# Model training and testing

In [17]:
cv = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

params = {'n_estimators': 1308, 
          'max_depth': 11, 
          'learning_rate': 0.08271400384945467, 
          'subsample': 0.9522311846284324, 
          'colsample_bytree': 0.6173075657509195, 
          'gamma': 2.534318254800432, 
          'reg_alpha': 28.608542170321446, 
          'reg_lambda': 9.564406519071592e-08, 
          'min_child_weight': 46.43542065154699}

model = XGBRegressor(**params)
scores = []
for fold, (train_index, test_index) in enumerate(cv.split(X,y)):
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'#### Fold {fold} ####')
    print(f' RMSE = {rmse}')
    scores.append(rmse)

print(f'Average RMSE score = {np.mean(scores):.2f}') # 3330

#### Fold 0 ####
 RMSE = 336.96037321955083
#### Fold 1 ####
 RMSE = 325.7640826405754
#### Fold 2 ####
 RMSE = 329.7954161853825
#### Fold 3 ####
 RMSE = 331.8384537345266
#### Fold 4 ####
 RMSE = 330.2297737475059
Average RMSE score = 330.92


In [18]:
model.get_booster().get_score(importance_type='gain')

{'srcid': 1928730.625,
 'destid': 2393509.5,
 'srcid_region': 1703928.375,
 'destid_region': 1320462.625,
 'srcid_tier': 1519529.75,
 'destid_tier': 1412135.125,
 'cumsum_seatcount': 1292808.25,
 'cumsum_searchcount': 4637109.5,
 'doj_year': 488999.25,
 'doj_month': 427468.0625,
 'doj_day': 319363.78125,
 'doj_dayofweek': 737800.6875,
 'doj_isweekend': 572971.9375,
 'doj_month_sin': 486663.40625,
 'doj_month_cos': 728158.6875,
 'avg_cumsum': 503260.3125,
 'trips': 282838.6875}

# Submission

In [20]:
params = {'n_estimators': 1755, 
          'max_depth': 11, 
          'learning_rate': 0.03625453861957001, 
          'subsample': 0.7790066451555812, 
          'colsample_bytree': 0.6731412472295383, 
          'gamma': 2.4418878678623215, 
          'reg_alpha': 2.1597525096902664e-08, 
          'reg_lambda': 2.4766274326222894e-05, 
          'min_child_weight': 37.88062782224772}

model = XGBRegressor(**params)
model.fit(X,y)
y_pred = model.predict(test_df)

final_df = pd.DataFrame(test_data['route_key'])
final_df['final_seatcount'] = pd.Series(y_pred)

final_df.to_csv('submission.csv', index = False)