### Import the Libraries

In [30]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

import optuna
import warnings
warnings.filterwarnings('ignore')

### Import the Data

In [2]:
# Raw CSV contents are kept untouched in the *_data frames; all feature work
# below happens on the *_df working copies.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
transactions_data = pd.read_csv("transactions.csv")

train_df = train_data.copy()
# `route_key` is removed from the working test frame only; it is read back
# from `test_data` when the submission file is assembled.
test_df = test_data.drop(columns=['route_key'])
trans_df = transactions_data.copy()

### Adding features from transactions and date

In [3]:
# Keep only the transaction rows recorded at dbd == 15
# (presumably "days before departure" — TODO confirm against the data dictionary).
merge_keys = ['doj', 'srcid', 'destid']
feature_columns = merge_keys + [
    'srcid_region', 'destid_region',
    'srcid_tier', 'destid_tier',
    'cumsum_seatcount', 'cumsum_searchcount',
]

transaction_15 = trans_df.loc[trans_df['dbd'].eq(15)]
features = transaction_15.loc[:, feature_columns]

# Left joins keep every train/test row; rows without a matching dbd == 15
# record get NaN in the added columns.
train_df = pd.merge(train_df, features, how='left', on=merge_keys)
test_df = pd.merge(test_df, features, how='left', on=merge_keys)



In [4]:
dfs = [train_df, test_df]

# Expand `doj` into calendar features on both frames (in place), then remove
# the original column so only numeric date parts remain.
for frame in dfs:
    dates = pd.to_datetime(frame['doj'], format='%Y-%m-%d')

    frame['doj_year'] = dates.dt.year
    frame['doj_month'] = dates.dt.month
    frame['doj_day'] = dates.dt.day

    weekday = dates.dt.dayofweek  # Monday = 0 ... Sunday = 6
    frame['doj_dayofweek'] = weekday
    frame['doj_isweekend'] = (weekday >= 5).astype(int)  # Saturday / Sunday flag

    del frame['doj']

### Preprocessing

In [5]:
# Object-dtype columns are treated as categorical.
categorical_columns = [col for col in train_df.columns if train_df[col].dtype == 'O']
# ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']

# Fit the encoder on the training frame ONLY and reuse the learned mapping for
# the test frame. Re-fitting on the test frame would build an independent
# category -> integer mapping, so the same label could receive different codes
# in train and test and the model would be scored on inconsistent features.
# Categories that appear only in the test frame are mapped to -1 instead of
# raising during `transform`.
encoder = OrdinalEncoder(
    dtype = int,
    handle_unknown = 'use_encoded_value',
    unknown_value = -1,
)

train_df[categorical_columns] = encoder.fit_transform(train_df[categorical_columns])
test_df[categorical_columns] = encoder.transform(test_df[categorical_columns])

### Data Preparation

In [6]:
# Separate the regression target from the feature matrix.
target_column = 'final_seatcount'
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

# Fixed-seed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 42,
    test_size = 0.2,
)

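### Hyper-parameter Tuning

No tuning code is included in this section; the fixed `params` used below are presumably the result of an earlier Optuna search (TODO: confirm and add the study code or a link to it).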

### Model training and testing

In [49]:
# 5-fold shuffled cross-validation with a fixed seed.
# NOTE(review): StratifiedKFold stratifies on the values of `y`, which is the
# regression target here — confirm this is intended rather than a plain KFold.
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# Fixed XGBoost hyper-parameters used for every fold.
params = dict(
    n_estimators = 1755,
    max_depth = 11,
    learning_rate = 0.03625453861957001,
    subsample = 0.7790066451555812,
    colsample_bytree = 0.6731412472295383,
    gamma = 2.4418878678623215,
    reg_alpha = 2.1597525096902664e-08,
    reg_lambda = 2.4766274326222894e-05,
    min_child_weight = 37.88062782224772,
)
model = XGBRegressor(**params)

scores = []
for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
    # Note: these names are rebound on every fold, replacing the hold-out
    # split created in the data-preparation cell.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[valid_idx], y.iloc[valid_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Root-mean-squared error on the held-out fold.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'#### Fold {fold} ####')
    print(f' RMSE = {rmse}')
    scores.append(rmse)

print(f'Average RMSE score = {np.mean(scores):.2f}')

#### Fold 0 ####
 RMSE = 341.87024222245617
#### Fold 1 ####
 RMSE = 331.6364993678596
#### Fold 2 ####
 RMSE = 333.2831622600326
#### Fold 3 ####
 RMSE = 337.60424233100167
#### Fold 4 ####
 RMSE = 337.1076564377658
Average RMSE score = 336.30


### Submission

In [54]:
# Refit on the full training set and score the competition test set.
# `params` is the hyper-parameter dict defined in the cross-validation cell;
# reusing it (instead of a second copy of the literal) guarantees the submitted
# model uses exactly the configuration that was validated.
model = XGBRegressor(**params)
model.fit(X, y)
y_pred = model.predict(test_df)

# `test_df` went through a left merge, which can add rows if the right-hand
# side has duplicate keys. Fail loudly on a row-count mismatch instead of
# letting index alignment silently truncate or NaN-pad the predictions.
if len(y_pred) != len(test_data):
    raise ValueError(
        f'Prediction count ({len(y_pred)}) does not match the number of '
        f'test rows ({len(test_data)}); check the feature merge for duplicated keys.'
    )

final_df = test_data[['route_key']].copy()
# Assign the raw array so values are placed positionally, not aligned by index.
final_df['final_seatcount'] = y_pred

final_df.to_csv('submission.csv', index = False)