### Import the Libraries

In [1]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

### Import the Data

In [2]:
# Load the three raw CSV inputs. The *_data frames are left unmodified so the
# original columns (e.g. test_data['route_key']) stay available to later cells.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
transactions_data = pd.read_csv("transactions.csv")

# Working copies that the following cells transform. 'route_key' is removed
# from the test working copy only; it is still present in test_data.
train_df = train_data.copy()
test_df = test_data.copy().drop(columns = 'route_key')
trans_df = transactions_data.copy()

### Adding features from transactions and date

In [3]:
# Columns used as the join key between train/test and the transactions table.
merge_keys = ['doj', 'srcid', 'destid']

# Join key plus the attributes pulled from the transactions table.
feature_columns = merge_keys + ['srcid_region', 'destid_region', 'srcid_tier',
                                'destid_tier', 'cumsum_seatcount', 'cumsum_searchcount']

# Keep only the transaction rows recorded at dbd == 15.
transaction_15 = trans_df.loc[trans_df['dbd'] == 15]
features = transaction_15[feature_columns]

# Left joins keep every train/test row; rows without a matching key get NaN
# in the added feature columns.
# NOTE(review): assumes at most one dbd == 15 row per (doj, srcid, destid);
# duplicate keys would multiply train/test rows - confirm against the data.
train_df = pd.merge(train_df, features, how = 'left', on = merge_keys)
test_df = pd.merge(test_df, features, how = 'left', on = merge_keys)



In [4]:
dfs = [train_df, test_df]

# Each frame is modified in place, so train_df and test_df themselves gain the
# new columns and lose 'doj'.
for df in dfs:

    # Parse the 'doj' strings (YYYY-MM-DD) into datetimes.
    df['doj'] = pd.to_datetime(df['doj'], format = '%Y-%m-%d')

    # Break the date into numeric calendar parts.
    date_parts = df['doj'].dt
    df['doj_year'] = date_parts.year
    df['doj_month'] = date_parts.month
    df['doj_day'] = date_parts.day
    df['doj_dayofweek'] = date_parts.dayofweek

    # pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    df['doj_isweekend'] = df['doj_dayofweek'].isin([5, 6]).astype(int)

    # The raw datetime column is dropped now that its parts are extracted.
    df.drop(columns = ['doj'], inplace = True)

### Preprocessing

In [5]:
# Object-dtype (string) columns need an integer encoding before modelling.
categorical_columns = [col for col in train_df.columns if train_df[col].dtype == 'O']
# ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']

# Categories that occur only in the test set are encoded as -1 instead of
# raising an error at transform time.
encoder = OrdinalEncoder(dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1)

# Learn the category -> integer mapping from the training data only ...
train_df[categorical_columns] = encoder.fit_transform(train_df[categorical_columns])
# ... and apply that same mapping to the test data. Refitting on the test set
# could assign a different integer to the same category, so the model would
# see codes that do not mean what they meant during training.
test_df[categorical_columns] = encoder.transform(test_df[categorical_columns])

### Data Preparation

In [6]:
# Separate the prediction target from the feature columns.
y = train_df['final_seatcount']
X = train_df.drop(columns = 'final_seatcount')

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

### Model training and testing

In [7]:
# Fit an XGBoost regressor with default hyperparameters on the training split.
model = XGBRegressor().fit(X_train, y_train)

# Score the held-out split with root mean squared error.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
print(rmse)

392.09103789169905


## Submission

In [8]:
# Refit on all labelled rows (train + validation split) for the submission model.
model = XGBRegressor()
model.fit(X, y)

# Select the test columns by the training feature names so the column set and
# order are guaranteed to match what the model was fitted on.
y_pred = model.predict(test_df[X.columns])

# Predictions are attached positionally, so there must be exactly one per
# test row; fail loudly rather than write a misaligned submission.
if len(y_pred) != len(test_data):
    raise ValueError(
        f"predictions ({len(y_pred)}) and test rows ({len(test_data)}) differ in length; "
        "the feature merge may have duplicated rows"
    )

# route_key comes from the untouched test_data frame (it was dropped from test_df).
final_df = test_data[['route_key']].copy()
# Assign the raw array: wrapping it in a Series would align on index labels
# and could silently truncate or NaN-fill instead of raising.
final_df['final_seatcount'] = y_pred

final_df.to_csv('submission.csv', index = False)