# XGBoost

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

## Read dataset

In [2]:
# Load the pre-processed train and test splits (relative paths, resolved
# against the current working directory).
data_train, data_test = (
    pd.read_csv(csv_name)
    for csv_name in ('processed_training.csv', 'processed_test.csv')
)
# Preview the first rows of the training frame.
data_train.head()

Unnamed: 0,attendance,is_federal_holiday,venue,on_grass,temperature,wind_speed,team1_name,team1_pre_win,team1_pre_loss,team1_pre_win_pct,...,weather_Rain,weather_Sunny,season_type,season,home_team_avg_att_last_year,start_hour,start_hour_label_afternoon,start_hour_label_evening,start_hour_label_night,start_hour_label_noon
0,35055,0,Wrigley Field,1,-2.786672,-0.09309,STL,-1.62645,-1.654514,0.0,...,0,1,regular,2015,0.345307,19,0,1,0,0
1,49043,0,Chase Field,1,0.705205,-0.09309,SFG,-1.62645,-1.654514,0.0,...,0,1,regular,2015,-0.573314,19,0,1,0,0
2,43633,0,Great American Ball Park,1,-0.52167,1.490839,PIT,-1.62645,-1.654514,0.0,...,0,0,regular,2015,0.066649,16,1,0,0,0
3,45030,0,Comerica Park,1,-2.031671,-0.489072,MIN,-1.62645,-1.654514,0.0,...,0,0,regular,2015,0.766348,13,0,0,0,1
4,43753,0,Minute Maid Park,1,0.516455,1.292848,CLE,-1.62645,-1.654514,0.0,...,0,0,regular,2015,-1.084575,18,0,1,0,0


In [3]:
# Remove 'venue' and 'start_hour' from both splits so they are not fed to the model.
data_train = data_train.drop(columns=['venue', 'start_hour'])
data_test = data_test.drop(columns=['venue', 'start_hour'])

In [4]:
# One-hot encode 'team1_name'. Each split is encoded on its own, so the
# resulting dummy columns depend on which team codes occur in that split.
data_train, data_test = (
    pd.get_dummies(frame, columns=['team1_name'])
    for frame in (data_train, data_test)
)

In [5]:
# One-hot encode 'team2_name', again separately for the train and test frames.
data_train, data_test = (
    pd.get_dummies(frame, columns=['team2_name'])
    for frame in (data_train, data_test)
)

In [6]:
# One-hot encode 'season_type' in both splits.
data_train, data_test = (
    pd.get_dummies(frame, columns=['season_type'])
    for frame in (data_train, data_test)
)

In [7]:
# Sanity check after the independent one-hot encodings: list every column that
# exists in only one of the two splits. The symmetric difference is used so
# that columns missing from *either* side are reported, not just train-only
# ones. An empty set means both frames expose the same column names.
set(data_train.columns) ^ set(data_test.columns)

set()

In [8]:
from sklearn.utils import shuffle

# Shuffle the training rows with a fixed seed. The row order determines which
# rows land in which cross-validation fold further down, so an unseeded shuffle
# would make the grid-search scores change on every re-run.
data_train = shuffle(data_train, random_state=42)

In [9]:
# Separate the regression target from the features. `pop` removes the
# 'attendance' column from the frame and returns it in one step.
y_train = data_train.pop('attendance').to_numpy()
y_test = data_test.pop('attendance').to_numpy()

# The frames are converted to positional arrays below, so the test features
# must have exactly train's columns in train's order. Selecting by train's
# column index enforces that: it reorders, drops test-only columns, and raises
# a KeyError if the test split lacks a column the model will be trained on.
data_test = data_test[data_train.columns]

# Request a float matrix explicitly: on pandas versions where get_dummies
# produces boolean columns, a mixed bool/numeric frame would otherwise be
# converted to an `object` array.
x_train = data_train.to_numpy(dtype=float)
x_test = data_test.to_numpy(dtype=float)

print(x_train.shape)

(12270, 273)


## K-fold cross-validation and grid search

In [10]:
from sklearn.model_selection import KFold

# Five-fold splitter without shuffling (shuffle=False is the KFold default,
# spelled out here for clarity).
kf = KFold(n_splits=5, shuffle=False)
# Echo the number of splits the splitter will produce.
kf.get_n_splits(X=x_train)

5

In [11]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [12]:
# Hyper-parameter grid: 3 * 6 * 3 * 2 = 108 candidate configurations.
parameters = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.1, 0.01, 0.001],
    'colsample_bytree': [0.3, 0.7],
}

# Exhaustive search with 5-fold cross-validation on all available cores.
# The scorer is the negated RMSE, so larger (closer to zero) is better.
model = XGBRegressor(random_state=42)
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
clf.fit(x_train, y_train)

print("Best parameters:", clf.best_params_)

## Train with best params

In [None]:
# Fit the final regressor on the full training set with a fixed configuration.
# NOTE(review): these values are presumably the winner reported by the grid
# search above; confirm against clf.best_params_ after re-running it.
# random_state=42 mirrors the estimator that was scored during the search, so
# the final model uses the same seed as the configuration that was selected.
# NOTE(review): the name `xgb` rebinds the `import xgboost as xgb` module alias
# from the first cell; it is kept because the evaluation cells call xgb.predict.
xgb = XGBRegressor(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=3,
    n_estimators=1000,
    random_state=42,
)
xgb = xgb.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# In-sample error of the fitted model: predictions are made on the same rows
# the model was trained on.
train_predict = xgb.predict(x_train)
train_mse = mean_squared_error(y_train, train_predict)
print("Train MSE:", train_mse)
# RMSE is derived from the MSE instead of passing `squared=False`, which newer
# scikit-learn releases deprecated and then removed.
print("Train RMSE:", np.sqrt(train_mse))
print("Train MAE:", mean_absolute_error(y_train, train_predict))
print("Train MAPE:", mean_absolute_percentage_error(y_train, train_predict))

## Evaluation on the test set

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Held-out error of the fitted model. The printed labels say "Test" because
# these numbers are computed on x_test / y_test, not on the training data.
y_pred = xgb.predict(x_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_mse)
# RMSE is derived from the MSE instead of passing `squared=False`, which newer
# scikit-learn releases deprecated and then removed.
print("Test RMSE:", np.sqrt(test_mse))
print("Test MAE:", mean_absolute_error(y_test, y_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_pred))