# stacking

## Import packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from xgboost import XGBRegressor

## Data preprocessing

In [None]:
# Load the training and test sets from CSV files (paths are relative to the working directory).
train_csv, test_csv = 'trainset_w_lle.csv', 'testset_w_lle.csv'
data_train = pd.read_csv(train_csv)
data_test = pd.read_csv(test_csv)
# Preview the first rows of the training frame as the cell output.
data_train.head()

In [None]:
# Columns that receive zero-mean / unit-variance scaling.
number_col = ['previous_5_to_10MA']

# Learn the mean and variance from the training split only, then apply that
# same transform to both splits so the test set is scaled with training statistics.
xscaler = StandardScaler()
xscaler.fit(data_train[number_col])
for frame in (data_train, data_test):
    frame[number_col] = xscaler.transform(frame[number_col])

In [None]:
# Remove columns that are not fed to the models; both frames are modified in place.
unused_cols = ['venue', 'start_hour', 'start_time', 'game_page_url']
for frame in (data_train, data_test):
    frame.drop(columns=unused_cols, inplace=True)

In [None]:
# One-hot encode the categorical columns. Each split is encoded on its own, so the
# resulting dummy columns can differ between the two frames when a category is
# absent from one of them.
categorical_cols = ['team1_name', 'team2_name', 'season_type']
data_train = pd.get_dummies(data_train, columns=categorical_cols)
data_test = pd.get_dummies(data_test, columns=categorical_cols)

In [None]:
# Give the test frame exactly the training frame's columns, in the same order.
# The two splits were one-hot encoded separately, so a category that never occurs
# in the test split has no dummy column there; plain label indexing
# (data_test[train_cols]) raises KeyError in that case. reindex instead creates
# any missing column filled with 0 and drops columns present only in the test split.
train_cols = data_train.columns.tolist()
data_test = data_test.reindex(columns=train_cols, fill_value=0)

In [None]:
# Sanity check: print the column names that appear in only one of the two frames
# (training-only first, then test-only).
train_columns = set(data_train.columns)
test_columns = set(data_test.columns)
print(train_columns.difference(test_columns))
print(test_columns.difference(train_columns))

In [None]:
# Shuffle the training rows. The seed is fixed so the row order -- and with it the
# cv folds built from these rows by the stacking step -- is the same on every run,
# in line with the seeded estimators used in this notebook.
data_train = shuffle(data_train, random_state=123)

In [None]:
# Split each frame into a target vector and a feature matrix.
# DataFrame.pop returns the 'attendance' column and removes it from the frame in
# place, so whatever remains in each frame afterwards is the feature set.
y_train = data_train.pop('attendance').to_numpy()
y_test = data_test.pop('attendance').to_numpy()

x_train = data_train.to_numpy()
x_test = data_test.to_numpy()

# Report the (rows, columns) shape of the training feature matrix.
print(x_train.shape)

## List all models

In [None]:
xgb = XGBRegressor(colsample_bytree=0.3, learning_rate=0.05, max_depth=8, n_estimators=1000, random_state=42)
forest = RandomForestClassifier(n_estimators=100,  random_state=123)
lgclassifier = LogisticRegression(random_state=123)

## stacking

In [None]:
estimators = [
     ('rf', forest),
     ('xgb', xgb)
]
sclf = StackingClassifier(estimators=estimators, final_estimator=lgclassifier, cv=10)

In [None]:
sclf.fit(x_train, y_train)

print()
print(f"Stacking classifier training Accuracy: {sclf.score(x_train, y_train):0.2f}")
print(f"Stacking classifier test Accuracy: {sclf.score(x_test, y_test):0.2f}")