In [None]:
import warnings
# To prevent warnings by XGBoost estimator 
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

%matplotlib inline

# Load the raw dataset into the working frame; the path is relative, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv('test/dataset.csv')

In [None]:
# Convert day_id from its compact YYYYMMDD form into pandas datetimes so that
# calendar components can be extracted from it later.
df['day_id'] = pd.to_datetime(df['day_id'], format='%Y%m%d')

In [None]:
# Columns that hold exactly one distinct (non-null) value are constant and are
# removed before any further processing.
print('Columns to delete first:')
distinct_counts = df.nunique()
to_drop_singles = [name for name, n_distinct in distinct_counts.items()
                   if n_distinct == 1]
df.drop(columns=to_drop_singles, inplace=True)

# Show the names of the removed columns as the cell output.
to_drop_singles

In [None]:
# Rows with a missing fp0 label are counted, reported and then discarded.
null_fp_mask = df['fp0'].isnull()
print('Samples without fp0: %d' % null_fp_mask.sum())

# Keep labelled rows only and renumber them 0..n-1; the neighbour-difference
# step later in the notebook addresses the last row as len(df)-1 via .loc.
df = df[~null_fp_mask].reset_index(drop=True)

In [None]:
# Histogram of the fp0 target (20 bins) with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# confirm it is still available in the pinned seaborn version.
sns.distplot(df['fp0'], bins=20, fit=stats.norm)
# Open a second figure so the probability plot does not draw over the histogram.
plt.figure()
# Probability plot of fp0 (scipy's default reference distribution is used).
proba_plot = stats.probplot(df['fp0'], plot=plt)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode is_home into two indicator columns, then drop the original.
# Explicit category setting to prevent warnings
encoder = OneHotEncoder(categories='auto')
# The encoder expects a 2-D array: one column, one row per sample.
is_home = np.reshape(df['is_home'].values, (-1, 1))
is_home_encoded = encoder.fit_transform(is_home).toarray()
# NOTE(review): the encoded columns follow the order of encoder.categories_
# (sorted category values), while the names below are fixed. If is_home is
# coded 0/1 (or False/True), the first column flags is_home == 0 yet is named
# 'home' -- confirm the labels are not swapped. This assignment also assumes
# is_home has exactly two distinct values.
df[['home', 'visitor']] = pd.DataFrame(is_home_encoded, index=df.index)

df.drop(columns='is_home', inplace=True)

In [None]:
# A day count cannot be negative: report how many rows violate this and
# clamp those values to zero.
neg_days_mask = df['p_days_since_last_game'].lt(0)
print(f'Count of days invalid values: {neg_days_mask.sum()}')
df.loc[neg_days_mask, 'p_days_since_last_game'] = 0.0

In [None]:
# Map each new column name to the accessor that pulls that calendar component
# out of a single timestamp.
datetime_extraction = {
    'year': lambda x: x.year,
    'month': lambda x: x.month,
    'day': lambda x: x.day,
}
year, month, day = (datetime_extraction[key]
                    for key in ('year', 'month', 'day'))

# Materialise year / month / day columns from day_id ...
for col, extract in datetime_extraction.items():
    df[col] = df['day_id'].map(extract)

# ... after which the original timestamp column is dropped.
df.drop(columns='day_id', inplace=True)

In [None]:
# Sanity check: every observation should carry a player_id.
# null_ids keeps the offending rows (if any) so they can be inspected.
null_ids = df[df['player_id'].isnull()]
print('Count of null player_id\'s: {}'.format(len(null_ids)))

In [None]:
# Number of distinct players present in the frame
# (unique() would count NaN as one value if any player_id were missing).
unique_ids = df['player_id'].unique()
print('Players count in frame: {}'.format(len(unique_ids)))

In [None]:
def columns_by_postfix(columns, postfix):
    """Return, sorted ascending, the names in ``columns`` that contain ``postfix``.

    Despite the name, the match is a plain substring test (``postfix in
    column``), so the marker may appear anywhere in the column name.

    Args:
        columns: iterable of column-name strings.
        postfix: substring to look for.

    Returns:
        A new sorted list of the matching names (duplicates are kept).
    """
    return sorted(column for column in columns if postfix in column)

def league_and_non_league_columns(columns, suffix):
    """Split the columns containing ``suffix`` into non-league and league groups.

    A column belongs to the league group when its name also contains the
    substring ``'league'``.

    Args:
        columns: iterable of column-name strings.
        suffix: substring selecting the statistic family.

    Returns:
        A ``(non_leagues, leagues)`` tuple of lists, each in sorted order.
    """
    matching = columns_by_postfix(columns, suffix)
    non_leagues, leagues = [], []
    # ``matching`` is already sorted, so a single pass keeps both groups sorted.
    for column in matching:
        target = leagues if 'league' in column else non_leagues
        target.append(column)
    return non_leagues, leagues


# Group the frame's columns into statistic families. Every call returns a
# (non-league, league) pair of sorted column-name lists, selected by a
# substring of the column name.
cols = df.columns

single_cols, league_single_cols = league_and_non_league_columns(cols, 'single')
double_cols, league_double_cols = league_and_non_league_columns(cols, 'double')
triple_cols, league_triple_cols = league_and_non_league_columns(cols, 'triple')
er_cols, league_er_cols = league_and_non_league_columns(cols, 'earnedruns')
rbi_cols, league_rbi_cols = league_and_non_league_columns(cols, 'rbi')
ab_cols, league_ab_cols = league_and_non_league_columns(cols, 'atbat')
ha_cols, league_ha_cols = league_and_non_league_columns(cols, 'hitsagainst')
hr_cols, league_hr_cols = league_and_non_league_columns(cols, 'hrallowed')
iw_cols, league_iw_cols = league_and_non_league_columns(cols, 'intwalk')
sb_cols, league_sb_cols = league_and_non_league_columns(cols, 'stolen')
ss_cols, league_ss_cols = league_and_non_league_columns(cols, 'swinging')
so_cols, league_so_cols = league_and_non_league_columns(cols, 'strikeouts')
fp0_cols, league_fp0_cols = league_and_non_league_columns(cols, 'fp0_sma')
fp0var_cols, league_fp0var_cols =\
    league_and_non_league_columns(cols, 'fp0_var')
# 'walk' is also a substring of 'intwalk', so names containing 'int' are
# filtered out of the walk family afterwards.
walk_cols, league_walk_cols = league_and_non_league_columns(cols, 'walk')
walk_cols = [col for col in walk_cols if 'int' not in col]
league_walk_cols = [col for col in league_walk_cols if 'int' not in col]
# The trailing underscore keeps 'win_' from matching inside 'swinging'.
win_cols, league_win_cols = league_and_non_league_columns(cols, 'win_')
nh_cols, league_nh_cols = league_and_non_league_columns(cols, 'nohitter')
no_cols, league_no_cols = league_and_non_league_columns(cols, 'numouts')
np_cols, league_np_cols = league_and_non_league_columns(cols, 'numpitches')
# 'completegame_' (with underscore) cannot match 'completegamesho_' names,
# which keeps these two families disjoint.
cg_cols, league_cg_cols = league_and_non_league_columns(cols, 'completegame_')
cgs_cols, league_cgs_cols =\
    league_and_non_league_columns(cols, 'completegamesho_')
qu_cols, league_qu_cols = league_and_non_league_columns(cols, 'quality')
gc_cols, league_gc_cols = league_and_non_league_columns(cols, 'gamescount')

In [None]:
# The three lists below are positionally aligned: prefixes[k] names the
# feature built from non_league_columns[k] and league_columns[k], so any
# reordering must be applied to all three at once.
# The last five entries (numpitches ... gamecount) are the families that the
# feature-engineering cells average over 'pg' columns only.
prefixes = ['atbat', 'single', 'triple', 'double', 'rbi',
            'earnedruns', 'hitsagainst', 'hr', 'intwalk', 'stolenbases',
            'swinging', 'strikeouts', 'walk', 'win', 'nohitter',
            'completegamesho', 'completegame', 'numpitches', 'numouts',
            'quality', 'fp0var', 'gamecount']

# gc_cols[1:-1] skips the first and last entries of the sorted gamescount list.
# NOTE(review): presumably these are 'gamescount_in_team' and
# 'gamescount_with_oppteam_in_series', which are dropped by name later on --
# confirm against the actual column names.
non_league_columns = [ab_cols, single_cols, triple_cols,
                      double_cols, rbi_cols, er_cols, ha_cols, hr_cols,
                      iw_cols, sb_cols, ss_cols, so_cols, walk_cols,
                      win_cols, nh_cols, cgs_cols, cg_cols, np_cols,
                      no_cols, qu_cols, fp0var_cols, gc_cols[1:-1]]

league_columns = [league_ab_cols, league_single_cols,
                  league_triple_cols, league_double_cols, league_rbi_cols,
                  league_er_cols, league_ha_cols, league_hr_cols,
                  league_iw_cols, league_sb_cols, league_ss_cols,
                  league_so_cols, league_walk_cols, league_win_cols,
                  league_nh_cols, league_cgs_cols, league_cg_cols,
                  league_np_cols, league_no_cols, league_qu_cols,
                  league_fp0var_cols, league_gc_cols]

In [None]:
from operator import itemgetter

# Positions (into prefixes / non_league_columns / league_columns) of a reduced
# family subset. The selection itself is disabled: the three lines below are
# commented out, so `indices` currently has no effect on the pipeline.
indices = [0, 3, 5, 6, 7, 8, 11, 12, 13, 15, 16, 20]

#prefixes = list(itemgetter(*indices)(prefixes))
#non_league_columns = list(itemgetter(*indices)(non_league_columns))
#league_columns = list(itemgetter(*indices)(league_columns))

In [None]:
pp_limit_index = -5

# Feature engineering (part 1)
# Collapse every family of similar columns into one averaged feature:
#     new_feature = sum(old_features) / old_features_count
# The row sum skips NaN cells while the divisor is the full column count.

# Families before pp_limit_index get both a 'pp' and a 'pg' average.
paired_families = zip(non_league_columns[:pp_limit_index],
                      prefixes[:pp_limit_index])
for cols, prefix in paired_families:
    pp_cols = columns_by_postfix(cols, 'pp')
    pg_cols = columns_by_postfix(cols, 'pg')
    df[prefix+'_sma_pp_avg'] = df[pp_cols].sum(axis=1).div(len(pp_cols))
    df[prefix+'_sma_pg_avg'] = df[pg_cols].sum(axis=1).div(len(pg_cols))

# The trailing families are averaged over their 'pg' columns only.
pg_only_families = zip(non_league_columns[pp_limit_index:],
                       prefixes[pp_limit_index:])
for cols, prefix in pg_only_families:
    pg_cols = columns_by_postfix(cols, 'pg')
    df[prefix+'_sma_pg_avg'] = df[pg_cols].sum(axis=1).div(len(pg_cols))

# Once averaged, the source columns are removed family by family.
for cols in non_league_columns:
    df.drop(columns=cols, inplace=True)

In [None]:
# League counterpart of the family averaging: same computation, with the
# resulting feature names prefixed by 'l_'.

# Families before pp_limit_index get both a 'pp' and a 'pg' average.
paired_league_families = zip(league_columns[:pp_limit_index],
                             prefixes[:pp_limit_index])
for cols, prefix in paired_league_families:
    pp_cols = columns_by_postfix(cols, 'pp')
    pg_cols = columns_by_postfix(cols, 'pg')
    df['l_'+prefix+'_sma_pp_avg'] = df[pp_cols].sum(axis=1).div(len(pp_cols))
    df['l_'+prefix+'_sma_pg_avg'] = df[pg_cols].sum(axis=1).div(len(pg_cols))

# The trailing families are averaged over their 'pg' columns only.
pg_only_league_families = zip(league_columns[pp_limit_index:],
                              prefixes[pp_limit_index:])
for cols, prefix in pg_only_league_families:
    pg_cols = columns_by_postfix(cols, 'pg')
    df['l_'+prefix+'_sma_pg_avg'] = df[pg_cols].sum(axis=1).div(len(pg_cols))

# Remove the league source columns now that their averages exist.
for cols in league_columns:
    df.drop(columns=cols, inplace=True)

In [None]:
# Feature engineering (part 2)
# Every column whose name contains 'avg' is replaced by the difference between
# each row and the row before it.
avgs = columns_by_postfix(df.columns, 'avg')

for col in avgs:
    averaged = df[col]
    # Row 0 has no predecessor, so the value of the last row (label
    # len(df)-1) is prepended, making the difference wrap around.
    wrap_value = averaged.loc[len(df)-1]
    df['diff_'+col] = np.diff(averaged, prepend=wrap_value)

df.drop(columns=avgs, inplace=True)

In [None]:
# All neighbour-difference features, in sorted name order.
diffs = columns_by_postfix(df.columns, 'diff_')

# Prefixes of the families that have both a pp and a pg feature (everything
# except the last five), sorted so that prefs[i // 2] is meant to line up with
# the sorted column pairs below.
prefs = prefixes[:-5]
prefs.sort()

# Exclude columns with only pp or pg suffix
# NOTE(review): these are hard-coded positions in the sorted `diffs` list;
# they silently point at the wrong columns if the column set ever changes.
to_exclude = [10, 11, 28, 29, 38, 39, 40, 59, 60, 61]
pairs = [diffs[i] for i in range(len(diffs))
         if i not in to_exclude]

# League features were created with an 'l_' prefix and then a 'diff_' prefix,
# hence the '_l_' marker in their names.
leagues = columns_by_postfix(pairs, '_l_')
non_leagues = [col for col in pairs
               if col not in leagues]

# Walk the sorted columns two at a time and store the absolute difference of
# each adjacent pair; assumes `leagues` and `non_leagues` have equal, even
# length and that each adjacent pair belongs to the same family.
for i in range(0, len(non_leagues), 2):
    pref = prefs[i // 2]
    df[f'labs_{pref}'] = np.abs(df[leagues[i]] - df[leagues[i+1]])
    df[f'abs_{pref}'] = np.abs(df[non_leagues[i]] - df[non_leagues[i+1]])

In [None]:
# For every adjacent column pair, store the mean of the two columns
# (non-league first, then league).
for i in range(0, len(non_leagues), 2):
    pref = prefs[i // 2]
    first, second = non_leagues[i], non_leagues[i+1]
    df[f'avg_sum_{pref}'] = df[first].add(df[second]).div(2)
    l_first, l_second = leagues[i], leagues[i+1]
    df[f'lavg_sum_{pref}'] = df[l_first].add(df[l_second]).div(2)

In [None]:
# The diff columns that were left out of the pairing step, in to_exclude
# order: ex[k] == diffs[to_exclude[k]].
ex = list(itemgetter(*to_exclude)(diffs))

# Average statistics for features without pair
# Ex.: new_feature = (league_feature + feature) / 2
# NOTE(review): the index pairs below (0-2, 1-3, 4-7, 5-8, 6-9) rely on the
# alphabetical order of `diffs`; presumably each pair is the non-league and
# league version of the same statistic -- confirm against the column names.

df['e_avgsum_fp0'] = (df[ex[0]] + df[ex[2]]) / 2
df['e_avgsum_gamecount'] = (df[ex[1]] + df[ex[3]]) / 2
df['e_avgsum_numouts'] = (df[ex[4]] + df[ex[7]]) / 2
df['e_avgsum_numpitches'] = (df[ex[5]] + df[ex[8]]) / 2
df['e_avgsum_quality'] = (df[ex[6]] + df[ex[9]]) / 2

# Every diff_ column has now been folded into a derived feature.
df.drop(columns=diffs, inplace=True)

In [None]:
# Average the non-league fp0_sma columns in adjacent pairs, separately for
# the 'pp' columns and for the remaining (pg) columns.
pps = columns_by_postfix(fp0_cols, 'pp')
pgs = [col for col in fp0_cols if col not in pps]
# Stable sort by name length: shorter names first, alphabetical order kept
# among names of equal length.
pps.sort(key=len)
pgs.sort(key=len)

# len(pps) == len(pgs) ! (assumed, not checked)
# Stop at len(pps) - 1 so that a trailing unpaired column is skipped instead
# of raising IndexError on pps[i+1]; this is the same bound the league
# variant of this step uses, and it is identical for an even column count.
for i in range(0, len(pps) - 1, 2):
    df[f'{i}_fp0_pp'] = (df[pps[i]] + df[pps[i+1]]) / 2
    df[f'{i}_fp0_pg'] = (df[pgs[i]] + df[pgs[i+1]]) / 2

# The source columns are replaced by the pair averages.
df.drop(columns=fp0_cols, inplace=True)

In [None]:
# League fp0_sma columns: split into 'pp' names and the rest, each ordered by
# name length (stable, so alphabetical order survives among equal lengths).
pps = sorted(columns_by_postfix(league_fp0_cols, 'pp'), key=len)
pgs = sorted((col for col in league_fp0_cols if col not in pps), key=len)

# len(pps) == len(pgs) !
# Average adjacent pairs; a trailing unpaired column is left out.
for i in range(0, len(pps)-1, 2):
    df[f'l_{i}_fp0_pp'] = df[pps[i]].add(df[pps[i+1]]).div(2)
    df[f'l_{i}_fp0_pg'] = df[pgs[i]].add(df[pgs[i+1]]).div(2)

# The source columns are replaced by the pair averages.
df.drop(columns=league_fp0_cols, inplace=True)

In [None]:
# Remove the columns that are not used as model inputs: identifiers, the
# calendar components derived from day_id, and the remaining listed columns.
# drop() raises KeyError if any of these names is absent from the frame.
df.drop(columns=['player_id', 'year', 'day', 'month', 'game_id',
                 'winddir', 'status_pp', 'status_pg', 'height',
                 'p_hand', 'weight', 'precip', 'windspeed',
                 'gamescount_in_team', 'gamescount_with_oppteam_in_series'],
                 inplace=True)

In [None]:
# Final fp0 reduction: every column whose name contains 'fp0_' is merged with
# its neighbour (in sorted name order) into one averaged column.
fps = columns_by_postfix(df.columns, 'fp0_')

for i in range(0, len(fps), 2):
    left, right = fps[i], fps[i+1]
    df[f'{i}_fp0_'] = df[left].add(df[right]).div(2)

# Only the merged columns are kept.
df.drop(columns=fps, inplace=True)

print(f'Data finishing shape {df.shape}')

In [None]:
from sklearn.model_selection import train_test_split

# Move the fp0 target to the first column so that features and target can be
# selected by position.
reordered_cols = df.columns.drop('fp0').insert(0, 'fp0')
df = df.reindex(columns=reordered_cols)

# X: every column except the first; Y: the fp0 target.
X = df[df.columns[1:]]
Y = df[df.columns[0]]

# Train-test split
# 20% of all rows are held out for testing (fixed seed for reproducibility).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)

# Train-validation split
# A further 20% of the remaining rows becomes the validation set, giving
# roughly 64% train / 16% validation / 20% test overall.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=0)

In [None]:
def adjusted_r2(x: pd.DataFrame, r2):
    """Return the adjusted R^2 for a score obtained on the feature frame ``x``.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)`` where ``n`` is the number
    of rows of ``x`` and ``p`` its number of columns.

    Args:
        x: feature frame the score was computed on.
        r2: plain coefficient of determination.

    Returns:
        The adjusted coefficient of determination.

    Raises:
        ZeroDivisionError: if ``n == p + 1`` (for a plain Python number ``r2``).
    """
    n_samples = len(x)
    n_features = len(x.columns)
    nominator = (1 - r2) * (n_samples - 1)
    denominator = n_samples - n_features - 1
    return 1 - nominator / denominator

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost

# 6-fold splitter (shuffled, fixed seed) used for the cross-validation below.
k_fold = KFold(n_splits=6, shuffle=True, random_state=0)

# Fit an XGBoost regressor with default hyper-parameters on the training split.
xgb_regr = xgboost.XGBRegressor(random_state=1)
xgb_regr = xgb_regr.fit(X_train, Y_train)

# In-sample fit quality: RMSE and adjusted R^2 on the training split.
Y_train_pred = xgb_regr.predict(X_train)
mse_train = mean_squared_error(Y_train, Y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(Y_train, Y_train_pred)
r2_adj_train = adjusted_r2(X_train, r2_train)
print(f'Train rmse: {rmse_train:.4f}')
print(f'Train adj. R2: {r2_adj_train:.4f}')

# Generalisation: the same metrics on the held-out test split.
Y_pred = xgb_regr.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, Y_pred)
r2_adj = adjusted_r2(X_test, r2)
print(f'Test rmse: {rmse:.4f}')
print(f'Test adj. R2: {r2_adj:.4f}')

# NOTE(review): cross-validation is run on the validation split only, so the
# estimator is re-fitted on folds of X_val rather than evaluated as fitted
# above; with no `scoring` argument the estimator's default score is reported.
# Confirm this is the intended use of the validation set.
cv_score = cross_val_score(xgb_regr, X_val, Y_val, cv=k_fold)
print(f'Cross validation mean: {cv_score.mean():.4f}')

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every feature, measured on the training split.
result = permutation_importance(xgb_regr, X_train, Y_train, random_state=1)
perm_sorted_idx = result.importances_mean.argsort()

# Built-in (tree-based) importances, sorted ascending; the bars are centred
# at 0.5, 1.5, ... so that each one sits in the middle of its unit slot.
tree_importance_sorted_idx = np.argsort(xgb_regr.feature_importances_)
tree_indices = np.arange(0, len(xgb_regr.feature_importances_)) + 0.5

# Figure 1: horizontal bar chart of the model's own feature importances.
fig, ax = plt.subplots(figsize=(20, 35))
ax.barh(tree_indices,
        xgb_regr.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Fix the tick positions first and label them afterwards: labelling ticks
# whose positions are still chosen automatically can trigger a matplotlib
# warning, and the labels are only guaranteed to match fixed positions.
ax.set_yticks(tree_indices)
ax.set_yticklabels(X.columns[tree_importance_sorted_idx])
ax.set_ylim((0, len(xgb_regr.feature_importances_)))
fig.tight_layout()

# Figure 2: box plot of the permutation importances over the repeats,
# one box per feature, ordered by mean importance.
fig, ax = plt.subplots(figsize=(20, 35))
ax.boxplot(result.importances[perm_sorted_idx].T, vert=False,
           labels=X.columns[perm_sorted_idx])
fig.tight_layout()