In [None]:
import datetime as dt
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.utils import plot_model, to_categorical

In [None]:
# Read the three input files from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Sample submission file; its shape is printed a few cells below.
submission = pd.read_csv('sample_submission.csv')

  train = pd.read_csv('train.csv')


In [None]:
# Seed TensorFlow and NumPy with the same value for repeatable runs.
tf.random.set_seed(123)
np.random.seed(123)

DEBUG = False   # when True, only the first 10000 training rows are kept
CLASS = 3       # number of units in the softmax output layer
T_HIST = 10     # time-axis length used when reshaping to (samples, T_HIST, features)
MASK = -666     # NaN replacement value used by the optional masking step

if DEBUG:
    train = train.iloc[:10000]

In [None]:
# Discard training rows whose `home_team_history_match_date_1` is missing.
train = train.dropna(subset=['home_team_history_match_date_1'])

# Disabled experiment: keep only leagues with more than 15 training rows.
if False:
    rows_per_league = train.groupby('league_id')['id'].transform('count')
    train['league_id_count'] = rows_per_league
    train = train.loc[train['league_id_count'] > 15]

In [None]:
# Report the row/column counts of the training data and the sample submission,
# then preview the first training rows.
print(f"Train: {train.shape} \n Submission: {submission.shape}")
train.head()

Train: (109779, 190) 
 Submission: (72711, 4)


Unnamed: 0,id,target,home_team_name,away_team_name,match_date,league_name,league_id,is_cup,home_team_coach_id,away_team_coach_id,...,away_team_history_league_id_1,away_team_history_league_id_2,away_team_history_league_id_3,away_team_history_league_id_4,away_team_history_league_id_5,away_team_history_league_id_6,away_team_history_league_id_7,away_team_history_league_id_8,away_team_history_league_id_9,away_team_history_league_id_10
0,11906497,away,Newell's Old Boys,River Plate,2019-12-01 00:45:00,Superliga,636,False,468196.0,468200.0,...,1122.0,642.0,636.0,636.0,636.0,1122.0,636.0,642.0,636.0,1122.0
1,11984383,home,Real Estelí,Deportivo Las Sabanas,2019-12-01 01:00:00,Primera Division,752,False,516788.0,22169161.0,...,752.0,752.0,752.0,752.0,752.0,752.0,752.0,752.0,752.0,752.0
2,11983301,draw,UPNFM,Marathón,2019-12-01 01:00:00,Liga Nacional,734,False,2510608.0,456313.0,...,734.0,734.0,734.0,734.0,734.0,734.0,734.0,734.0,734.0,734.0
3,11983471,away,León,Morelia,2019-12-01 01:00:00,Liga MX,743,False,1552508.0,465797.0,...,743.0,743.0,743.0,743.0,743.0,743.0,743.0,743.0,746.0,743.0
4,11883005,home,Cobán Imperial,Iztapa,2019-12-01 01:00:00,Liga Nacional,705,False,429958.0,426870.0,...,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0


In [None]:
# Every column whose name contains "date" (as found in `train`) is parsed to
# datetime in both frames.
date_columns = list(train.filter(regex='date', axis=1).columns)
for frame in (train, test):
    for name in date_columns:
        frame[name] = pd.to_datetime(frame[name])

def add_features(df, n_hist=10):
    """Return `df` with derived per-history-slot features appended.

    For every history slot ``i`` in ``1..n_hist`` the following columns are
    computed for the home and the away team (17 columns per slot):

    * days between ``match_date`` and the slot's match date,
    * goal difference (own goals minus opponent goals),
    * win / loss indicators and a 2/1/0 result code derived from that difference,
    * logistic rating expectations ``1 / (1 + 10 ** (gap / 10))`` for
      opponent-vs-own rating (home, away) and away-vs-home rating,
    * flags telling whether the slot's coach / league equals the current one.

    All new columns are assembled first and attached with a single
    ``pd.concat``; inserting them one by one fragments the frame and triggers
    pandas' PerformanceWarning. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide match table holding ``match_date``, ``league_id``,
        ``home_team_coach_id``, ``away_team_coach_id`` and the
        ``*_history_*_{i}`` columns for every requested slot. Date columns
        must already be datetime typed.
    n_hist : int, default 10
        Number of history slots to process.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the derived ones, in
        slot order. Derived columns already present in `df` are replaced, so
        calling the function twice does not create duplicates.

    Notes
    -----
    The win/loss indicators are 0 when the goal difference is missing, whereas
    the result code is NaN in that case. The coach/league flags are 0 when
    either side of the comparison is missing.
    """
    def _expectation(opponent_rating, own_rating):
        # Logistic curve on the rating gap, scaled by 10.
        return 1 / (1 + 10 ** ((opponent_rating - own_rating) / 10))

    def _result_code(goal_diff):
        # 2 = positive difference, 1 = zero, 0 = negative, NaN = unknown.
        return np.where(goal_diff > 0., 2.,
                        np.where(goal_diff == 0., 1.,
                                 np.where(goal_diff.isna(), np.nan, 0.)))

    derived = {}

    def _put(name, values):
        # Plain arrays avoid any index alignment when the frame is assembled.
        derived[name] = np.asarray(values)

    for i in range(1, n_hist + 1):
        home_gd = df[f'home_team_history_goal_{i}'] - df[f'home_team_history_opponent_goal_{i}']
        away_gd = df[f'away_team_history_goal_{i}'] - df[f'away_team_history_opponent_goal_{i}']
        home_rating = df[f'home_team_history_rating_{i}']
        away_rating = df[f'away_team_history_rating_{i}']

        # Insertion order below defines the output column order.
        _put(f'home_team_history_match_DIFF_day_{i}',
             (df['match_date'] - df[f'home_team_history_match_date_{i}']).dt.days)
        _put(f'away_team_history_match_DIFF_days_{i}',
             (df['match_date'] - df[f'away_team_history_match_date_{i}']).dt.days)
        _put(f'home_team_history_DIFF_goal_{i}', home_gd)
        _put(f'away_team_history_DIFF_goal_{i}', away_gd)
        _put(f'home_winner_{i}', np.where(home_gd > 0, 1., 0.))
        _put(f'home_loser_{i}', np.where(home_gd < 0, 1., 0.))
        _put(f'away_winner_{i}', np.where(away_gd > 0, 1., 0.))
        _put(f'away_loser_{i}', np.where(away_gd < 0, 1., 0.))
        _put(f'home_team_result_{i}', _result_code(home_gd))
        _put(f'away_team_result_{i}', _result_code(away_gd))
        _put(f'home_team_history_ELO_rating_{i}',
             _expectation(df[f'home_team_history_opponent_rating_{i}'], home_rating))
        _put(f'away_team_history_ELO_rating_{i}',
             _expectation(df[f'away_team_history_opponent_rating_{i}'], away_rating))
        _put(f'home_away_team_history_ELO_rating_{i}',
             _expectation(away_rating, home_rating))
        _put(f'home_team_history_SAME_coaX_{i}',
             np.where(df['home_team_coach_id'] == df[f'home_team_history_coach_{i}'], 1, 0))
        _put(f'away_team_history_SAME_coaX_{i}',
             np.where(df['away_team_coach_id'] == df[f'away_team_history_coach_{i}'], 1, 0))
        _put(f'home_team_history_SAME_leaG_{i}',
             np.where(df['league_id'] == df[f'home_team_history_league_id_{i}'], 1, 0))
        _put(f'away_team_history_SAME_leaG_{i}',
             np.where(df['league_id'] == df[f'away_team_history_league_id_{i}'], 1, 0))

    # Drop stale copies of the derived columns (re-run safety), then attach
    # everything in one operation.
    base = df.drop(columns=list(derived), errors='ignore')
    return pd.concat([base, pd.DataFrame(derived, index=df.index)], axis=1)

# Apply the same feature derivation to both data sets.
train = add_features(train)
test = add_features(test)

  df[f'away_loser_{i}'] = np.where(df[f'away_team_history_DIFF_goal_{i}'] < 0, 1., 0.)
  df[f'home_team_result_{i}'] = np.where(df[f'home_team_history_DIFF_goal_{i}'] > 0., 2.,
  df[f'away_team_result_{i}'] = np.where(df[f'away_team_history_DIFF_goal_{i}'] > 0., 2.,
  df[f'home_team_history_ELO_rating_{i}'] = 1/(1+10**((df[f'home_team_history_opponent_rating_{i}']-df[f'home_team_history_rating_{i}'])/10))
  df[f'away_team_history_ELO_rating_{i}'] = 1/(1+10**((df[f'away_team_history_opponent_rating_{i}']-df[f'away_team_history_rating_{i}'])/10))
  df[f'home_away_team_history_ELO_rating_{i}'] = 1/(1+10**((df[f'away_team_history_rating_{i}']-df[f'home_team_history_rating_{i}'])/10))
  df[f'home_team_history_SAME_coaX_{i}'] = np.where(df['home_team_coach_id']==df[f'home_team_history_coach_{i}'],1,0)
  df[f'away_team_history_SAME_coaX_{i}'] = np.where(df['away_team_coach_id']==df[f'away_team_history_coach_{i}'],1,0)
  df[f'home_team_history_SAME_leaG_{i}'] = np.where(df['league_id']==df[f'h

In [None]:
def _strip_non_model_columns(frame, extra=()):
    """Return `frame` without team names, `extra`, and every column whose name
    matches 'date', 'league' or 'coach' (case-sensitive regex search)."""
    matched = [
        column
        for pattern in ('date', 'league', 'coach')
        for column in frame.filter(regex=pattern).columns
    ]
    return frame.drop([*extra, 'home_team_name', 'away_team_name', *matched], axis=1)


train_y = train['target'].copy()
train_x = _strip_non_model_columns(train, extra=['target'])
test_x = _strip_non_model_columns(test)

In [None]:
# Sanity check: target length, feature-table shapes and remaining column names.
print(f"Target: {train_y.shape} \n Train shape: {train_x.shape} \n Test: {test_x.shape}")
print(f"Column names: {list(train_x.columns)}")

Target: (109779,) 
 Train shape: (109779, 292) 
 Test: (72711, 292)
Column names: ['id', 'is_cup', 'home_team_history_is_play_home_1', 'home_team_history_is_play_home_2', 'home_team_history_is_play_home_3', 'home_team_history_is_play_home_4', 'home_team_history_is_play_home_5', 'home_team_history_is_play_home_6', 'home_team_history_is_play_home_7', 'home_team_history_is_play_home_8', 'home_team_history_is_play_home_9', 'home_team_history_is_play_home_10', 'home_team_history_is_cup_1', 'home_team_history_is_cup_2', 'home_team_history_is_cup_3', 'home_team_history_is_cup_4', 'home_team_history_is_cup_5', 'home_team_history_is_cup_6', 'home_team_history_is_cup_7', 'home_team_history_is_cup_8', 'home_team_history_is_cup_9', 'home_team_history_is_cup_10', 'home_team_history_goal_1', 'home_team_history_goal_2', 'home_team_history_goal_3', 'home_team_history_goal_4', 'home_team_history_goal_5', 'home_team_history_goal_6', 'home_team_history_goal_7', 'home_team_history_goal_8', 'home_team_hist

In [None]:
# Preview the wide feature table.
train_x.head()

Unnamed: 0,id,is_cup,home_team_history_is_play_home_1,home_team_history_is_play_home_2,home_team_history_is_play_home_3,home_team_history_is_play_home_4,home_team_history_is_play_home_5,home_team_history_is_play_home_6,home_team_history_is_play_home_7,home_team_history_is_play_home_8,...,away_loser_10,home_team_result_10,away_team_result_10,home_team_history_ELO_rating_10,away_team_history_ELO_rating_10,home_away_team_history_ELO_rating_10,home_team_history_SAME_coaX_10,away_team_history_SAME_coaX_10,home_team_history_SAME_leaG_10,away_team_history_SAME_leaG_10
0,11906497,False,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,2.0,2.0,0.698653,0.570885,0.567616,1,1,1,0
1,11984383,False,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,2.0,0.711318,0.204827,0.718473,1,1,1,1
2,11983301,False,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.587305,0.805052,0.281921,1,1,1,1
3,11983471,False,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.691667,0.368863,0.642603,1,1,1,1
4,11883005,False,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.801097,0.347241,0.785462,1,0,1,1


In [None]:
# Column-name stems; each one exists in the wide tables as `<stem>_<n>`.
feature_groups = ["home_team_history_is_play_home", "home_team_history_is_cup",
    "home_team_history_goal", "home_team_history_opponent_goal",
    "home_team_history_rating", "home_team_history_opponent_rating",
    "away_team_history_is_play_home", "away_team_history_is_cup",
    "away_team_history_goal", "away_team_history_opponent_goal",
    "away_team_history_rating", "away_team_history_opponent_rating",
    "home_team_history_match_DIFF_day", "away_team_history_match_DIFF_days",
    "home_team_history_DIFF_goal","away_team_history_DIFF_goal",
    "home_team_history_ELO_rating","away_team_history_ELO_rating",
    "home_away_team_history_ELO_rating",
    "home_team_history_SAME_coaX", "away_team_history_SAME_coaX",
    "home_team_history_SAME_leaG", "away_team_history_SAME_leaG",
    "home_team_result", "away_team_result",
    "home_winner", "home_loser", "away_winner", "away_loser"]


def _to_long(wide):
    """Unpivot `wide` to one row per (id, is_cup, time), one column per stem.

    The numeric suffix of each `<stem>_<n>` column becomes the `time` index
    level. The suffix pattern is a raw string: a bare '\\d' in a normal string
    literal is an invalid escape sequence.
    """
    return pd.wide_to_long(wide, stubnames=feature_groups,
                           i=['id', 'is_cup'], j='time', sep='_', suffix=r'\d+')


train_x_pivot = _to_long(train_x)
test_x_pivot = _to_long(test_x)

print(f"Train pivot shape: {train_x_pivot.shape}")
print(f"Test pivot shape: {test_x_pivot.shape}")

In [None]:
def _finalize_long_frame(long_df):
    """Turn the index levels back into columns and encode `is_cup` as 0/1.

    * The index is reset only while it is still a MultiIndex, so re-running
      the cell does not insert a spurious `index` column.
    * `is_cup` becomes uint8: 1 where the value equals True, 0 otherwise, so
      missing values are treated as False. An explicit cast is used instead of
      `pd.get_dummies(..., drop_first=True)`, which yields no column at all
      when the data contains a single level and returns bool on pandas >= 2.
    """
    if isinstance(long_df.index, pd.MultiIndex):
        out = long_df.reset_index()
    else:
        out = long_df.copy()
    out['is_cup'] = out['is_cup'].eq(True).astype('uint8')
    return out


train_x_pivot = _finalize_long_frame(train_x_pivot)
test_x_pivot = _finalize_long_frame(test_x_pivot)

In [None]:
# Inspect the first 20 rows of the long-format training table.
train_x_pivot.head(20)

In [None]:
# Inspect the first 20 rows of the long-format test table.
test_x_pivot.head(20)

In [None]:
def add_features_II(df):
    """Add per-`id` aggregate features to a long-format frame, in place.

    New columns, in insertion order: running sums of the goal differences,
    group means of the ratings, a logistic expectation on the gap between
    those means, group means of the result codes, and the result codes
    shifted by -1 and -2 rows within each `id` group.

    `shift(-k)` takes the value found k rows further down in the same group.
    NOTE(review): whether that row is an older or a newer match depends on how
    the caller ordered the rows; confirm before relying on the "lag" naming.

    The frame is modified in place and also returned.
    """
    def per_match(column):
        # Fresh group-by on every call, exactly one per derived column.
        return df.groupby('id')[column]

    running_sums = (
        ('home_team_history_DIFF_goal_csum', 'home_team_history_DIFF_goal'),
        ('away_team_history_DIFF_goal_csum', 'away_team_history_DIFF_goal'),
    )
    for target, source in running_sums:
        df[target] = per_match(source).cumsum()

    rating_means = (
        ('home_team_hist_rat_mean', 'home_team_history_rating'),
        ('away_team_hist_rat_mean', 'away_team_history_rating'),
    )
    for target, source in rating_means:
        df[target] = per_match(source).transform('mean')

    rating_gap = df['away_team_hist_rat_mean'] - df['home_team_hist_rat_mean']
    df['home_away_rat_elo'] = 1 / (1 + 10 ** (rating_gap / 10))

    result_means = (
        ('home_team_result_mean', 'home_team_result'),
        ('away_team_result_mean', 'away_team_result'),
    )
    for target, source in result_means:
        df[target] = per_match(source).transform('mean')

    shifted_results = (
        ('home_team_result_lag1', 'home_team_result', -1),
        ('away_team_result_lag1', 'away_team_result', -1),
        ('home_team_result_lag2', 'home_team_result', -2),
        ('away_team_result_lag2', 'away_team_result', -2),
    )
    for target, source, periods in shifted_results:
        df[target] = per_match(source).shift(periods)

    return df

In [None]:
# train_x_pivot.head(20)
# test_x_pivot.head(20)

In [None]:
INV = True


def _rows_grouped_by_match(wide, long):
    """Sort `long` by descending `time`, then join it onto `wide['id']`.

    The join keeps the id order of `wide` and lists each id's rows in the
    descending-time order just established. `id` and `time` are dropped
    afterwards, leaving feature columns only.
    """
    by_time_desc = long.sort_values(by=['time'], ascending=False)
    joined = pd.merge(wide['id'], by_time_desc, on="id")
    return joined.drop(['id', 'time'], axis = 1)


if INV:
    train_x_pivot = _rows_grouped_by_match(train_x, train_x_pivot)
    test_x_pivot = _rows_grouped_by_match(test_x, test_x_pivot)

In [None]:
fill_median = True     # replace NaNs with per-column medians before scaling
mask_missing = False   # alternative: replace NaNs with MASK after scaling (disabled)

# Explicit float arrays: np.isnan / np.nanmedian need a numeric dtype, and a
# frame mixing bool and float columns would otherwise convert to object.
x_train = train_x_pivot.to_numpy(dtype=float)
x_test = test_x_pivot.to_numpy(dtype=float)

if fill_median:
    # Medians come from the training rows only and are reused for the test
    # rows, so both sets are imputed with the same statistics (matching the
    # scaler below, which is also fitted on train only).
    train_medians = np.nanmedian(x_train, axis=0)
    x_train = np.where(np.isnan(x_train), train_medians, x_train)
    x_test = np.where(np.isnan(x_test), train_medians, x_test)

RS = RobustScaler()
x_train = RS.fit_transform(x_train)
x_test = RS.transform(x_test)

# Fold the flat rows into (samples, T_HIST, features).
# NOTE(review): this assumes every id contributes exactly T_HIST consecutive
# rows in the pivot frames; confirm against the preceding reorder cell.
n_features = x_train.shape[-1]
x_train = x_train.reshape(-1, T_HIST, n_features)
x_test = x_test.reshape(-1, T_HIST, n_features)

if mask_missing:
    x_train = np.nan_to_num(x_train, nan=MASK)
    x_test = np.nan_to_num(x_test, nan=MASK)


In [None]:
# Confirm the 3-D shapes produced by the reshape.
print(f"Train array shape: {x_train.shape} \nTest array shape: {x_test.shape}")

In [None]:

# Map the target labels to integer codes, then to one-hot rows.
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(train_y)
dummy_y = to_categorical(encoded_y)
for labels in (encoded_y, dummy_y):
    print(labels.shape)

In [None]:
# Show the first ten integer codes next to their one-hot rows.
print(encoded_y[:10,])
print(dummy_y[:10,])

In [None]:
# Expose only GPU 0 to this process.
# NOTE(review): TensorFlow is imported (and seeded) in earlier cells; this
# variable only takes effect if it is set before TensorFlow initialises its
# GPU devices. Confirm, or move the assignment ahead of the TensorFlow import.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Count the listed device ids rather than the commas, so a trailing or doubled
# comma is not mistaken for a second device.
visible_devices = [
    device for device in os.environ["CUDA_VISIBLE_DEVICES"].split(',') if device.strip()
]
if len(visible_devices) > 1:
    gpu_strategy = tf.distribute.MirroredStrategy()
    print('multiple strategy')
else:
    gpu_strategy = tf.distribute.get_strategy()
    print('single strategy')

In [None]:
def model_1():
    """Build the recurrent classifier: two stacked bidirectional LSTM layers.

    Input shape is taken from the module-level `x_train` (everything after the
    sample axis). Each recurrent layer returns full sequences and is followed
    by 50% dropout; the sequence output is flattened and fed to a softmax
    layer with `CLASS` units.

    `layers.LSTM` replaces the legacy `tf.compat.v1.keras.layers.CuDNNLSTM`,
    which only runs on a GPU. With its default arguments `layers.LSTM` uses
    the cuDNN kernel when a GPU is available and falls back to the generic
    implementation otherwise.

    Returns
    -------
    tensorflow.keras.Model
        The uncompiled model.
    """
    x_input = layers.Input(shape=x_train.shape[1:])
    x = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(x_input)
    x = layers.Dropout(0.5)(x)
    x = layers.Bidirectional(layers.LSTM(8, return_sequences=True))(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Flatten()(x)
    output = layers.Dense(CLASS, activation='softmax')(x)
    model = Model(inputs=[x_input], outputs=[output])

    return model

In [None]:
CONV_WIDTH = 3


def model_2():
    """Build the convolutional classifier.

    Layers, applied in order to an input shaped like one `x_train` sample:
    Conv1D (256 filters, kernel width `CONV_WIDTH`, ReLU), Dense (16 units,
    ReLU), Flatten, Dropout (0.5), Dense softmax with `CLASS` units.

    Returns
    -------
    tensorflow.keras.Model
        The uncompiled model.
    """
    inputs = layers.Input(shape=x_train.shape[1:])
    stack = [
        layers.Conv1D(256, activation='relu', kernel_size=CONV_WIDTH),
        layers.Dense(16, activation='relu'),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(CLASS, activation='softmax'),
    ]
    tensor = inputs
    for layer in stack:
        tensor = layer(tensor)

    return Model(inputs=[inputs], outputs=[tensor])

In [None]:
# Instantiate the convolutional model once to print its layer summary.
model = model_2()
model.summary()

In [None]:
# Write a diagram of the model (with tensor shapes and layer names) to a PNG file.
plot_model(
    model,
    to_file='Football_Prob_Model.png',
    show_shapes=True,
    show_layer_names=True
)

In [None]:
EPOCH = 200
BATCH_SIZE = 512
N_SPLITS = 5
SEED = 123
VERBOSE = 1
PATIENCE = EPOCH // 10   # early-stopping patience, in epochs

test_preds = []   # one (n_test, CLASS) probability array per fold
oof_scores = []   # validation log-loss of each fold

with gpu_strategy.scope():
    kf = KFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(x_train, dummy_y)):
        print('-'*15, '>', f'Fold {fold+1}/{N_SPLITS}', '<', '-'*15)
        X_train, X_valid = x_train[train_idx], x_train[valid_idx]
        Y_train, Y_valid = dummy_y[train_idx], dummy_y[valid_idx]

        # A fresh model per fold, so no weights carry over between folds.
        model = model_2()
        model.compile(optimizer="adam", loss="categorical_crossentropy",
                      metrics=["accuracy"])
        # Stop once val_loss has not improved for PATIENCE epochs and roll back
        # to the best weights; halve the learning rate after 10 stale epochs.
        es = EarlyStopping(monitor='val_loss', patience=PATIENCE, verbose=0, mode='min',
                           restore_best_weights=True)
        lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=10, verbose=0)
        model.fit(X_train, Y_train,
                  validation_data=(X_valid, Y_valid),
                  epochs=EPOCH,
                  verbose=VERBOSE,
                  batch_size=BATCH_SIZE,
                  callbacks=[lr, es])

        # Predictions stay 2-D (no squeeze), so a single-row batch keeps its
        # (1, CLASS) shape.
        y_pred = model.predict(X_valid, batch_size=BATCH_SIZE)
        score1 = log_loss(Y_valid, y_pred)
        oof_scores.append(score1)
        print(f"Fold-{fold+1} | OOF LogLoss Score: {score1}")
        test_preds.append(model.predict(x_test, batch_size=BATCH_SIZE))

# Overall cross-validation estimate across the folds.
print(f"Mean OOF LogLoss over {len(oof_scores)} folds: "
      f"{np.mean(oof_scores):.5f} (std {np.std(oof_scores):.5f})")

In [None]:
# Average the fold predictions over the folds that were actually collected.
predictions = np.mean(test_preds, axis=0)
assert len(predictions) == len(test), "one prediction row per test row expected"

# Softmax columns follow the label encoder's class order, so the labels are
# taken from the encoder instead of being typed out by hand.
submission = pd.DataFrame(predictions, columns=list(encoder.classes_))

round_num = False
if round_num:
    # After rounding, recompute `draw` so each row still sums to 1.
    submission = submission.round(2)
    submission['draw'] = 1 - (submission['home'] + submission['away'])

# Positional assignment: prediction rows follow the row order of `test`,
# independent of either frame's index labels.
submission['id'] = test['id'].to_numpy()
submission[['id', 'home', 'away', 'draw']].to_csv('submission.csv', index=False)

In [None]:
# Preview the columns written to submission.csv.
submission[['id', 'home', 'away', 'draw']].head()