### run necessary files

In [107]:
%run ../src/utils_io.py
%run ../src/utils_viz.py
%run ../src/scaling.py
%run ../src/grid_search.py

### import necessary modules

In [24]:
import joblib
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from matplotlib import colors
import dataframe_image as dfi
import seaborn as sns
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import ensemble

### load the data

In [25]:
## stored prediction frames, loaded as (train split, test split) pairs per model

## logistic regression
log_train_df, log_test_df = (
    pd.read_pickle('../input/basic_dataset/train_test_data_result/train_preds_log_regg.pkl'),
    pd.read_pickle('../input/basic_dataset/train_test_data_result/test_preds_log_regg.pkl'),
)

## random forest
rnd_train_df, rnd_test_df = (
    pd.read_pickle('../input/basic_dataset/train_test_data_result/train_preds_random_forest.pkl'),
    pd.read_pickle('../input/basic_dataset/train_test_data_result/test_preds_random_forest.pkl'),
)

## xg boost
xb_train_df, xb_test_df = (
    pd.read_pickle('../input/basic_dataset/train_test_data_result/train_preds_xg_boost.pkl'),
    pd.read_pickle('../input/basic_dataset/train_test_data_result/test_preds_xg_boost.pkl'),
)

### plot the values

In [36]:
## headings for the reference (StatsBomb) xG figures
title_1, title_2 = (
    "Statsbomb xG value for train-set",
    "Statsbomb xG value for test-set",
)

## one entry per figure: (frame, column to plot, heading, path handed to xG_plot);
## the logistic-regression frames are used here for their "shot_statsbomb_xg" column
plot_specs = [
    (log_train_df, "shot_statsbomb_xg", title_1, "../plots/real_values/train.jpg"),
    (log_test_df, "shot_statsbomb_xg", title_2, "../plots/real_values/test.jpg"),
]

for plot_spec in plot_specs:
    xG_plot(*plot_spec)

## close every matplotlib figure that is still open
plt.close("all")

In [37]:
## headings for the three models, train split
title_1, title_2, title_3 = (
    "Logistic Regression on train-set",
    "Random Forest on train-set",
    "xG Boost on train-set",
)

## one entry per figure: (frame, prediction column, heading, path handed to xG_plot)
plot_specs = [
    (log_train_df, "pred_log_regg", title_1, "../plots/basic_dataset/log_train.jpg"),
    (rnd_train_df, "pred_random_forest", title_2, "../plots/basic_dataset/rnd_train.jpg"),
    (xb_train_df, "pred_xg_boost", title_3, "../plots/basic_dataset/xb_train.jpg"),
]

for plot_spec in plot_specs:
    xG_plot(*plot_spec)

## close every matplotlib figure that is still open
plt.close("all")

In [38]:
## headings for the three models, test split
title_1, title_2, title_3 = (
    "Logistic Regression on test-set",
    "Random Forest on test-set",
    "xG Boost on test-set",
)

## one entry per figure: (frame, prediction column, heading, path handed to xG_plot)
plot_specs = [
    (log_test_df, "pred_log_regg", title_1, "../plots/basic_dataset/log_test.jpg"),
    (rnd_test_df, "pred_random_forest", title_2, "../plots/basic_dataset/rnd_test.jpg"),
    (xb_test_df, "pred_xg_boost", title_3, "../plots/basic_dataset/xb_test.jpg"),
]

for plot_spec in plot_specs:
    xG_plot(*plot_spec)

## close every matplotlib figure that is still open
plt.close("all")

### make dataframe image

In [109]:
## columns requested for every model's table
shared_cols = ["player_name", "target", "shot_statsbomb_xg"]

## logistic regression: the whole table is kept (it supplies the shared columns)
first_df = make_df(log_train_df, cols=shared_cols + ["pred_log_regg"])

## random forest: only its prediction column is kept
second_df = make_df(rnd_train_df, cols=shared_cols + ["pred_random_forest"])
second_df = second_df["pred_random_forest"]

## xg boost: only its prediction column is kept
third_df = make_df(xb_train_df, cols=shared_cols + ["pred_xg_boost"])
third_df = third_df["pred_xg_boost"]

## put the three pieces side by side (pd.concat aligns them on the index)
main_df = pd.concat([first_df, second_df, third_df], axis=1)

## hand the combined table to plot_dataframe together with the target image path
plot_dataframe(main_df, path="../plots/basic_dataset/train_simple.png")

In [110]:
## columns requested for every model's table
shared_cols = ["player_name", "target", "shot_statsbomb_xg"]

## logistic regression: the whole table is kept (it supplies the shared columns)
first_df = make_df(log_test_df, cols=shared_cols + ["pred_log_regg"])

## random forest: only its prediction column is kept
second_df = make_df(rnd_test_df, cols=shared_cols + ["pred_random_forest"])
second_df = second_df["pred_random_forest"]

## xg boost: only its prediction column is kept
third_df = make_df(xb_test_df, cols=shared_cols + ["pred_xg_boost"])
third_df = third_df["pred_xg_boost"]

## put the three pieces side by side (pd.concat aligns them on the index)
main_df = pd.concat([first_df, second_df, third_df], axis=1)

## hand the combined table to plot_dataframe together with the target image path
plot_dataframe(main_df, path="../plots/basic_dataset/test_simple.png")

### Individual shots (random sample per shot category)

In [99]:
## number of shots drawn per category and the seed that makes the draw repeatable;
## without random_state every run of this cell picks different rows, so the
## saved table image could never be reproduced
SAMPLE_SIZE = 5
SAMPLE_SEED = 42


def _sample_shots(mask):
    """Return up to SAMPLE_SIZE randomly drawn rows of log_train_df where mask is True.

    mask: boolean Series aligned with log_train_df.
    Returns a frame with a fresh 0..n-1 index. The draw is seeded with
    SAMPLE_SEED, so repeated runs give the same rows. When fewer than
    SAMPLE_SIZE rows match, all of them are returned (in shuffled order)
    instead of raising ValueError.
    """
    subset = log_train_df.loc[mask]
    return (
        subset
        .sample(n=min(SAMPLE_SIZE, len(subset)), random_state=SAMPLE_SEED)
        .reset_index(drop=True)
    )


is_open_play = log_train_df["shot_type_name"] == "Open Play"

## open-play shots taken with the foot
one = _sample_shots(is_open_play & (log_train_df["body_part"] == "Foot"))

## open-play shots taken with the head
two = _sample_shots(is_open_play & (log_train_df["body_part"] == "Head"))

## shots with any other body part (all shot types)
three = _sample_shots(log_train_df["body_part"] == "Other")

## free kicks (all body parts)
four = _sample_shots(log_train_df["shot_type_name"] == "Free Kick")

## stack the four samples and renumber the rows
df = pd.concat([one, two, three, four], axis=0).reset_index(drop=True)

## keep only the columns shown in the table
df = df.loc[:, ["player_name", 'angle', 'distance', "shot_type_name", "body_part", "target",
               "shot_statsbomb_xg", "pred_log_regg"]]

In [108]:
## hand the sampled-shots table to plot_dataframe together with the target image path
plot_dataframe(df, path="../plots/basic_dataset/random_sample.png")

### P-value

In [111]:
## load the training set (the *_ohe_* file; presumably one-hot encoded categoricals)
train_df = pd.read_pickle("../input/basic_dataset/train_test_data_final/train_ohe_final.pkl")

## target values of the train split
y_train = train_df['target'].values

## features: everything except the reference xG, the player name and the target
x_train = train_df.drop(columns=["shot_statsbomb_xg", "player_name", "target"])

## scale the continuous columns ("std" scaling as implemented by Scale)
scale = Scale(df=x_train, scale_type="std", cols=["angle", "distance", "distance_angle"])
x_train = scale.fit_transform()

## fit summary from get_stats (the printed output is a Logit regression table
## with coefficients and P>|z| values)
summary = get_stats(x_train, y_train)

print(summary)

Optimization terminated successfully.
         Current function value: 0.309483
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                14971
Model:                          Logit   Df Residuals:                    14965
Method:                           MLE   Df Model:                            5
Date:                Sat, 19 Sep 2020   Pseudo R-squ.:                  0.1527
Time:                        17:01:36   Log-Likelihood:                -4633.3
converged:                       True   LL-Null:                       -5468.2
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
x0_Open Play      -1.2389      0.125     -9.890      0.000      -1.484      -0.993
x1_Foot          

### Log-Reg

In [113]:
## load the training set (the *_ohe_* file; presumably one-hot encoded categoricals)
train_df = pd.read_pickle("../input/basic_dataset/train_test_data_final/train_ohe_final.pkl")

## target values of the train split
y_train = train_df['target'].values

## features: everything except the reference xG, the player name and the target
x_train = train_df.drop(columns=["shot_statsbomb_xg", "player_name", "target"])

## scale the continuous columns ("std" scaling as implemented by Scale)
scale = Scale(df=x_train, scale_type="std", cols=["angle", "distance", "distance_angle"])
x_train = scale.fit_transform()

## the tuning helper is given the raw values rather than the scaled frame
x_train = x_train.values

## search the logistic-regression hyper-parameters; the result is kept in `params`
params = tune_log_model(x_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 377 out of 400 | elapsed:    7.6s remaining:    0.5s


ROC-AUC : 0.7759800319893241


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    7.9s finished


In [114]:
## display the value returned by tune_log_model (assigned to `params` in the cell above)
params

{'C': 0.3593813663804626, 'penalty': 'l2', 'solver': 'saga'}

### Random Forest

In [116]:
## load the training set (the *_label_* file; presumably label-encoded categoricals)
train_df = pd.read_pickle("../input/basic_dataset/train_test_data_final/train_label_final.pkl")

## target values of the train split
y_train = train_df['target'].values

## features: everything except the reference xG, the player name and the target,
## passed on as raw values (no scaling is applied in this cell)
x_train = train_df.drop(columns=["shot_statsbomb_xg", "player_name", "target"]).values

## search the random-forest hyper-parameters; the result is kept in `params`
params = tune_random_forest(x_train, y_train)

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1050 out of 1050 | elapsed: 10.0min finished


ROC-AUC : 0.7745215868786487


In [117]:
## display the value returned by tune_random_forest (assigned to `params` in the cell above)
params

{'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_split': 2,
 'n_estimators': 100}

### XGBoost

In [118]:
## load the training set (the *_label_* file; presumably label-encoded categoricals)
train_df = pd.read_pickle("../input/basic_dataset/train_test_data_final/train_label_final.pkl")

## target values of the train split
y_train = train_df['target'].values

## features: everything except the reference xG, the player name and the target,
## passed on as raw values (no scaling is applied in this cell)
x_train = train_df.drop(columns=["shot_statsbomb_xg", "player_name", "target"]).values

## search the xg-boost hyper-parameters; the result is kept in `params`
params = tune_xg_boost(x_train, y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:  1.2min finished


ROC-AUC : 0.775137370519906


In [119]:
## display the value returned by tune_xg_boost (assigned to `params` in the cell above)
params

{'min_child_weight': 5,
 'max_depth': 4,
 'learning_rate': 0.05,
 'gamma': 0,
 'colsample_bytree': 0.7}