### run necessary files

In [None]:
%run ../src/utils_io.py
%run ../src/utils_viz.py
%run ../src/scaling.py
%run ../src/grid_search.py

### import necessary modules

In [None]:
import joblib
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from matplotlib import colors
import dataframe_image as dfi
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### load the data

In [None]:
## Load the pickled train/test prediction objects of each intermediate model.
## Every pair is read in train-then-test order; the objects are used as
## dataframes by the cells below.
## NOTE(review): unpickling can run arbitrary code -- only load files this project produced.

## logistic regression
log_train_df, log_test_df = map(pd.read_pickle, (
    '../input/intermediate_dataset/train_test_data_result/train_preds_log_regg.pkl',
    '../input/intermediate_dataset/train_test_data_result/test_preds_log_regg.pkl',
))

## random forest
rnd_train_df, rnd_test_df = map(pd.read_pickle, (
    '../input/intermediate_dataset/train_test_data_result/train_preds_random_forest.pkl',
    '../input/intermediate_dataset/train_test_data_result/test_preds_random_forest.pkl',
))

## xg boost
xb_train_df, xb_test_df = map(pd.read_pickle, (
    '../input/intermediate_dataset/train_test_data_result/train_preds_xg_boost.pkl',
    '../input/intermediate_dataset/train_test_data_result/test_preds_xg_boost.pkl',
))

### plot the values

In [None]:
## Plot the "shot_statsbomb_xg" column of the train and test frames.
## xG_plot is defined outside this cell; it is called with the frame, the
## column to plot, the plot title and the output file path.
title_1 = "Statsbomb xG value for train-set"
title_2 = "Statsbomb xG value for test-set"

path = "../plots/real_values/"

name_1 = "train.jpg"
name_2 = "test.jpg"

## create the output directory together with any missing parent
## ("../plots" may not exist yet); nothing happens if it is already there
os.makedirs(path, exist_ok=True)

xG_plot(log_train_df, "shot_statsbomb_xg", title_1, path + name_1)
xG_plot(log_test_df, "shot_statsbomb_xg", title_2, path + name_2)

## close every open matplotlib figure so they do not pile up across cells
plt.close("all")

In [None]:
## Plot the train-set prediction column of each intermediate model.
## xG_plot is defined outside this cell; it is called with the frame, the
## column to plot, the plot title and the output file path.
title_1 = "Logistic Regression on train-set"
title_2 = "Random Forest on train-set"
title_3 = "xG Boost on train-set"

path_1 = "../plots/intermediate_model/"

name_1 = "log_train.jpg"
name_2 = "rnd_train.jpg"
name_3 = "xb_train.jpg"

## create the output directory together with any missing parent
## ("../plots" may not exist yet); nothing happens if it is already there
os.makedirs(path_1, exist_ok=True)

xG_plot(log_train_df, "pred_log_regg", title_1, path_1 + name_1)
xG_plot(rnd_train_df, "pred_random_forest", title_2, path_1 + name_2)
xG_plot(xb_train_df, "pred_xg_boost", title_3, path_1 + name_3)

## close every open matplotlib figure so they do not pile up across cells
plt.close("all")

In [None]:
## Plot the test-set prediction column of each intermediate model.
## xG_plot is defined outside this cell; it is called with the frame, the
## column to plot, the plot title and the output file path.
title_1 = "Logistic Regression on test-set"
title_2 = "Random Forest on test-set"
title_3 = "xG Boost on test-set"

path_1 = "../plots/intermediate_model/"

name_1 = "log_test.jpg"
name_2 = "rnd_test.jpg"
name_3 = "xb_test.jpg"

## create the output directory together with any missing parent
## ("../plots" may not exist yet); nothing happens if it is already there
os.makedirs(path_1, exist_ok=True)

xG_plot(log_test_df, "pred_log_regg", title_1, path_1 + name_1)
xG_plot(rnd_test_df, "pred_random_forest", title_2, path_1 + name_2)
xG_plot(xb_test_df, "pred_xg_boost", title_3, path_1 + name_3)

## close every open matplotlib figure so they do not pile up across cells
plt.close("all")

### make dataframe image

In [None]:
## Build one comparison table for the train set and hand it to plot_dataframe.
## make_df and plot_dataframe are defined outside this cell.

## base table: player, target, StatsBomb xG and the logistic-regression prediction
first_df = make_df(
    log_train_df,
    cols=["player_name", "target", "shot_statsbomb_xg", "pred_log_regg"]
)

## from the other two models keep only their prediction column
second_df = make_df(
    rnd_train_df,
    cols=["player_name", "target", "shot_statsbomb_xg", "pred_random_forest"]
)["pred_random_forest"]

third_df = make_df(
    xb_train_df,
    cols=["player_name", "target", "shot_statsbomb_xg", "pred_xg_boost"]
)["pred_xg_boost"]

## column-wise concat aligns rows on the index
## NOTE(review): assumes make_df yields the same index for all three frames -- confirm
main_df = pd.concat([first_df, second_df, third_df], axis=1)

path_1 = "../plots/intermediate_model/"
name_1 = "train_simple.png"

## create the output directory together with any missing parent
## ("../plots" may not exist yet); nothing happens if it is already there
os.makedirs(path_1, exist_ok=True)

plot_dataframe(
    main_df,
    path=path_1 + name_1
)

In [None]:
## Build one comparison table for the test set and hand it to plot_dataframe.
## make_df and plot_dataframe are defined outside this cell.

## base table: player, target, StatsBomb xG and the logistic-regression prediction
first_df = make_df(
    log_test_df,
    cols=["player_name", "target", "shot_statsbomb_xg", "pred_log_regg"]
)

## from the other two models keep only their prediction column
second_df = make_df(
    rnd_test_df,
    cols=["player_name", "target", "shot_statsbomb_xg", "pred_random_forest"]
)["pred_random_forest"]

third_df = make_df(
    xb_test_df,
    cols=["player_name", "target", "shot_statsbomb_xg", "pred_xg_boost"]
)["pred_xg_boost"]

## column-wise concat aligns rows on the index
## NOTE(review): assumes make_df yields the same index for all three frames -- confirm
main_df = pd.concat([first_df, second_df, third_df], axis=1)

path_1 = "../plots/intermediate_model/"
name_1 = "test_simple.png"

## create the output directory together with any missing parent
## ("../plots" may not exist yet); nothing happens if it is already there
os.makedirs(path_1, exist_ok=True)

plot_dataframe(
    main_df,
    path=path_1 + name_1
)

### Individual sample

In [None]:
## Pass `df` to plot_dataframe with the output path random_sample.png.
## NOTE(review): no earlier cell in this notebook assigns `df`; the only visible
## assignment is in a later cell that reads train_df.pkl. Running the notebook
## top to bottom raises NameError here -- confirm which frame is intended.
plot_dataframe(
    df,
    path="../plots/intermediate_model/random_sample.png"
)

### P-value

In [None]:
## Scale the train features, pass them with the target to get_stats and print
## the resulting summary. Scale and get_stats are defined outside this cell.

## training frame (file name suggests the one-hot encoded variant)
train_df = pd.read_pickle("../input/intermediate_dataset/train_test_data_final/train_ohe_final.pkl")

## target as an array, every remaining column as a feature
y_train = train_df["target"].values
x_train = train_df.drop(columns=["target"])

## scale only "angle" and "distance", using the "std" scale type
scale = Scale(df=x_train, scale_type="std", cols=["angle", "distance"])
x_train = scale.fit_transform()

## the object returned by get_stats exposes .summary()
lm = get_stats(x_train, y_train)
print(lm.summary())

## Feature Importance: Random Forest

In [None]:
## Fit a default random forest on the train frame and draw its feature
## importances as a horizontal bar chart.
train_df = pd.read_pickle("../input/intermediate_dataset/train_test_data_final/train_label_final.pkl")

## target column on one side, every other column as a feature on the other
y_train = train_df["target"]
x_train = train_df.drop(columns=["target"])

## feature names, in column order
col_names = x_train.columns

## fit() returns the estimator itself, so build and fit in one step
## NOTE: no random_state is given, so the importances can differ between runs
model = RandomForestClassifier().fit(x_train, y_train)

## sort ascending so the most important feature gets the highest bar position
importances = model.feature_importances_
idsx = importances.argsort()
plt.title("Feature Importance")
plt.barh(range(len(idsx)), importances[idsx], align="center")
plt.yticks(range(len(idsx)), list(col_names[idsx]))
plt.xlabel("Random Forest Feature Importance")
plt.show()

## Feature Importance: XGBoost

In [None]:
## Fit a default XGBClassifier on the train frame and draw its feature
## importances as a horizontal bar chart.
train_df = pd.read_pickle("../input/intermediate_dataset/train_test_data_final/train_label_final.pkl")

## target column on one side, every other column as a feature on the other
y_train = train_df["target"]
x_train = train_df.drop(columns=["target"])

## feature names, in column order
col_names = x_train.columns

## fit() returns the estimator itself, so build and fit in one step
model = XGBClassifier().fit(x_train, y_train)

## sort ascending so the most important feature gets the highest bar position
importances = model.feature_importances_
idsx = importances.argsort()
plt.title("Feature Importance")
plt.barh(range(len(idsx)), importances[idsx], align="center")
plt.yticks(range(len(idsx)), list(col_names[idsx]))
plt.xlabel("xGBoost Feature Importance")
plt.show()

### Log-Reg

In [None]:
## Scale the train features and pass them with the target to tune_log_model.
## Scale and tune_log_model are defined outside this cell.

## training frame (file name suggests the one-hot encoded variant)
train_df = pd.read_pickle("../input/intermediate_dataset/train_test_data_final/train_ohe_final.pkl")

## target as an array, every remaining column as a feature
y_train = train_df["target"].values
x_train = train_df.drop(columns=["target"])

## scale only "angle" and "distance" with the "std" scale type, then keep
## the underlying values of the scaled result
scale = Scale(df=x_train, scale_type="std", cols=["angle", "distance"])
x_train = scale.fit_transform().values

params = tune_log_model(x_train, y_train)

In [None]:
## display `params` (assigned by the tune_log_model cell above when run in order)
params

### Random Forest

In [None]:
## Pass the train features and target, both as arrays, to tune_random_forest
## (defined outside this cell).
train_df = pd.read_pickle("../input/intermediate_dataset/train_test_data_final/train_label_final.pkl")

## target values, and every remaining column as the feature matrix
y_train = train_df["target"].values
x_train = train_df.drop(columns=["target"]).values

params = tune_random_forest(x_train, y_train)

In [None]:
## display `params` (assigned by the tune_random_forest cell above when run in order)
params

### XGBoost

In [None]:
## Pass the train features and target, both as arrays, to tune_xg_boost
## (defined outside this cell).
train_df = pd.read_pickle("../input/intermediate_dataset/train_test_data_final/train_label_final.pkl")

## target values, and every remaining column as the feature matrix
y_train = train_df["target"].values
x_train = train_df.drop(columns=["target"]).values

params = tune_xg_boost(x_train, y_train)

In [None]:
## display `params` (assigned by the tune_xg_boost cell above when run in order)
params

## New: exploring the raw StatsBomb data

In [None]:
## lift the column limit so wide dataframes are displayed with every column
pd.set_option('display.max_columns', None)

In [None]:
## build comp_df from competitions.json via get_competition (defined outside this cell)
comp_df = get_competition(path="../input/Statsbomb/data/competitions.json")

In [None]:
## display the competitions frame
comp_df

In [None]:
## Build match_df for one competition/season pair via get_matches (defined
## outside this cell); the same two ids select the matches JSON file.
comp_id, season_id = 16, 1

match_df = get_matches(
    comp_id,
    season_id,
    path=f"../input/Statsbomb/data/matches/{comp_id}/{season_id}.json",
)

In [None]:
## Build two frames with make_event_df (defined outside this cell), one per
## match id; each call reads the events JSON file named after that id.
event_df = make_event_df(
    match_id=18245,
    path=f"../input/Statsbomb/data/events/{18245}.json"
)

## NOTE(review): despite its name, lineup_df is also produced by make_event_df
## from an events file (match 22912) -- confirm this is intended.
lineup_df = make_event_df(
    match_id=22912,
    path=f"../input/Statsbomb/data/events/{22912}.json"
)

In [None]:
## count how often each pass_type_name occurs among the rows whose type_name is "Pass"
event_df.loc[event_df["type_name"].eq("Pass"), "pass_type_name"].value_counts()

In [None]:
## look up the "pass_cross" value of the row labelled 2528
event_df.loc[
    2528,
    "pass_cross"
]

In [None]:
## load the pickled train_df.pkl object into `df`
## NOTE(review): unpickling can run arbitrary code -- only load files this project produced
df = pd.read_pickle("../input/intermediate_dataset/train_test_data/train_df.pkl")

In [None]:
## display, as a 2-tuple, the distinct values of shot_type_name and assisted_via_type
df["shot_type_name"].unique(), df["assisted_via_type"].unique()

In [None]:
## display the loaded frame
df