# Player Position Imputation
Required installs: PyTorch Geometric (`torch_geometric`)

In [2]:
import os

# The project packages imported below are resolved relative to the repository
# root, so step up one directory first. The guard makes this cell idempotent:
# re-running it no longer climbs one directory further each time.
if not os.path.isdir("ImputationModel"):
    os.chdir("..")

import itertools
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import ImputationModel.feature_engineering as fe
import ImputationModel.model_functions as mf
import UtilFunctions.util_functions as util_functions
import ImputationModel.Models.Time_LSTM_Module as TLSTM_module
import ImputationModel.Models.Time_LSTM as TLSTM
import ImputationModel.Models.GNN as GNN
import ImputationModel.Models.Agent_Imputer as AgentImputer
import ImputationModel.Models.Baselines as BL

# Frames per second (presumably of the tracking data - TODO confirm).
fps = 30.

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  from pandas import MultiIndex, Int64Index


# Section 1: Loading in the data
Step 1: Load in the available games

Step 2: Get event & tracking dataframes for all games

Step 3: Construct dataframe using the feature set described in the paper

Step 4: Sort the DataFrame by event so we get all players for each event in blocks

Step 5: Normalize the data

Step 6: Get data sequences to pass into the model

### Step 1: Load in the available games

In [3]:
# Table of available matches (ids, date, teams and final score).
games_csv_path = 'data/Suwon_FC/Suwon_games.csv'
suwon_match_df = pd.read_csv(games_csv_path)
suwon_match_df.head(3)

Unnamed: 0,id,date,home_id,away_id,home_score,away_score
0,26223,2021-02-27 16:30:00+09:00,4644,4220,1,1
1,26238,2021-03-07 16:30:00+09:00,316,4220,3,0
2,26334,2021-03-10 19:30:00+09:00,4220,328,0,0


### Step 2: Get Event and Tracking dataframes for all games

In [4]:
# Per-game frames are kept in parallel lists: position k holds game k+1.
num_games = 34
events_dfs, tracking_dfs, home_dfs, away_dfs, formation_dfs = [], [], [], [], []

# Running offset so that event ids stay unique across all games.
count = 0

for game_number in range(1, num_games + 1):
    game_frames = util_functions.get_suwon_dataframes(f'game{game_number}')
    events_df, tracking_df, home_df, away_df, formation_df = game_frames

    # Give this game's events the next consecutive block of ids.
    first_event_id = count
    count = first_event_id + len(events_df)
    events_df['id'] = range(first_event_id, count)

    for collection, frame in (
        (events_dfs, events_df),
        (tracking_dfs, tracking_df),
        (home_dfs, home_df),
        (away_dfs, away_df),
        (formation_dfs, formation_df),
    ):
        collection.append(frame)

### Step 3: Construct dataframe using the feature set described in the paper

In [None]:
# Build the model input table and the label array for every game.
#
# Per-game pieces are collected in lists and concatenated once at the end:
# DataFrame.append is deprecated (removed in pandas 2.0), and appending inside
# the loop re-copies all previously accumulated rows on every iteration.
whole_num_input = pd.DataFrame()  # kept for compatibility; not filled in this cell
whole_cat_input = pd.DataFrame()  # kept for compatibility; not filled in this cell

game_inputs = []
# Seed with an empty (0, 2) float array so the result keeps that shape and
# dtype even when no games are processed.
game_labels = [np.empty((0, 2), float)]

for i in range(0, num_games):
    goalkeepers = util_functions.get_goalkeepers(home_dfs[i], away_dfs[i])
    num_input, cat_input, label, tdf, edf, input_data = fe.get_game_data(
        events_dfs[i],
        tracking_dfs[i],
        home_dfs[i],
        away_dfs[i],
        goalkeepers,
        formation_dfs[i],
    )
    game_inputs.append(input_data)
    game_labels.append(label)

# pd.concat raises on an empty list, so fall back to an empty frame.
whole_input = pd.concat(game_inputs) if game_inputs else pd.DataFrame()
whole_label = np.concatenate(game_labels, axis=0)

### Step 4: Sort the DataFrame by event so we get all players for each event in blocks

In [None]:
# Sort inputs and labels together, then renumber the rows from 0 and move the
# labels into a tensor.
sorted_frame, sorted_label_array = fe.custom_sort(whole_input, whole_label)
sorted_whole_input_df = sorted_frame.reset_index(drop=True)
sorted_labels = torch.tensor(sorted_label_array)
sorted_whole_input_df.head(2)

### Optional Step - Load in pre-saved data

In [12]:
# Optional shortcut: uncomment the two lines below to read a previously saved
# sorted input table and its labels from CSV (first 66000 rows of each).
#sorted_whole_input_df = pd.read_csv('sorted_whole_input_df_time.csv')[:66000]
#sorted_labels = torch.tensor(np.array(pd.read_csv('swid_labels.csv')))[:66000]

### Step 5: Normalize the data

In [13]:
# Folds are the row indices of all rows that belong to the listed match ids;
# the fold passed on below is the one held out as test data.
in_fold1 = sorted_whole_input_df['match_id'].isin([26223])
in_fold2 = sorted_whole_input_df['match_id'].isin([26441, 26386, 42921])
fold1 = sorted_whole_input_df.loc[in_fold1].index
fold2 = sorted_whole_input_df.loc[in_fold2].index

"""Folds used in paper: fold1=sorted_whole_input_df[sorted_whole_input_df['match_id'].isin([42926, 42937, 42945])].index
fold2=sorted_whole_input_df[sorted_whole_input_df['match_id'].isin([26441, 26386, 42921])].index
fold3=sorted_whole_input_df[sorted_whole_input_df['match_id'].isin([26423, 26428, 26433])].index
fold4=sorted_whole_input_df[sorted_whole_input_df['match_id'].isin([26223, 26238, 26334])].index
fold5=sorted_whole_input_df[sorted_whole_input_df['match_id'].isin([26248, 26255, 26257])].index"""

# Scale the time gaps into a flat 1-D tensor.
# NOTE(review): the scaler is fit on every row, including the hold-out fold -
# confirm this is intended.
time_scaler = RobustScaler()
raw_time_gaps = np.array(sorted_whole_input_df['time_since_last_pred']).reshape(-1, 1)
timestamps = torch.tensor(time_scaler.fit_transform(raw_time_gaps)).reshape(-1)

# Split the feature table into its numeric and categorical column groups.
numeric_columns = [
    'ballx', 'prev_player_x', 'next_player_x',
    'bally', 'prev_player_y', 'next_player_y',
    'av_player_x', 'av_player_y',
    'time_since_last_pred', 'prev_player_time', 'next_player_time',
]
categorical_columns = ['position', 'event_type', 'team_on_ball', 'player_on_ball', 'goal_diff']
sorted_wi_num = sorted_whole_input_df[numeric_columns]
sorted_wi_cat = sorted_whole_input_df[categorical_columns]

input_data_normalized, label_data_normalized, scaler = fe.preprocess_data(
    sorted_wi_num,
    sorted_wi_cat,
    sorted_labels,
    fold1,
)

### Step 6: Create sequences from the data

In [14]:
# Turn the normalized table into model-ready sequences.
# NOTE(review): the two trailing 2s are passed positionally; their meaning is
# defined by fe.split_sequences - confirm before changing them.
X_ss, y_mm, ts, out_inds = fe.split_sequences(
    sorted_whole_input_df,
    input_data_normalized,
    label_data_normalized,
    timestamps,
    2,
    2,
)

# Section 2: Create Training/Testing Data

In [18]:
# Hold out the rows of fold1 as the test set; everything else is training data.
split = fe.get_train_test_split(sorted_whole_input_df, X_ss, y_mm, ts, fold1)
X_train, X_test, y_train, y_test, X_train_ts, X_test_ts = split

# Wrap each split in a dataset and an ordered (unshuffled) loader.
# NOTE(review): the original comment here describes 22 sequences per event (one
# per player) but 66 is passed, and the recorded run of this cell ended in a
# reshape RuntimeError - confirm the intended value.
train_data = fe.series_data(X_train, y_train, X_train_ts, 66)
test_data = fe.series_data(X_test, y_test, X_test_ts, 66)

loader_options = {'shuffle': False, 'batch_size': 512}
train_loader = DataLoader(train_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

RuntimeError: shape '[899, 22, 5, 60]' is invalid for input of size 5043390

Create edge index for fully connected graph network

In [None]:
# Edge list of a fully connected directed graph over the 22 player nodes:
# every ordered pair [source, target] with source != target (no self-loops).
#
# Built with a filtered comprehension rather than removing items from the list
# while iterating over it, which skips the element after each removal and only
# happened to work because self-loops are never adjacent in the product order.
t1 = list(range(22))
list_edges = [
    [source, target]
    for source, target in itertools.product(t1, t1)
    if source != target
]

# Section 3: Load and Run Models
Model Types:

1. Agent Imputer - AgentImputer.AgentImputer
2. Time-LSTM - TLSTM.Time_LSTM
3. GNN - GNN.GNN

### Step 1: Load Models

In [None]:
# The model's input width is the size of dimension 1 of the first training sample.
feature_count = X_train[0].shape[1]
model = AgentImputer.AgentImputer(input_size=feature_count)

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.002)

### Step 2: Train the model

In [None]:
# Train the model; returns the trained model and its training-set predictions.
# NOTE(review): the trailing 10 is presumably the number of epochs - confirm
# against mf.model_training.
model, y_preds = mf.model_training(
    train_loader,
    X_train_ts,
    model,
    optimizer,
    list_edges,
    10,
)

### Step 3: Make Model Predictions

In [None]:
# Run the trained model on the test loader. The label scaler is passed in,
# presumably so results come back in original units - confirm.
actual_predictions, actual_test_results = mf.get_test_predictions(
    test_loader,
    model,
    y_test,
    list_edges,
    scaler,
)

# Section 4: Model Analysis

Function which calculates and shows the error in predictions for train or test data

In [None]:
def prediction_analytics(actual_predictions, actual_test_results, num_preds):
    """Plot predictions against ground truth and report the mean errors.

    Each metric is computed once and then both printed and returned.

    Parameters
    ----------
    actual_predictions : array-like
        Predicted locations; reshaped to ``(num_preds, 2)`` with x in column 0
        and y in column 1.
    actual_test_results : array-like
        True locations; reshaped to ``(num_preds, 2)`` in the same layout.
    num_preds : int
        Number of (x, y) pairs contained in each input.

    Returns
    -------
    tuple
        ``(mean Euclidean distance, mean absolute x error, mean absolute y error)``.
    """
    preds = np.asarray(actual_predictions).reshape(num_preds, 2)
    actual = np.asarray(actual_test_results).reshape(num_preds, 2)

    plt.plot(preds)
    plt.plot(actual)
    plt.show()

    # Row-wise difference; every metric below is derived from it.
    diff = preds - actual
    dist_error = np.mean(np.linalg.norm(diff, axis=1))
    x_error = np.mean(np.abs(diff[:, 0]))
    y_error = np.mean(np.abs(diff[:, 1]))

    print("Dist: ", dist_error)
    print("X Dist: ", x_error)
    print("Y Dist: ", y_error)
    return dist_error, x_error, y_error

### Step 1: Show accuracy for test data

In [None]:
# Error metrics on the held-out test set (one prediction per row of y_test).
num_test_preds = y_test.shape[0]
print("Test data Results: ")
prediction_analytics(actual_predictions, actual_test_results, num_test_preds)

### Step 2: Show accuracy for train data

In [None]:
# y_preds is a sequence of prediction tensors: convert each to a nested list,
# flatten one level, reshape to one (x, y) row per training label, and undo the
# label scaling.
train_preds = [batch.tolist() for batch in y_preds]
flat_train_preds = np.array(list(itertools.chain.from_iterable(train_preds)))
num_train_preds = y_train.shape[0]
actual_train_preds = scaler.inverse_transform(flat_train_preds.reshape(num_train_preds, 2))

print("Train data Results: ")
prediction_analytics(actual_train_preds, scaler.inverse_transform(y_train), num_train_preds)

# Section 5: Generate and run Baseline Models

### Get data setup

In [None]:
# Train/test inputs and labels for the baseline models, split on fold1.
(
    base_x_train,
    base_x_train_cat,
    base_x_test,
    base_x_test_cat,
    base_y_train,
    base_y_test,
    xg_cat_train,
    xg_cat_test,
) = BL.get_baseline_data(sorted_whole_input_df, sorted_labels, fold1)

### Baseline 1 - Average Seen Location

In [None]:
# Baseline 1 is run once per split and the test-set output is reused for both
# the printed error and the results table (it was previously computed twice).
# NOTE(review): this assumes BL.baseline_1 is deterministic - confirm.
b1_test_error, b1_results = BL.baseline_1(base_x_test.copy(), base_y_test)
b1_train_error = BL.baseline_1(base_x_train.copy(), base_y_train)[0]
print("Baseline 1 distance error: ", b1_test_error)
print("Baseline 1 distance error Train: ", b1_train_error)

# Store the baseline 1 predictions next to the actual locations.
# Duplicate this for the other baselines if you want to save their results too.
b1_results[['act_x','act_y']] = base_y_test
b1_results = b1_results.reset_index(drop=True)

# Per-row Euclidean distance between predicted and actual location, computed
# in one vectorized call instead of a row-by-row .loc loop.
b1_pred_xy = b1_results[['pred_x','pred_y']].to_numpy(dtype=float)
b1_results['dist'] = np.linalg.norm(b1_pred_xy - np.asarray(base_y_test), axis=1)
b1_results

### Baseline 2 - Time-Scaled Average Seen Location

In [None]:
# Time-scaled average seen location, evaluated on the test split and then the
# train split.
# NOTE(review): the error is read from element [1] here, whereas the baseline 1
# cell reads element [0] - confirm against BL.baseline_2's return order.
b2_test_output = BL.baseline_2(base_x_test.copy(), base_y_test)
print("Baseline 2 distance error: ", b2_test_output[1])

b2_train_output = BL.baseline_2(base_x_train.copy(), base_y_train)
print("Baseline 2 distance error train: ", b2_train_output[1])

### Baseline 3 - Average Player Location Over a Game

In [None]:
def _print_baseline_3_errors(preds, actual, suffix=""):
    """Print mean absolute x/y errors and mean Euclidean distance error.

    Parameters
    ----------
    preds : array-like of shape (n, 2)
        Predicted locations, x in column 0 and y in column 1.
    actual : array-like of shape (n, 2)
        True locations in the same layout.
    suffix : str
        Text inserted before the colon of each label (e.g. ``" train"``).
    """
    diff = np.asarray(preds) - np.asarray(actual)
    print(f"X error{suffix}: ", np.mean(np.abs(diff[:, 0])))
    print(f"Y error{suffix}: ", np.mean(np.abs(diff[:, 1])))
    print(f"Distance error{suffix}: ", np.mean(np.linalg.norm(diff, axis=1)))


# Baseline 3 predicts the 'av_player_x' / 'av_player_y' feature columns directly.
b3_preds = np.array(base_x_test[['av_player_x','av_player_y']])
_print_baseline_3_errors(b3_preds, base_y_test)

b3_preds_train = np.array(base_x_train[['av_player_x','av_player_y']])
_print_baseline_3_errors(b3_preds_train, base_y_train, suffix=" train")

### Baseline 4 - XGBoost Baseline

In [None]:
# Run the XGBoost baseline; returns its distance error and its test-set predictions.
xg_results, xg_preds = BL.xg_boost_baseline(
    base_x_train,
    xg_cat_train,
    base_x_test,
    xg_cat_test,
    base_y_train,
    base_y_test,
)
print("XGBoost Regressor Distance Error: ", xg_results)

##### View the XGboost predictions

In [None]:
# Work on a copy: assigning `xgboost_df = base_x_test` only aliases the frame,
# so the prediction and label columns added below would also be written into
# base_x_test and leak into any later re-run of the baseline cells.
xgboost_df = base_x_test.copy()
xgboost_df[['pred_x','pred_y']] = xg_preds
xgboost_df[['act_x','act_y']] = base_y_test

# Euclidean distance between predicted and actual location, vectorized over
# all rows instead of calling math.dist row by row.
xgboost_df['dist'] = np.hypot(
    xgboost_df['pred_x'] - xgboost_df['act_x'],
    xgboost_df['pred_y'] - xgboost_df['act_y'],
)

# Storing the results
To view the results of these models and use them in downstream tasks, save them to a CSV file that contains the predicted and actual player locations alongside the original dataset. The file must be stored where the Experiments notebook can access it.

Example:

In [None]:
xgboost_df.head(2)
# Uncomment to write the XGBoost predictions to CSV for use by the Experiments notebook.
#xgboost_df.to_csv('ModelResults/xgboost_preds_time.csv')