In [1]:
import pandas as pd
import numpy as np
import json
# plotting
import matplotlib.pyplot as plt
# statistical fitting of models
import statsmodels.api as sm
import statsmodels.formula.api as smf
#opening data
import os
import pathlib
import warnings
#used for plots
from scipy import stats
from mplsoccer import PyPizza, FontManager

pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

In [2]:
# Specify the path to the folder containing the CSV file
folder_path = '/Users/pratikhotchandani/Downloads/Github/DataScience_Football/WyscoutData'

In [4]:
england_event_file = "/Users/pratikhotchandani/Downloads/Github/DataScience_Football/WyscoutData/events/events_England.json"
with open(england_event_file) as f:
    data = json.load(f)
    df_england_events = pd.DataFrame(data)


In [5]:
df_england_events

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175
...,...,...,...,...,...,...,...,...,...,...,...,...
643145,5,Ball out of the field,[],0,"[{'y': 32, 'x': 0}, {'y': 100, 'x': 100}]",2500098,Interruption,1623,2H,2796.732525,50,251596409
643146,3,Corner,"[{'id': 302}, {'id': 801}, {'id': 1801}]",70965,"[{'y': 100, 'x': 100}, {'y': 47, 'x': 88}]",2500098,Free Kick,1633,2H,2829.821084,30,251596232
643147,1,Air duel,"[{'id': 701}, {'id': 1802}]",7919,"[{'y': 53, 'x': 12}, {'y': 50, 'x': 14}]",2500098,Duel,1623,2H,2831.211419,10,251596410
643148,1,Air duel,"[{'id': 703}, {'id': 1801}]",8005,"[{'y': 47, 'x': 88}, {'y': 50, 'x': 86}]",2500098,Duel,1633,2H,2832.434399,10,251596234


### Here we are creating a Player Radar plot. 
#### We will have all the important characteristics that a scout looks for in a player
#### the position that we are targeting is striker / winger

### 1. Xg

#### There are 3 types of shots, headers, non-headers, and penatlies
#### for penalties, we will have Xg = 0.8 for premiere league, this is alrady calculated and set globally. 
#### For headers and non-header goals, we will calculate Xg, using logit probability function

In [10]:
### function to caluclate Xg

# df is dataframe and npxg is boolean value which tells us if we have take penalty goals in consideration or not
def calculatexG(df, npxg):
    shots = df.loc[df["eventName"] == "Shot"].copy()
    shots["X"] = shots.positions.apply(lambda cell: (100 - cell[0]['x']) * 105/100)
    shots["Y"] = shots.positions.apply(lambda cell: cell[0]['y'] * 68/100)
    shots["C"] = shots.positions.apply(lambda cell: abs(cell[0]['y'] - 50) * 68/100)
    
    ## calculate distance and angle
    shots["Distance"] = np.sqrt(shots["X"]**2 + shots["C"]**2)
    shots["Angle"] = np.where(np.arctan(7.32 * shots["X"] / (shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)) > 0, np.arctan(7.32 * shots["X"] /(shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)), np.arctan(7.32 * shots["X"] /(shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)) + np.pi)
    
    ## finding all the shots that turned into goals
    shots["Goal"] = shots.tags.apply(lambda x: 1 if {'id':101} in x else 0).astype(object)
    
    ##finding all header goals
    header = shots.loc[shots.apply(lambda x:{'id':403} in x.tags, axis=1)]
    
    ##finding all non-header goals
    non_header = shots.drop(header.index)
    
    ## creating logistic regression model
    ### model of headers
    headers_model = smf.glm(formula = "Goal ~ Distance + Angle",data = header,
                           family = sm.families.Binomial()).fit()
    ### model of non-headers
    nonheaders_model = smf.glm(formula = "Goal ~ Distance + Angle",data = non_header,
                           family = sm.families.Binomial()).fit()
    
    ### calculating xG
    #### for headers
    b_head = headers_model.params
    xG = 1 / (1 + np.exp(b_head[0] + b_head[1]*header['Distance'] + b_head[2]*header['Angle']))
    header = header.assign(xG = xG)
    
    #### for non headers
    b_nhead = nonheaders_model.params
    xG = 1 / (1 + np.exp(b_nhead[0] + b_nhead[1]*non_header['Distance'] + b_nhead[2]*non_header['Angle']))
    non_header = non_header.assign(xG = xG)
    
    ## now we will calculate total xG
    ### adding if condition which for penalty goals
    
    if npxg == False:
        ## find penalties
        penalies = df.loc[df["subEventName"] == "Penalty"]
        ## assigning xG = 0.8
        penalties = penalties.assign(xG = 0.8)
        ## now group and sum all the xG
        all_shot_xg = pd.concat([non_header[["playerId","xG"]], header[["playerId","xG"]], penalties[["playerId","xG"]]])
        xG_sum = all_shots_xg.groupby(["playerID"])["npxG"].sum().sort_values(ascending = False).reset_index()
        
    else:
        all_shots_xg = pd.concat([non_header[["playerId", "xG"]], header[["playerId", "xG"]]])
        all_shots_xg.rename(columns = {"xG": "npxG"}, inplace = True)
        xG_sum = all_shots_xg.groupby(["playerId"])["npxG"].sum().sort_values(ascending = False).reset_index()
    
    return xG_sum

npxg = calculatexG(df_england_events,npxg=True)


In [11]:
npxg.head()

Unnamed: 0,playerId,npxG
0,8717,22.01418
1,120353,17.215819
2,11066,14.144484
3,7905,13.364998
4,8325,12.783379


## 2. Calculating passes and reception of passes in final third

In [13]:
## df["playerId"].shift(-1) retrieves the "playerId" column from the DataFrame and applies the shift() 
## function with a shift value of -1. The shift() function shifts the values in the column by the 
## specified number of periods. In this case, it shifts the values one position backward (-1), 
## effectively assigning the value of the next row's "playerId" to each row in the new "nextPlayerId" column.

In [16]:
def finalThird(df):
    df = df.copy()
    df["next_player_id"] = df["player_id"].shift(-1)
    passes = df.loc[df[]]

IndentationError: expected an indented block (321263369.py, line 2)

In [15]:
df_england_events

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175
...,...,...,...,...,...,...,...,...,...,...,...,...
643145,5,Ball out of the field,[],0,"[{'y': 32, 'x': 0}, {'y': 100, 'x': 100}]",2500098,Interruption,1623,2H,2796.732525,50,251596409
643146,3,Corner,"[{'id': 302}, {'id': 801}, {'id': 1801}]",70965,"[{'y': 100, 'x': 100}, {'y': 47, 'x': 88}]",2500098,Free Kick,1633,2H,2829.821084,30,251596232
643147,1,Air duel,"[{'id': 701}, {'id': 1802}]",7919,"[{'y': 53, 'x': 12}, {'y': 50, 'x': 14}]",2500098,Duel,1623,2H,2831.211419,10,251596410
643148,1,Air duel,"[{'id': 703}, {'id': 1801}]",8005,"[{'y': 47, 'x': 88}, {'y': 50, 'x': 86}]",2500098,Duel,1633,2H,2832.434399,10,251596234
