In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import json
from math import *

import statsmodels.api as sms

# Read the JSON file and transform it into a DataFrame

In [94]:
# Load the raw game file; its 'events' column is iterated below to build the frame-level table.
data = pd.read_json('../raw_data/0021500492.json')

In [95]:
# Flatten the tracking data into one row per usable moment (frame).
#
# Each moment is read positionally: moment[0] is stored as the quarter and
# moment[5] is the list of tracked entities. Entry 0 is stored as the ball,
# entries 1-5 as the home players and entries 6-10 as the away players.
# NOTE(review): that home/away ordering is taken on trust from the data, exactly
# as before -- confirm against the team ids if they are available.
d = {key: [] for key in ('action', 'frame', 'quarter', 'ball_x', 'ball_y', 'ball_z')}

# Same column order as before: for each slot i, the home x/y/jersey/name columns
# come first, followed by the away ones.
for i in range(1, 6):
    for side in ('h', 'a'):
        for suffix in ('x', 'y', '_jersey', '_name'):
            d[f'player_{side}{i}{suffix}'] = []

for l in range(len(data['events'])):
    event = data['events'].iloc[l]
    # Index each roster by player id once per event. Each team gets its own
    # lookup, so rosters of different lengths are handled correctly (the previous
    # nested loop indexed the visitor roster with the home roster's positions).
    rosters = {
        'h': {player['playerid']: player for player in event['home']['players']},
        'a': {player['playerid']: player for player in event['visitor']['players']},
    }
    for moment, snapshot in enumerate(event['moments']):
        tracked = snapshot[5]
        # keep only the moments with all the information about the 10 players and the ball
        if len(tracked) != 11:
            continue
        d['action'].append(l)
        d['frame'].append(moment)
        d['quarter'].append(snapshot[0])
        d['ball_x'].append(tracked[0][2])
        d['ball_y'].append(tracked[0][3])
        d['ball_z'].append(tracked[0][4])
        for i in range(1, 6):
            for side, entity in (('h', tracked[i]), ('a', tracked[i + 5])):
                d[f'player_{side}{i}x'].append(entity[2])
                d[f'player_{side}{i}y'].append(entity[3])
                # Always append exactly one jersey/name per frame so every column
                # keeps the same length; None marks an id missing from the roster.
                info = rosters[side].get(entity[1])
                d[f'player_{side}{i}_jersey'].append(info['jersey'] if info is not None else None)
                d[f'player_{side}{i}_name'].append(
                    info['firstname'] + ' ' + info['lastname'] if info is not None else None
                )

df = pd.DataFrame(d)

# Determine which team is attacking

In [96]:
# Knowing which team attacks makes it possible to find the ball carrier and the closest defender, and to compute the right probabilities at the right time

# designate the team of the player who is closest to the ball
def attack_team(dataframe_row):
    """Return which team owns the player closest to the ball in one frame.

    Parameters
    ----------
    dataframe_row : mapping-like (e.g. a DataFrame row)
        Must provide 'ball_x', 'ball_y' and, for i in 1..5, the keys
        'player_h{i}x', 'player_h{i}y', 'player_a{i}x' and 'player_a{i}y'.

    Returns
    -------
    int
        0 if the closest player belongs to the home team, 1 if he belongs to
        the away team. On an exact tie the first player in home-then-away
        order wins, so the home team is returned.
    """
    ball_x, ball_y = dataframe_row['ball_x'], dataframe_row['ball_y']

    def squared_distance(side, slot):
        # Squared distance is enough to rank players; no square root needed.
        dx = dataframe_row[f'player_{side}{slot}x'] - ball_x
        dy = dataframe_row[f'player_{side}{slot}y'] - ball_y
        return dx * dx + dy * dy

    # Positions 0-4 are the home players, positions 5-9 the away players.
    distance_to_ball = [squared_distance(side, slot) for side in ('h', 'a') for slot in range(1, 6)]
    closest = distance_to_ball.index(min(distance_to_ball))
    return 0 if closest < 5 else 1

# correct the previous feature so that a short-lived change of the closest team
# (lasting at most `min_frames` frames) is not counted as a change of possession
def correct_attack_team(dataframe, min_frames=20):
    """Smooth short-lived flips of the 'attack_team' column, in place.

    Every frame where 'attack_team' differs from the previous frame is a
    change point. When the next change point is at most `min_frames` frames
    after the current one, all frames from the current change point up to and
    including the next one are overwritten with the value of the frame just
    before the current change point. Overwrites are applied in order, so a
    later correction sees the values written by earlier ones.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with an 'attack_team' column; it is modified in place and no
        helper column is left behind. Rows are addressed by position, which
        equals the label for a default RangeIndex.
    min_frames : int, default 20
        Maximum distance between two change points for the segment to be
        overwritten. The default is the threshold the code has always used.

    Returns
    -------
    None
    """
    team = dataframe['attack_team'].to_numpy(copy=True)
    # The first row has no predecessor, so its diff is NaN and it counts as a
    # change point (NaN != 0), exactly as before.
    changed = dataframe['attack_team'].diff().ne(0).to_numpy()
    change_positions = [position for position, flag in enumerate(changed) if flag]
    for start, following in zip(change_positions, change_positions[1:]):
        if start == 0:
            # No earlier frame to copy the value from; leave the segment as is.
            continue
        if following <= start + min_frames:
            team[start:following + 1] = team[start - 1]
    # Single whole-column write: no chained indexing, and it targets the
    # argument rather than a global frame.
    dataframe['attack_team'] = team

# label every frame with the team whose player is closest to the ball, then
# let correct_attack_team clean up the short-lived changes
df["attack_team"] = df.apply(attack_team, axis=1)
correct_attack_team(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Feature engineering

In [97]:
# pick the ball carrier: the player of the attacking team who is closest to the ball
def ball_carrier(dataframe_row):
    """Return the slot (1 to 5) of the attacking team's player nearest to the ball.

    The attacking team is read from 'attack_team' (0 = home, anything else =
    away). Distances are compared squared; on a tie the lowest slot wins.
    """
    side = 'h' if dataframe_row['attack_team'] == 0 else 'a'
    ball_x = dataframe_row['ball_x']
    ball_y = dataframe_row['ball_y']

    squared_gaps = []
    for slot in range(1, 6):
        dx = dataframe_row['player_' + side + str(slot) + 'x'] - ball_x
        dy = dataframe_row['player_' + side + str(slot) + 'y'] - ball_y
        squared_gaps.append(dx * dx + dy * dy)

    return squared_gaps.index(min(squared_gaps)) + 1


# correct the ball carrier so that a change of closest player that lasts at most
# `min_frames` frames is not counted as a change of ball carrier
def correct_change_carrier(dataframe, min_frames=5):
    """Smooth short-lived changes of the 'ball_carrier' column, in place.

    Every frame where 'ball_carrier' differs from the previous frame is a
    change point. When the next change point is at most `min_frames` frames
    after the current one, all frames from the current change point up to and
    including the next one are overwritten with the value of the frame just
    before the current change point. Overwrites are applied in order, so a
    later correction sees the values written by earlier ones.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a 'ball_carrier' column; it is modified in place. Rows are
        addressed by position, which equals the label for a default
        RangeIndex.
    min_frames : int, default 5
        Maximum distance between two change points for the segment to be
        overwritten.

    Returns
    -------
    None
    """
    # Kept for backward compatibility: this helper column (the diff of the
    # uncorrected values) has always been left on the frame by this function.
    dataframe['diff_carry'] = dataframe['ball_carrier'].diff()

    carrier = dataframe['ball_carrier'].to_numpy(copy=True)
    # The first row's diff is NaN, and NaN != 0, so position 0 is a change point.
    changed = dataframe['diff_carry'].ne(0).to_numpy()
    change_positions = [position for position, flag in enumerate(changed) if flag]
    for start, following in zip(change_positions, change_positions[1:]):
        if start == 0:
            # No earlier frame to copy the value from; leave the segment as is.
            continue
        if following <= start + min_frames:
            carrier[start:following + 1] = carrier[start - 1]
    # Single whole-column write instead of chained indexing, which triggers
    # SettingWithCopyWarning and is not guaranteed to modify the frame.
    dataframe['ball_carrier'] = carrier

# distance between the ball carrier and the basket his team is attacking
def distance_from_basket(dataframe_row):
    """Return the Euclidean distance from the ball carrier to the attacked basket.

    The two baskets are placed at (6.0, 25.0) and (94.0, 25.0). While
    'quarter' is at most 2 the home team attacks the first one and the away
    team the second; otherwise the sides are swapped. The ball carrier is the
    player of the attacking team ('attack_team' == 0 means home) whose slot is
    given by 'ball_carrier'.
    """
    left_basket = (6.0, 25.0)
    right_basket = (94.0, 25.0)

    home_attacks = dataframe_row['attack_team'] == 0
    first_half = dataframe_row['quarter'] <= 2
    # home shoots at the left basket in the first half and at the right one
    # afterwards; the away team always shoots at the other basket
    if home_attacks:
        target = left_basket if first_half else right_basket
        prefix = 'player_h'
    else:
        target = right_basket if first_half else left_basket
        prefix = 'player_a'

    slot = str(int(dataframe_row['ball_carrier']))
    dx = dataframe_row[prefix + slot + 'x'] - target[0]
    dy = dataframe_row[prefix + slot + 'y'] - target[1]
    return sqrt(dx * dx + dy * dy)

# find the defender closest to the ball carrier and his distance to the ball carrier
def closest_def(dataframe_row):
    """Return (distance, slot) of the defender nearest to the ball carrier.

    The ball carrier is the player of the attacking team ('attack_team' == 0
    means home) in the slot given by 'ball_carrier'; the defenders are the
    five players of the other team. `distance` is the Euclidean distance and
    `slot` is between 1 and 5; on a tie the lowest slot wins.
    """
    if dataframe_row['attack_team'] == 0:
        attack_prefix, defence_prefix = 'player_h', 'player_a'
    else:
        attack_prefix, defence_prefix = 'player_a', 'player_h'

    carrier_slot = str(int(dataframe_row['ball_carrier']))
    carrier_x = dataframe_row[attack_prefix + carrier_slot + 'x']
    carrier_y = dataframe_row[attack_prefix + carrier_slot + 'y']

    squared_gaps = []
    for slot in range(1, 6):
        dx = dataframe_row[defence_prefix + str(slot) + 'x'] - carrier_x
        dy = dataframe_row[defence_prefix + str(slot) + 'y'] - carrier_y
        squared_gaps.append(dx * dx + dy * dy)

    nearest = min(squared_gaps)
    return sqrt(nearest), squared_gaps.index(nearest) + 1
   
# add these features to the dataset
df["ball_carrier"] = df.apply(ball_carrier, axis=1)
correct_change_carrier(df)
df["dist_from_basket"] = df.apply(distance_from_basket, axis=1)
# closest_def returns a (distance, defender slot) tuple: evaluate it once per
# row and split the tuples into two columns, instead of running the row-wise
# apply twice to extract each half separately.
closest_def_results = pd.DataFrame(
    df.apply(closest_def, axis=1).tolist(),
    index=df.index,
    columns=["dist_to_closest_def", "closest_def"],
)
df["closest_def"] = closest_def_results["closest_def"]
df["dist_to_closest_def"] = closest_def_results["dist_to_closest_def"]

# label each frame where a pass occurs: the ball carrier changes while both the
# action and the attacking team stay the same as in the previous frame
df = df.assign(
    diff_carry=df['ball_carrier'].diff(),
    diff_action=df['action'].diff(),
    diff_attack_team=df['attack_team'].diff(),
)
carrier_changed = df['diff_carry'].ne(0)
same_action = df['diff_action'].eq(0)
same_attack_team = df['diff_attack_team'].eq(0)
df['pass'] = (carrier_changed & same_action & same_attack_team).astype(int)
# the diff columns were only needed to build the label
df = df.drop(columns=['diff_carry', 'diff_action', 'diff_attack_team'])

# first remove every play (action) that contains at least one frame where
# dist_to_closest_def == 0 or dist_from_basket == 0, to avoid problems with the log
actions_with_zero_def_distance = df.loc[df['dist_to_closest_def'] == 0, 'action'].unique()
df = df[~df['action'].isin(actions_with_zero_def_distance)]
# the second filter is evaluated on the already-filtered frame
actions_with_zero_basket_distance = df.loc[df['dist_from_basket'] == 0, 'action'].unique()
df = df[~df['action'].isin(actions_with_zero_basket_distance)]

# derive the squared (scaled by 1/100) and log versions of both distances,
# then renumber the rows from 0
df = df.assign(**{
    "dist_from_basket^2/100": df["dist_from_basket"].map(lambda dist: dist**2/100),
    "log_dist_from_basket": df["dist_from_basket"].map(log),
    "dist_to_closest_def^2/100": df["dist_to_closest_def"].map(lambda dist: dist**2/100),
    "log_dist_to_closest_def": df["dist_to_closest_def"].map(log),
}).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Shooting success probability computation

In [98]:
# compute the shooting success probability for every frame:
# --> the probability that the ball carrier scores if he shoots
# NOTE(review): sms.load presumably unpickles the .pkl file -- only load model
# files that come from a trusted source.
res = sms.load('model/logit.pkl')
features = [
    'dist_from_basket^2/100',
    'dist_to_closest_def',
    'dist_from_basket',
    'dist_to_closest_def^2/100',
    'log_dist_from_basket',
]
model_features = features  # both names refer to the same list
X = df[model_features]

df['shoot_prob'] = res.predict(X)

In [99]:
# Save the frame-level table, including the engineered features and 'shoot_prob'.
# The row index is written as well (to_csv's default).
df.to_csv('../modified_data/gamedataset.csv')

In [100]:
# Summary statistics of the saved table, as a quick sanity check of the value ranges.
df.describe()

Unnamed: 0,action,frame,quarter,ball_x,ball_y,ball_z,player_h1x,player_h1y,player_a1x,player_a1y,...,ball_carrier,dist_from_basket,closest_def,dist_to_closest_def,pass,dist_from_basket^2/100,log_dist_from_basket,dist_to_closest_def^2/100,log_dist_to_closest_def,shoot_prob
count,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,...,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0,186801.0
mean,220.802067,291.871912,2.522942,47.358557,25.399769,4.579437,45.715028,23.742144,45.766857,23.932206,...,2.885547,43.474839,2.965846,9.68469,0.014176,25.687018,3.5485,1.691277,1.933145,0.2873972
std,133.899671,198.432891,1.122793,28.147167,11.19508,3.072163,29.058637,11.464594,29.063667,11.082611,...,1.315695,26.050793,1.362216,8.679568,0.118214,26.721039,0.745377,3.601584,0.841124,0.1349756
min,0.0,0.0,1.0,-5.66251,-2.78478,0.00345,-4.86054,-2.00512,-3.38922,-0.76101,...,1.0,0.092744,1.0,0.053747,0.0,8.6e-05,-2.377907,2.9e-05,-2.923466,4.702835e-10
25%,112.0,127.0,2.0,23.62897,18.04965,2.70125,19.157,15.84932,17.79436,15.71989,...,2.0,23.98832,2.0,3.891288,0.0,5.754395,3.177567,0.151421,1.35874,0.2017825
50%,220.0,266.0,3.0,46.81328,25.21094,3.8047,40.8287,23.77476,44.44084,23.64702,...,3.0,34.136783,3.0,7.051042,0.0,11.6532,3.530375,0.497172,1.953175,0.2864452
75%,339.0,428.0,4.0,72.11175,33.16772,5.42627,74.04552,32.01333,73.69493,32.15061,...,4.0,68.944125,4.0,12.004671,0.0,47.532923,4.233296,1.441121,2.485296,0.3670535
max,450.0,1149.0,4.0,97.1689,53.07359,18.07355,97.39797,52.40578,96.73629,51.82644,...,5.0,97.773283,5.0,68.121733,1.0,95.596149,4.582651,46.405705,4.221296,0.7453354
