In [40]:
import pandas as pd
import math
from IPython.display import display, HTML

In [120]:
def duration(row):
    """Return how long the event in ``row`` lasted.

    The result is ``row.time_end - row.time_start``, in whatever unit those
    two attributes are stored in.
    """
    finished = row.time_end
    started = row.time_start
    elapsed = finished - started
    return elapsed

def team(player_id):
    """Return the team number for ``player_id``.

    Ids 1 through 14 belong to team 1; every other value (including ``None``)
    is reported as team 2.
    """
    first_team_ids = range(1, 15)
    return 1 if player_id in first_team_ids else 2
    
def is_turnover(new_features):
    """Return True when the sender and receiver are on different teams.

    ``new_features`` must already contain the ``'sender_team'`` and
    ``'receiver_team'`` entries.
    """
    passing_side = new_features['sender_team']
    receiving_side = new_features['receiver_team']
    return passing_side != receiving_side

def distance(x1, y1, x2, y2):
    """Return the Euclidean distance between ``(x1, y1)`` and ``(x2, y2)``."""
    delta_x = x2 - x1
    delta_y = y2 - y1
    squared_length = delta_x ** 2 + delta_y ** 2
    return squared_length ** .5

def player_distance(player_id_1, player_id_2, row):
    """Return the Euclidean distance between two players in ``row``.

    Positions are read from the ``x_<id>`` / ``y_<id>`` entries of ``row``.
    """
    first, second = str(player_id_1), str(player_id_2)
    return distance(
        row['x_' + first],
        row['y_' + first],
        row['x_' + second],
        row['y_' + second],
    )
    
def x_distance(x1, x2):
    """Return the absolute separation between ``x1`` and ``x2`` along x."""
    gap = x2 - x1
    return abs(gap)

def player_x_distance(player_id_1, player_id_2, row):
    """Return the absolute x separation between two players in ``row``.

    Only the ``x_<id>`` entries of ``row`` are consulted.
    """
    first_x = row['x_' + str(player_id_1)]
    second_x = row['x_' + str(player_id_2)]
    return x_distance(first_x, second_x)

# (0, 0): center of the pitch;
# (-5250, -3400): top-left corner of the pitch;
# (-5250, 3400): bottom-left corner of the pitch;
# (5250, -3400): top-right corner of the pitch;
# (5250, 3400): bottom-right corner of the pitch.
def goal_distance(player_id, goal_direction, row):
    """Return a player's Euclidean distance to the targeted goal point.

    The goal point is ``(5250, 0)`` when ``goal_direction`` equals 1 and
    ``(-5250, 0)`` for any other value. The player's position is read from
    the ``x_<id>`` / ``y_<id>`` entries of ``row``.
    """
    suffix = str(player_id)
    goal_x = 5250 if goal_direction == 1 else -5250
    return distance(row['x_' + suffix], row['y_' + suffix], goal_x, 0)

def goal_x_distance(player_id, goal_direction, row):
    """Return a player's absolute x separation from the targeted goal line.

    The goal line is at ``x = 5250`` when ``goal_direction`` equals 1 and at
    ``x = -5250`` for any other value. The player's x position is read from
    the ``x_<id>`` entry of ``row``.
    """
    goal_x = 5250 if goal_direction == 1 else -5250
    return x_distance(row['x_' + str(player_id)], goal_x)

# add features that don't depend on other rows
def add_independent_features(feature_names, row):
    """Compute the features of one pass that depend only on its own row.

    Args:
        feature_names: Names used to pre-populate the result with ``None``
            placeholders; this fixes the order of the returned entries.
        row: One record providing ``sender_id``, ``receiver_id``,
            ``time_start``, ``time_end`` and the ``x_<i>`` / ``y_<i>``
            positions of players 1-28. A player whose ``x_<i>`` is NaN is
            not counted as being on the field.

    Returns:
        A ``pandas.Series`` keyed by feature name. Team/attacker/defender
        averages and ``sender_avg_defender_dist`` are NaN when the relevant
        group has no on-field players. ``min_x`` / ``max_x`` (and their ids
        and teams) are ``None`` only when nobody is on the field.
    """
    # create dictionary of feature names and placeholder values
    new_features = dict.fromkeys(feature_names)

    sender_id = int(row.sender_id)
    receiver_id = int(row.receiver_id)
    nan = float('nan')

    # sender
    new_features['sender_x'] = row['x_' + str(sender_id)]
    new_features['sender_y'] = row['y_' + str(sender_id)]
    new_features['sender_team'] = team(row.sender_id)

    # ex post facto
    new_features['receiver_team'] = team(row.receiver_id)
    new_features['is_turnover'] = is_turnover(new_features)
    new_features['duration'] = duration(row)
    new_features['distance'] = player_distance(sender_id, receiver_id, row)

    # team spacing stats, accumulated per team number
    team_numbers = (1, 2)
    onfield_ids = {number: [] for number in team_numbers}
    sum_x = dict.fromkeys(team_numbers, 0)
    sum_y = dict.fromkeys(team_numbers, 0)
    # Start the extremes at None rather than 0 so they are always the position
    # of a real player, even when everyone stands on one side of x = 0.
    min_x = min_x_id = None
    max_x = max_x_id = None
    for i in range(1, 29):
        x = row['x_' + str(i)]
        y = row['y_' + str(i)]
        if math.isnan(x):
            continue
        # positions are truncated to whole units before aggregation
        x = int(x)
        y = int(y)
        side = team(i)
        onfield_ids[side].append(i)
        sum_x[side] += x
        sum_y[side] += y
        if min_x is None or x < min_x:
            min_x, min_x_id = x, i
        if max_x is None or x > max_x:
            max_x, max_x_id = x, i

    onfield = {number: len(onfield_ids[number]) for number in team_numbers}
    # An empty team has no meaningful centroid; report NaN instead of raising
    # ZeroDivisionError.
    avg_x = {number: sum_x[number] / onfield[number] if onfield[number] else nan
             for number in team_numbers}
    avg_y = {number: sum_y[number] / onfield[number] if onfield[number] else nan
             for number in team_numbers}

    for number in team_numbers:
        prefix = 'team' + str(number)
        new_features[prefix + '_onfield'] = onfield[number]
        new_features[prefix + '_sum_x'] = sum_x[number]
        new_features[prefix + '_sum_y'] = sum_y[number]
        new_features[prefix + '_avg_x'] = avg_x[number]
        new_features[prefix + '_avg_y'] = avg_y[number]
    new_features['min_x'] = min_x
    new_features['min_x_id'] = min_x_id
    new_features['min_x_team'] = team(min_x_id) if min_x_id is not None else None
    new_features['max_x'] = max_x
    new_features['max_x_id'] = max_x_id
    new_features['max_x_team'] = team(max_x_id) if max_x_id is not None else None

    # The sender's team attacks; when its centroid has the larger x, the
    # attack is taken to run toward negative x.
    sender_team = new_features['sender_team']
    defending_team = 1 if sender_team == 2 else 2
    goal_direction = -1 if avg_x[sender_team] > avg_x[defending_team] else 1
    new_features['goal_direction'] = goal_direction

    # team spacing more specific
    for role, number in (('attacker', sender_team), ('defender', defending_team)):
        new_features[role + 's'] = onfield[number]
        new_features[role + '_sum_x'] = sum_x[number]
        new_features[role + '_sum_y'] = sum_y[number]
        new_features[role + '_avg_x'] = avg_x[number]
        new_features[role + '_avg_y'] = avg_y[number]
    defenders = onfield[defending_team]
    defender_ids = onfield_ids[defending_team]

    # simple geometric
    for i in range(1, 29):
        new_features['sender_dist_to_' + str(i)] = player_distance(sender_id, i, row)
        new_features['sender_x_dist_to_' + str(i)] = player_x_distance(sender_id, i, row)
        new_features['goal_dist_to_' + str(i)] = goal_distance(i, goal_direction, row)
        new_features['goal_x_dist_to_' + str(i)] = goal_x_distance(i, goal_direction, row)

    # sender continued
    new_features['sender_dist_to_goal'] = new_features['goal_dist_to_' + str(sender_id)]
    new_features['sender_x_dist_to_goal'] = new_features['goal_x_dist_to_' + str(sender_id)]

    # attacker/defender
    sender_sum_defender_dist = sum(
        new_features['sender_dist_to_' + str(i)] for i in defender_ids
    )
    new_features['sender_sum_defender_dist'] = sender_sum_defender_dist
    new_features['sender_avg_defender_dist'] = (
        sender_sum_defender_dist / defenders if defenders else nan
    )

    return pd.Series(new_features)

In [126]:
# Load the pass records; the first line of the file holds the column names.
data = pd.read_csv('passes.csv', sep = ',',header = 0)
# Keep only the first four rows. Take an explicit copy so the column
# assignment below writes to an independent frame instead of a slice of the
# loaded data (which would trigger pandas' SettingWithCopyWarning).
data = data[0:4].copy()

# start with features to help algorithm understand sender
new_feature_names = ['sender_x', 'sender_y', 'sender_team']

# add ex post facto features - features that cannot be used for prediction but help us understand the historical data
new_feature_names.extend(['receiver_team', 'is_turnover', 'duration', 'distance'])

# add team spacing statistics for inferring goal direction
new_feature_names.extend(['team1_onfield', 'team1_sum_x', 'team1_sum_y', 'team1_avg_x', 'team1_avg_y'])
new_feature_names.extend(['team2_onfield', 'team2_sum_x', 'team2_sum_y', 'team2_avg_x', 'team2_avg_y'])
new_feature_names.extend(['min_x', 'min_x_id', 'min_x_team', 'max_x', 'max_x_id', 'max_x_team'])

new_feature_names.append('goal_direction')

# reinterpret team spacing in soccer domain
new_feature_names.extend(['attackers', 'attacker_sum_x', 'attacker_sum_y', 'attacker_avg_x', 'attacker_avg_y'])
new_feature_names.extend(['defenders', 'defender_sum_x', 'defender_sum_y', 'defender_avg_x', 'defender_avg_y'])

# add simple geometric features - distance and horizontal (upfield/downfield) distance to other players and goal
for i in range(1, 29):
    player = str(i)
    new_feature_names.extend([
        'sender_dist_to_' + player,
        'sender_x_dist_to_' + player,
        'goal_dist_to_' + player,
        'goal_x_dist_to_' + player,
    ])

# add features to help algorithm understand sender
new_feature_names.extend(['sender_dist_to_goal', 'sender_x_dist_to_goal'])

# add attacker/defender spacing features
new_feature_names.extend(['sender_sum_defender_dist', 'sender_avg_defender_dist'])

# Compute every feature row by row and attach the results as new columns.
data[new_feature_names] = data.apply(lambda row: add_independent_features(new_feature_names, row), axis = 1)

# Show a few of the derived columns as an HTML table for a quick sanity check.
display(HTML(data[['sender_x', 'attacker_avg_x', 'defender_avg_x', 'sender_avg_defender_dist']].to_html()))
# TODO: write data

Unnamed: 0,sender_x,attacker_avg_x,defender_avg_x,sender_avg_defender_dist
0,100.0,-1276.363636,1210.0,1707.698232
1,4140.0,2764.545455,990.909091,3646.790444
2,-160.0,-146.363636,-1434.545455,2281.281578
3,-710.0,-1429.090909,-134.545455,1616.539374
