# Pressing events dataset creation

Here we automate the whole process: finding the pressing polygons, measuring the intensity of pressing, and recording all the pressing events for every match.

A Jupyter notebook is useful to test all the preprocessing functions on a small sample of data (1 match). For every match and every ball possession sequence we detect all the pressing polygons and record all the players participating in pressing actions.

The computation over the whole SkillCorner dataset takes approx. 5 hours.

## Utility function

We test the utility functions in this notebook first, then we move them to an external script.

In [1]:
def preprocess_match_data(data_match):
    """
    Build the id-conversion tables for one match.

    Input:
        skill corner match data (api/match endpoint)

    Output:
        trackable2team: dictionary mapping each trackable object to its team id
            (converted to the wyscout id when a conversion exists, 'ball' for the ball)
        player_ids: dictionary mapping each trackable object to str(wyscout_id)
            of the player ('ball' for the ball)
        team_ids: dictionary with team_id conversion, from skillcorner to wyscout;
            it also holds the keys 'home team' / 'away team' and the ball object

    One request per team is sent to the skillcorner teams endpoint.
    """
    ball_obj = data_match['ball']['trackable_object']
    home_id = data_match['home_team']['id']
    away_id = data_match['away_team']['id']

    # per-object lookups, with the ball registered under the pseudo team/player 'ball'
    raw_team = {p['trackable_object']: p['team_id'] for p in data_match['players']}
    raw_team[ball_obj] = 'ball'
    player_ids = {p['trackable_object']: str(p['wyscout_id']) for p in data_match['players']}
    player_ids[ball_obj] = 'ball'

    # skillcorner team id -> wyscout team id, one API call per team
    team_url = "https://skillcorner.com/api/teams/%s?token=1dae30768331827a3bc4&matching=wyscout"
    team_ids = {}
    for sc_id in (home_id, away_id):
        team_ids[sc_id] = requests.get(team_url % sc_id).json()['wyscout_id']

    # aliases useful for ball possession conversion
    team_ids['home team'] = team_ids[home_id]
    team_ids['away team'] = team_ids[away_id]
    team_ids[ball_obj] = 'ball'

    # team id conversion; values without a conversion are kept unchanged
    trackable2team = {obj: team_ids.get(team, team) for obj, team in raw_team.items()}

    return trackable2team, player_ids, team_ids
  
    
def preprocess_tracking_data(data,trackable2team,player_ids,team_ids):
    """
    create list of frame data from skillcorner tracking data

    Input:
    data: parsed tracking lines (dicts with 'data', 'timestamp', 'period', 'possession')
    trackable2team: trackable object -> team
    player_ids: trackable object -> player id
    team_ids: possession group -> team

    Output:
    one dict per tracked object and line, with format:
    {
    'player':,
    'x':,
    'y':,
    'team':,
    'time': timestamp converted to seconds,
    'type':'position',
    'period' : ,
    'is_possession_team': True/False, or None when possession is unknown,
    'is_possession_player' : True/False, or None when possession is unknown
    }

    Lines with no objects or with a None timestamp are skipped.
    A KeyError is raised when an object is missing from the lookup tables.
    """
    def to_seconds(timestamp):
        # "H:MM:SS.ff" -> seconds; a missing fractional part counts as zero
        parts = timestamp.split(":")
        sec_parts = parts[2].split(".")
        fraction = sec_parts[1] if len(sec_parts) > 1 else "0"
        return (int(parts[0])*3600 + int(parts[1])*60 + int(sec_parts[0])
                + float("0." + fraction))

    frames = []
    for line in data:
        if not line['data'] or line['timestamp'] is None:
            continue

        # parsed once per line instead of once per tracked object
        seconds = to_seconds(line['timestamp'])
        period = line['period']

        possession = line['possession']
        if possession['group'] is not None:
            possess_team = team_ids[possession['group']]
            possess_player = possession['trackable_object']
        else:
            possess_team = None
            possess_player = None
        possession_known = possess_team is not None

        for obj in line['data']:
            tracked = obj['trackable_object']
            team = trackable2team[tracked]
            frames.append({'player': player_ids[tracked],
                           'x': obj['x'],
                           'y': obj['y'],
                           'team': team,
                           'time': seconds,
                           'type': 'position',
                           'period': period,
                           'is_possession_team': team == possess_team if possession_known else None,
                           'is_possession_player': tracked == possess_player if possession_known else None})

    return frames

def json_line(resp):

    """
    skillcorner tracking data is provided as strange jsonline format

    Input:
    resp: object whose `text` attribute holds one JSON document per line

    Output:
    list with the parsed document of every line that is valid JSON;
    blank or malformed lines are skipped
    """
    import json
    data = []
    for line in resp.text.split("\n"):
        try:
            data.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError is a ValueError: not a JSON line, ignore it
            continue

    return data

def preprocess_possession_sequences(frames):
    """
    split tracking frames into sequences of frames where the same team is in possession

    Frames whose 'is_possession_team' is None are discarded; the others are
    sorted by 'time'. A new sequence starts at the first frame that belongs to
    the team in possession and whose team differs from the current owner.

    Input:
    list of frames

    Output:
    list of sequences of frames (empty list when no frame has possession data);
    the sequence still open when the frames run out is included
    """
    poss_frames = sorted((x for x in frames if x['is_possession_team'] is not None),
                         key = lambda x : x['time'])
    if not poss_frames:
        return []

    sequences = []
    # NOTE(review): the initial owner is the team of the first sorted frame,
    # whichever object that frame describes - confirm this is intended
    actual_possess = poss_frames[0]['team']

    poss = [poss_frames[0]]
    for f in poss_frames[1:]:
        if f['is_possession_team'] and f['team'] != actual_possess:
            sequences.append(poss)
            poss = [f]
            actual_possess = f['team']
        else:
            poss.append(f)

    # the last possession has no following change of owner: store it explicitly
    sequences.append(poss)

    return sequences


def create_pressing_dataset(sequences, data_match,team_ids):
    """
    dataset creation for pressing analysis

    For every timestamp of every possession sequence in which a player controls
    the ball, the (up to) 4 players of each side closest to the ball are selected,
    the convex hull of each group is built and one record is emitted.

    Input: list of sequences of frames, skillcorner match_data, team_ids conversion

    Output:
    list of json-serialisable dicts in the format:
    'possession_id': index of the sequence inside `sequences`,
    'time' : timestamp of the frame,
    'duration' : last minus first timestamp of the sequence,
    'team' : team of the closest players not in possession,
    'owner' : team of the closest players in possession,
    'match' : data_match['wyscout_id'],
    'shape_intersection' : area shared by the two hulls divided by the possession hull area,
    'dispossession_distance' : (last timestamp - time) / duration,
    'goal_distance' : absolute x distance between the ball and goal_x,
    'ball_goal_speed' : ball-to-goal distance now minus the same distance 3 timestamps later,
    'pressing_polygon' : exterior coordinates of the pressing hull,
    'possession_polygon': exterior coordinates of the possession hull,
    'pressing_players' : the selected opponents except the closest one,
    'closest_to_ball' : the selected opponent closest to the ball,
    'ball_position' : (x, y) of the ball,
    'own_goal_position' : [goal_x, 0]

    Timestamps are skipped when the ball is missing now or 3 timestamps later,
    when nobody controls the ball, or when one of the two hulls cannot be built
    (fewer than 3 players or a degenerate layout).
    """

    from collections import defaultdict

    from scipy.spatial import ConvexHull,distance
    from shapely.geometry import Polygon

    home = team_ids[data_match['home_team']['id']]
    directions = data_match['home_team_side']

    def ball_xy(frame):
        # position of the first ball entry of the frame, None when there is no ball
        return next(((x['x'], x['y']) for x in frame if x['team'] == 'ball'), None)

    def closest_players(frame, ball_position, in_possession):
        # up to 4 players of the requested side, nearest to the ball first
        players = [(x['x'], x['y'], x['team'], x['player']) for x in frame
                   if bool(x['is_possession_team']) == in_possession and x['team'] != 'ball']
        players.sort(key = lambda x: distance.euclidean(ball_position, (x[0], x[1])))
        return players[:4]

    def hull_of(players):
        # hull vertices as player tuples, None when no 2-D hull exists
        if len(players) < 3:
            return None
        try:
            vertices = ConvexHull([(x[0], x[1]) for x in players]).vertices
        except RuntimeError:
            # scipy's QhullError (raised for degenerate input such as collinear
            # points) derives from RuntimeError
            return None
        return [players[v] for v in vertices]

    #we record: pressing and possession shapes intersection areas and dispossession time
    dataset = []

    for p,seq in enumerate(sequences):
        # group the frames by timestamp once instead of rescanning the sequence
        frames_by_time = defaultdict(list)
        for x in seq:
            frames_by_time[x['time']].append(x)
        seq_times = sorted(frames_by_time)

        # every record needs the ball position 3 timestamps ahead
        if len(seq_times) < 4:
            continue
        last_time = seq_times[-1]
        duration = last_time - seq_times[0]

        for time, next_time in zip(seq_times, seq_times[3:]):
            frame = frames_by_time[time]

            try:
                ball_position = ball_xy(frame)
                next_ball_position = ball_xy(frames_by_time[next_time])
                period = frame[0]['period']
            except KeyError:
                # malformed frame: skip this timestamp, keep the rest of the match
                continue
            if ball_position is None or next_ball_position is None:
                continue

            #check if there is a player controlling the ball, otherwise pressing is not measurable
            if not any(x['is_possession_player'] for x in frame):
                continue

            closest_opp = closest_players(frame, ball_position, False)
            closest_owners = closest_players(frame, ball_position, True)

            pressing_polygon = hull_of(closest_opp)
            poss_polygon = hull_of(closest_owners)
            if pressing_polygon is None or poss_polygon is None:
                continue

            press_shape = Polygon([x[:2] for x in pressing_polygon])
            poss_shape = Polygon([x[:2] for x in poss_polygon])
            press_team = closest_opp[0][2]
            owner = closest_owners[0][2]

            #game direction of the home team in this period decides the sign of goal_x
            # NOTE(review): goal_x is stored under 'own_goal_position'; confirm which
            # goal (defended or attacked by the owner) it is meant to be
            home_dir = directions[int(period)-1]
            if home_dir == 'left_to_right':
                goal_x = 52 if owner==home else -52
            else:
                goal_x = -52 if owner==home else 52

            dataset.append({
                'possession_id':p,
                'time' : time,
                'duration' : duration,
                'team' : press_team,
                'owner': owner,
                'match' : data_match['wyscout_id'],
                'shape_intersection' : press_shape.intersection(poss_shape).area / poss_shape.area,
                'dispossession_distance' : (last_time - time)/duration,
                'goal_distance' : abs(goal_x - ball_position[0]),
                'ball_goal_speed' :  distance.euclidean(ball_position,[goal_x,0]) -distance.euclidean(next_ball_position,[goal_x,0]),
                'pressing_polygon' : list(press_shape.exterior.coords),
                'possession_polygon': list(poss_shape.exterior.coords),
                'pressing_players' : [x[3] for x in closest_opp][1:],
                'closest_to_ball' : [x[3] for x in closest_opp][0],
                'ball_position' : ball_position,
                'own_goal_position': [goal_x,0]
            })

    return dataset

## preprocessing loop

Iteration over all the matches, to finally aggregate the dataset covering the whole set of available games

In [2]:
# get matches

import requests


# walk the paginated match list and keep the id of every match
matches_list = []

data = requests.get("https://skillcorner.com/api/matches/?token=1dae30768331827a3bc4&competition=5&matching=wyscout").json()
while data is not None:
    matches_list.extend(match['id'] for match in data['results'])
    # 'next' holds the url of the following page, None on the last one
    next_batch = data['next']
    data = None if next_batch is None else requests.get(next_batch).json()


print ("Matches available:",len(matches_list))


Matches available: 381


In [8]:
from tqdm import tqdm
import json

pressing_data = []

# one json document per line; the context manager flushes and closes the file
# even when the run is interrupted
with open("pressing_dataset_1.json","w") as out:

    for match in tqdm(matches_list[0:200]):
        try:
            ## check match quality according to skillcorner statements
            resp = requests.get("https://skillcorner.com/api/match/%s/data_collection?token=1dae30768331827a3bc4&matching=wyscout"%match,
                             )
            quality = resp.json()
            if quality['tracking_extra_players_quality_index']<4:
                continue

            resp = requests.get("https://skillcorner.com/api/match/%s?token=1dae30768331827a3bc4&matching=wyscout"%match,
                             )
            data_match = resp.json()

            trackable2team,player_ids,team_ids = preprocess_match_data(data_match)

            resp = requests.get("https://skillcorner.com/api/match/%s/tracking?token=1dae30768331827a3bc4&competition=5&matching=wyscout"%match,)

            data = json_line(resp)

            frames = preprocess_tracking_data(data,trackable2team,player_ids,team_ids)
            sequences = preprocess_possession_sequences(frames)
            dataset = create_pressing_dataset(sequences,data_match,team_ids)

            pressing_data+=dataset

            for record in dataset:
                out.write(json.dumps(record) + "\n")
        except Exception as err:
            # best effort: a failing match is reported and skipped, the run goes on;
            # KeyboardInterrupt is not an Exception, so the loop can still be stopped
            tqdm.write("match %s skipped: %r" % (match, err))
            continue
    

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [5:30:28<00:00, 99.14s/it]


In [9]:
# keep one duration per run of consecutive records sharing the same duration;
# an empty pressing_data gives an empty list instead of an IndexError
# NOTE(review): two consecutive possessions with equal duration collapse into one value
durations = [record['duration'] for record in pressing_data]
arr = [value for idx, value in enumerate(durations)
       if idx == 0 or value != durations[idx - 1]]

In [10]:
import h5py

# store the durations in the 'press' dataset of Pressing.h5;
# the context manager closes the file even if writing fails
with h5py.File('Pressing.h5', 'w') as file:
    file.create_dataset('press', data=arr)


In [11]:
# show the list of durations built above as the cell output
arr

[16.6,
 16.099999999999998,
 1.3000000000000043,
 4.399999999999999,
 24.5,
 97.7,
 5.400000000000006,
 11.5,
 3.799999999999983,
 29.899999999999977,
 1.1000000000000227,
 6.400000000000006,
 2.799999999999983,
 77.69999999999999,
 0.9000000000000341,
 2.7000000000000455,
 105.69999999999999,
 9.800000000000011,
 1.8000000000000114,
 55.299999999999955,
 8.5,
 6.300000000000011,
 19.0,
 21.700000000000045,
 2.3999999999999773,
 2.699999999999932,
 60.0,
 39.10000000000002,
 75.20000000000005,
 43.200000000000045,
 7.2999999999999545,
 2.0,
 44.39999999999998,
 4.0,
 27.5,
 10.899999999999977,
 1.7000000000000455,
 4.7999999999999545,
 13.100000000000023,
 10.0,
 0.39999999999997726,
 41.59999999999991,
 14.300000000000068,
 8.199999999999932,
 21.300000000000068,
 17.300000000000068,
 1.3999999999999773,
 27.899999999999977,
 23.299999999999955,
 24.5,
 8.200000000000045,
 35.40000000000009,
 8.400000000000091,
 24.09999999999991,
 77.09999999999991,
 38.90000000000009,
 1.20000000000

# Dataset analysis

We write the dataset to a JSON Lines file so as not to overload memory: every line
of the output file is a JSON document. To read the dataset back we parse the output file line by line and create a pandas dataframe.

## Pressing events

Here we create events related to pressing in the same format as wyscout events

In [None]:
import json
import pandas as pd
from tqdm import tqdm
def _keep_pressing_records(match_records):
    # rolling mean (window 10, at least 1 observation) of shape_intersection inside
    # each possession; only the records whose rolling mean exceeds 0.5 are kept
    pressing_df = pd.DataFrame(match_records)
    pressing_df["shape_rolling"] = pressing_df.groupby(['match','possession_id'])['shape_intersection'].transform(lambda x: x.rolling(10, 1).mean())
    return pressing_df[pressing_df.shape_rolling>0.5].to_dict(orient = "records")


dataset = []    # records of the match currently being read
filtered = []   # records kept so far, all matches
last_match = None
with open("pressing_dataset.json") as f:
    for line in tqdm(f):
        if not line.strip():
            continue

        record = json.loads(line)
        match = record['match']
        if last_match is not None and last_match!=match:
            #new game: filter the finished one and start collecting again
            filtered+=_keep_pressing_records(dataset)
            dataset = []
        dataset.append(record)
        last_match = match

# the last match of the file is not followed by a "new game": filter it here
if dataset:
    filtered+=_keep_pressing_records(dataset)
            


In [None]:
# dataframe with the records collected in `filtered`, one row per record
pressing_df = pd.DataFrame(filtered)


In [None]:
## build pressing events with the same format as wyscout events
## two event types are produced for every record:
##   "ball pressure" for the player stored in 'closest_to_ball'
##   "line pressure" for each player stored in 'pressing_players'

events = []

for _, row in pressing_df.iterrows():

    if not row['shape_rolling']>0.5:
        continue

    ball, goal = row["ball_position"], row["own_goal_position"]

    #translation into wyscout coords
    # NOTE(review): y is folded with abs() and scaled by 100/36 - confirm this
    # is the intended mapping
    ball_x = abs(ball[0]-goal[0])*100/104
    ball_y = abs(ball[1]-goal[1])*100/36

    # ball pressure event first, then one line pressure event per other presser
    pressers = [(row['closest_to_ball'], "ball pressure")]
    pressers.extend((player, "line pressure") for player in row['pressing_players'])

    for player_id, event_name in pressers:
        events.append({'matchId': row['match'],
                       'teamId': row['team'],
                       'playerId': player_id,
                       'eventName': event_name,
                       'positions': [{'x': ball_x,
                                      'y': ball_y}],
                       'x': ball_x, ## only for plotting
                       'y': ball_y, ## only for plotting
                       'eventSec': row['time']})

In [None]:
# one row per pressing event; the intermediate list is deleted afterwards
events_df = pd.DataFrame(events)
del events

In [None]:
# save the events to disk as a JSON list of records
events_df.to_json("pressing_events_dataframe.json",orient = "records")

### utility function to fetch teamId/playerId from wyscout



In [None]:
import requests
from requests.auth import HTTPBasicAuth

# NOTE(review): API credentials are hard-coded in the notebook - consider loading
# them from the environment or a config file kept out of version control.
# This first pair is overwritten by the assignment below and is never used.
token = ("20spk6u-46lafc488-e5qszql-3ef7q42y3j","4DUb1h;m3:q+kMcvoCB6WTW+KH=#LC")

# pair read by get_team_name / get_player_name through the global `token`
token = ('qjpuw3t-yf3bvhgms-8d0gift-bm2fprvs4z','7*.w$Ivko$FpRsbumn0gF(0Kc5yMAI')

def get_data(url, token):
    """
    GET `url` using HTTP basic auth and return the decoded JSON body.

    `token` is a pair whose two items (stripped of surrounding whitespace)
    are passed to HTTPBasicAuth.
    """
    user, password = token[0].strip(), token[1].strip()
    response = requests.get(url, auth=HTTPBasicAuth(user, password))
    return json.loads(response.text)

def get_team_name(teamId):
    """
    Return the "name" field of the wyscout team `teamId`.

    When the response contains 'error', the id itself is returned.
    """
    result = get_data('https://apirest.wyscout.com/v2/teams/%s' %teamId, token)
    # missing data for this team: fall back to the raw id
    return teamId if 'error' in result else result["name"]

def get_player_name(playerId):
    """
    Return the "shortName" field of the wyscout player `playerId`.

    When the response contains 'error', the id itself is returned.
    """
    result = get_data('https://apirest.wyscout.com/v2/players/%s' %playerId, token)
    # missing data for this player: fall back to the raw id
    return playerId if 'error' in result else result["shortName"]


# display name of every team and of every player found in the events,
# one wyscout request per distinct id
team_names = {teamId: get_team_name(teamId)
              for teamId in events_df.teamId.unique()}

player_names = {playerId: get_player_name(playerId)
                for playerId in tqdm(events_df.playerId.unique())}


In [None]:
# attach the display names looked up above to every event
events_df['teamName'] = events_df["teamId"].map(lambda team_id : team_names[team_id])
events_df['playerName'] = events_df["playerId"].map(lambda player_id : player_names[player_id])

## Distribution of pressing events by team

In [None]:
# preview the first rows of the events dataframe
events_df.head()

In [None]:
import plotly.express as px

# per match, team and event type: number of events and mean event position
team_keys = ["matchId","teamName","eventName"]
pressure_by_match = events_df.groupby(team_keys).agg({"playerId":"count", "x":"mean", "y":"mean"})

# box plot of the per-match event counts, one box per team
fig = px.box(pressure_by_match.reset_index(),x = "teamName",y="playerId")
fig.show()

## distribution of pressing events by player

In [None]:
import plotly.express as px

# "ball pressure" events only: count and mean position per match and player
ball_pressure = events_df[events_df.eventName == "ball pressure"]
pressure_by_match = ball_pressure.groupby(["matchId","playerName","eventName"]).agg({"playerId":"count", "x":"mean", "y":"mean"})

# total count per player over all matches (mean of the per-match mean positions)
per_match = pressure_by_match.reset_index()
pressing_sum = per_match.groupby(["playerName","eventName"]).agg({"playerId":"sum", "x":"mean", "y":"mean"}).reset_index()

# distribution of the per-player totals
fig = px.histogram(pressing_sum, x="playerId")
fig.show()

In [None]:
## most pressing players: names of the players whose summed event count
## (column playerId of pressing_sum) is above 2500

most_pressing = pressing_sum[pressing_sum.playerId>2500]['playerName'].unique()


In [None]:
# rows of the players above the 2500 threshold, sorted by ascending event count
pressing_sum[pressing_sum.playerId>2500].sort_values('playerId')

In [None]:
import plotly.express as px

# "ball pressure" events only: count and mean position per match and player
ball_pressure = events_df[events_df.eventName == "ball pressure"]
pressure_by_match = ball_pressure.groupby(["matchId","playerName","eventName"]).agg({"playerId":"count", "x":"mean", "y":"mean"}).reset_index()

# per-match counts of the most pressing players only, one box per player
top_players = pressure_by_match[pressure_by_match.playerName.isin(most_pressing)]
fig = px.box(top_players, x = "playerName", y="playerId")
fig.show()

## pressing heatmap

In [None]:
import base64
import plotly.express as px

with open("../coach2vec/bnpitch.jpg", "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read()).decode()
# data-URI prefix plotly needs when the string is used as an image source;
# the picture is a JPEG file, so it is declared as image/jpeg
encoded_image = "data:image/jpeg;base64," + encoded_string

team = "Roma"

# filled density contours of the event positions of the selected team
fig = px.density_contour(events_df[events_df.teamName==team], x="x", y="y",
                         title = "density of pressing events for %s"%team)
fig.update_traces(contours_coloring="fill", contours_showlabels = True,
                  opacity = 0.4)

# pitch picture used as background: anchored at (0, 0), 100 x 100 in axis units,
# drawn below the traces
fig.update_layout(images=[dict(
                  source= encoded_image,
                  xref= "x",
                  yref= "y",
                  x= 0,
                  y= 0,
                  sizex= 100,
                  sizey= 100,
                  sizing= "stretch",
                  opacity= 0.9,
                  layer= "below")],
                  font=dict(size=11, color="black"),
                  template = "simple_white")
fig.update_xaxes(range=[0, 100])
fig.update_yaxes(range=[0, 100])
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()

In [None]:
import base64
import plotly.express as px

with open("../coach2vec/bnpitch.jpg", "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read()).decode()
# data-URI prefix plotly needs when the string is used as an image source;
# the picture is a JPEG file, so it is declared as image/jpeg
encoded_image = "data:image/jpeg;base64," + encoded_string

player = "D. Kulusevski"

# filled density contours of the event positions of the selected player
fig = px.density_contour(events_df[events_df.playerName==player], x="x", y="y",
                         title = "density of pressing events for %s"%player)
fig.update_traces(contours_coloring="fill", contours_showlabels = True,
                  opacity = 0.4)

# pitch picture used as background: anchored at (0, 0), 100 x 100 in axis units,
# drawn below the traces
fig.update_layout(images=[dict(
                  source= encoded_image,
                  xref= "x",
                  yref= "y",
                  x= 0,
                  y= 0,
                  sizex= 100,
                  sizey= 100,
                  sizing= "stretch",
                  opacity= 0.9,
                  layer= "below")],
                  font=dict(size=11, color="black"),
                  template = "simple_white")
fig.update_xaxes(range=[0, 100])
fig.update_yaxes(range=[0, 100])
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()

In [None]:
s = 0.2

for i in range(0,10):
    s*