In [2]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import matplotlib.patches as pch
import random
import time
import matplotlib.cm as cm
import bokeh

In [3]:
# Home directory of the current user; the data folder paths are built from it.
homedir = os.path.expanduser("~")

In [4]:
# Folder layout of the local open-data checkout (presumably the StatsBomb
# open-data repository -- confirm).
#
# The paths are assembled with os.path.join so the separator matches the host
# OS instead of being hard-coded to the Windows "\\". The trailing "" keeps a
# trailing separator on every folder, because other cells append file names to
# these strings with plain concatenation.
dataFolder = os.path.join(homedir, "Documents", "GitHub", "open-data", "data", "")
matchesFolder = os.path.join(dataFolder, "matches", "")
lineupsFolder = os.path.join(dataFolder, "lineups", "")
eventsFolder = os.path.join(dataFolder, "events", "")

In [5]:
def json_loads(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed object."""
    with open(filepath, encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed

In [6]:
def df_json(filepath):
    """Load a JSON list of records and flatten it into a DataFrame.

    Each record becomes one row. Dict values are flattened up to two levels
    deep, joining the keys with "_" (``{"a": {"b": {"c": 1}}}`` becomes column
    ``a_b_c``); anything nested deeper is stored as-is.

    Records that carry an "index" key additionally get a "time" column:
    the record's timestamp in minutes plus an offset taken from the
    "Half End" timestamps seen so far, selected by the record's "period".
    Only every second stored entry is used for the offset -- presumably
    because each half produces two "Half End" records; confirm against
    the data.

    Parameters
    ----------
    filepath : str
        Path of the JSON file, read through ``json_loads``.

    Returns
    -------
    pandas.DataFrame
        One row per record, in file order.
    """
    records = json_loads(filepath)
    rows = []
    half_ends = [0]

    for record in records:
        row = {}
        for key, value in record.items():
            if not isinstance(value, dict):
                row[key] = value
                continue
            for sub_key, sub_value in value.items():
                if not isinstance(sub_value, dict):
                    row["_".join([key, sub_key])] = sub_value
                    continue
                for leaf_key, leaf_value in sub_value.items():
                    row["_".join([key, sub_key, leaf_key])] = leaf_value

        if "index" in record:
            if record["type"]["name"] == "Half End":
                half_ends.append(ttm(record["timestamp"]))
            elapsed = ttm(record["timestamp"])
            # Offset = sum of the first `period` entries of every-second half end.
            row["time"] = elapsed + sum(half_ends[::2][:row["period"]])
        rows.append(row)

    return pd.DataFrame(rows)

In [7]:
def tts(timestamp):
    """Convert an ``H:M:S`` timestamp string to seconds.

    The first three ":"-separated fields are read as floats and weighted by
    3600, 60 and 1; any further fields are ignored. Fewer than three fields
    raises ``IndexError``.
    """
    fields = timestamp.split(":")
    return sum(float(fields[pos]) * 60 ** (2 - pos) for pos in range(3))

def ttm(timestamp):
    """Convert an ``H:M:S`` timestamp string to minutes (seconds from ``tts`` / 60)."""
    total_seconds = tts(timestamp)
    return total_seconds / 60

In [8]:
# Parse competitions.json once: `comps` keeps the raw records (a later cell
# iterates over them as dicts), `competitions` is the same data as a DataFrame.
comps = json_loads("{}competitions.json".format(dataFolder))
competitions = pd.DataFrame(comps)

In [9]:
# Show the competitions frame built from competitions.json.
competitions

Unnamed: 0,competition_gender,competition_id,competition_name,country_name,match_available,match_updated,season_id,season_name
0,female,37,FA Women's Super League,England,2019-06-23T15:32:29.914,2019-06-23T15:32:29.914,4,2018/2019
1,male,43,FIFA World Cup,International,2019-06-23T12:38:35.142,2019-06-23T12:38:35.142,3,2018
2,male,11,La Liga,Spain,2019-07-29T20:44:30.861,2019-07-29T20:44:30.861,27,2015/2016
3,male,11,La Liga,Spain,2019-07-30T12:42:05.563,2019-07-30T12:42:05.563,26,2014/2015
4,male,11,La Liga,Spain,2019-07-24T19:44:48.866,2019-07-24T19:44:48.866,25,2013/2014
5,male,11,La Liga,Spain,2019-07-29T17:46:18.935,2019-07-29T17:46:18.935,24,2012/2013
6,male,11,La Liga,Spain,2019-07-11T16:55:27.568,2019-07-11T16:55:27.568,23,2011/2012
7,male,11,La Liga,Spain,2019-08-01T17:44:54.870,2019-08-01T17:44:54.870,22,2010/2011
8,male,11,La Liga,Spain,2019-07-11T07:44:14.533,2019-07-11T07:44:14.533,21,2009/2010
9,male,11,La Liga,Spain,2019-07-06T22:42:14.468,2019-07-06T22:42:14.468,41,2008/2009


In [10]:
# Load the match list of every competition/season pair and stack them into a
# single frame. Files live at <matchesFolder>/<competition_id>/<season_id>.json;
# the path is built with os.path.join so the separator matches the host OS.
match_list = []
for comp in comps:
    if comp["competition_name"] != "" and comp["season_name"] != "":
        fp = os.path.join(
            matchesFolder,
            str(comp["competition_id"]),
            "{}.json".format(comp["season_id"]),
        )
        match_list.append(df_json(fp))

# Index labels are deliberately left untouched (no ignore_index): every loaded
# frame keeps its own 0-based index, so one label can match several rows.
# matchsummary's `.loc[index, "match_id"].values[0]` lookup relies on that.
matches = pd.concat(match_list, sort = False)

In [40]:
def matchsummary(index, matchesdf):
    """Summarise the events of one match per team and per player (unfinished).

    The events file ``<match_id>.json`` is looked up in ``eventsFolder``,
    flattened with ``df_json`` and restricted to rows with ``period < 5``.

    Parameters
    ----------
    index
        Label passed to ``matchesdf.loc``. The ``.values[0]`` that follows
        implies the lookup is expected to return several rows (presumably
        because the index labels of ``matchesdf`` repeat -- confirm).
    matchesdf : pandas.DataFrame
        Frame with a "match_id" column.

    Returns
    -------
    dict
        ``dic``, keyed by "team_name" and "player_name". Each value is the
        column-wise concatenation of the value-count tables and the shot
        aggregate built in the first loop.

    NOTE(review): this function depends on names that are not defined in it
    or in any visible cell (``df``, ``subject``), so it only runs when those
    exist in the kernel. Everything after the first loop builds ``DF``, which
    is never returned. See the inline notes.
    """
    matchfilepath = os.path.join(eventsFolder, str(matchesdf.loc[index, "match_id"].values[0]) + ".json") 
    # NOTE(review): `js` is not used anywhere below; the file is parsed a second time by df_json.
    js = json_loads(matchfilepath)
    DataFrame = df_json(matchfilepath)
    # Keep periods 1-4 only (presumably drops the penalty shoot-out period -- confirm).
    DataFrame = DataFrame.loc[DataFrame.period < 5, :]
    # NOTE(review): `df` is not defined in this function; presumably the
    # filtered `DataFrame` above was intended -- confirm. The same applies to
    # every other use of `df` below.
    print([n for n in df.columns if "type_name" in n and "shot" not in n])
    
    dic = {}
    
    # One summary table per grouping column.
    for grouping in ["team_name", "player_name"]:
        df_list = []
        te = df.groupby(grouping)
        # Count the values of every "*type_name" column except the shot ones.
        for category in [n for n in df.columns if "type_name" in n and "shot" not in n]:
            team1 = te[category].value_counts().unstack()
            # NOTE(review): attribute access without a call -- this line has no effect.
            team1.add_prefix
            df_list += [team1]

        # Shot aggregate: summed xG and number of ids per outcome/type.
        aggFunc = {"shot_statsbomb_xg": np.sum, "id": "count"}
        # NOTE(review): grouped by "team_name" even when `grouping` is
        # "player_name" -- possibly `grouping` was intended; confirm.
        team0 = df.groupby(["team_name", "shot_outcome_name", "shot_type_name"]).agg(aggFunc).unstack().unstack()
        # Flatten the multi-level column labels into single "_"-joined strings.
        team0.columns = ["_".join(a) for a in team0.columns.values]
        df_list += [team0]
        
        dic[grouping] = pd.concat(df_list, axis = 1, sort = True)
        
        # NOTE(review): assigned inside the loop body (once per grouping) but only read after it.
        listing = ["shot_", "pass_", "duel_"]
    # NOTE(review): from here on `df_list` and `grouping` still hold the values
    # of the last loop iteration ("player_name"); nothing built below reaches
    # the returned `dic`.
    for x in listing + [""]:
        x_name = x + "type_name"
        df1 = df[x_name].value_counts().unstack().add_prefix(x)

        df_list += [df1]
        if x != "":
            # NOTE(review): `grouping` is a str at this point, so adding a list to it raises TypeError.
            group2 = grouping + [x + "type_name"]
            d2 = DataFrame.groupby(group2)
            df2 = d2[x + "outcome_name"].value_counts().unstack().unstack()
            if x == "shot_":
                df3 = d2.shot_statsbomb_xg.sum().unstack().add_prefix("xg_")
                df_list += [df3]
            df2.columns = ["_".join([x[:-1], y[1], y[0]]) for y in df2.columns.values]
            df_list += [df2]

    for x in ["dribble_outcome", "foul_committed_card", "bad_behaviour_card"]:
        prfx = x.split("_")[0]
        # NOTE(review): what `df.sum().columns` yields depends on what `df` is -- unclear from here.
        if x + "_name" in df.sum().columns:
            df1 = df[x+"_name"].value_counts().unstack().add_prefix(prfx+"_")
            df_list += [df1]
    
    df1 = df.pressure_regains.sum()
    df_list += [df1]
    
    # NOTE(review): `subject` is neither a parameter nor defined in the visible source.
    if subject == "player":
        # Ids of "Pass" events, then xG sum/count of the shots whose
        # shot_key_pass_id is one of those ids, and the count of such shots that were goals.
        df1 = df.apply(lambda x: x[x["type_name"]=="Pass"].id.values)
        df2 = df1.apply(lambda x: DataFrame.loc[DataFrame["shot_key_pass_id"].isin(x), "shot_statsbomb_xg"].agg(["sum","count"]))
        df3 = df1.apply(lambda x: DataFrame.loc[(DataFrame["shot_outcome_name"]=="Goal")&(DataFrame["shot_key_pass_id"].isin(x)), "shot_statsbomb_xg"].count())

        df_list += [df2, df3]

    DF = pd.concat(df_list, join="outer", sort = True, axis= 1)

    DF = DF.rename(columns = {"sum": "xa", "count": "key_pass_total", 0: "assist_total"})
    DF = DF.fillna(0)
    
    # Totals: goals from the shot "Goal" columns plus "Own Goal Against", xG from every column containing "xg".
    goals_cols = [n for n in DF.columns if "Goal" in n and "shot" in n] + ["Own Goal Against"]
    DF.loc[:, "goal_total"] = DF[goals_cols].sum(axis=1)
    xg_cols = [n for n in DF.columns if "xg" in n]
    DF.loc[:, "xg_total"] = DF.loc[:, xg_cols].sum(axis=1)
    DF["shot_total"] = DF["Shot"]
    
    for match in DataFrame.match_id.unique():
        mdf = DataFrame.loc[DataFrame.match_id == match, :]
        DF.loc[match, "minutes_max"] = mdf.time.max()
        x1 = mdf.possession.max()
        # NOTE(review): `X` (possession count and share per team) is computed but never used afterwards.
        X = pd.DataFrame(index = mdf.possession_team_name.unique())
        X["possession_count"] = mdf.groupby("possession_team_name").possession.nunique()
        X["possession_percent"] = X["possession_count"]/x1
    
    DF["matches"] = 1
    
    if subject == "team":
        # Pairs each row with its neighbour (even n -> n+1, odd n -> n-1) and
        # takes element [1] of that index entry; presumably the index is a
        # (match_id, team) MultiIndex with two rows per match -- confirm.
        DF["opponent"] = [DF.index[n + (-1)**n][1] for n in range(len(DF))]
        
    else:
        DF["goal_total"] -= DF["Own Goal Against"]
        DF["minutes_end"] = DF["minutes_max"]
        DF["appearance"] = 1
        DF["start"] = 1
        # Earliest time at which each name appears as a substitution replacement.
        DF["minutes_start"] = (DataFrame.groupby(["match_id", "team_name", "substitution_replacement_name"]).time.min())
        DF = DF.fillna(0)
        # A positive start minute marks the row as not having started.
        DF.loc[DF.minutes_start>0, "start"] = 0
        DF["minutes_played"] = DF["minutes_end"] - DF["minutes_start"]
    
    # "NP_*" columns: totals minus the matching penalty column (presumably "non-penalty" -- confirm).
    for i in ["shot", "goal", "xg"]:
        if i == "goal":
            sj = "shot_Penalty_Goal"
        else:
            sj = i + "_Penalty"
        DF["NP_" + i] = DF[i + "_total"] - DF[sj]
        
    DF.columns = [n.lower() for n in DF.columns]
    # NOTE(review): `DF` is discarded here; only `dic` from the first loop is returned.
    return dic

In [41]:
# Smoke test: summarise the match stored under index label 2. The call is
# skipped when `matches` is empty, in which case `x` is left unassigned.
if len(matches) > 0:
    x = matchsummary(2, matches)

['duel_type_name', 'foul_committed_type_name', 'goalkeeper_type_name', 'pass_type_name', 'type_name']


In [35]:
# Column labels of the per-player table in the dict returned by matchsummary.
# NOTE(review): this cell was executed as In [35], i.e. before the matchsummary
# definition shown above (In [40]); the output below may come from an earlier version.
x["player_name"].columns

Index(['Aerial Lost', 'Tackle', 'Dangerous Play', 'Handball', 'Collected',
       'Goal Conceded', 'Keeper Sweeper', 'Penalty Conceded', 'Shot Faced',
       'Shot Saved', 'Corner', 'Free Kick', 'Goal Kick', 'Interception',
       'Kick Off', 'Recovery', 'Throw-in', '50/50', 'Ball Receipt*',
       'Ball Recovery', 'Block', 'Carry', 'Clearance', 'Dispossessed',
       'Dribble', 'Dribbled Past', 'Duel', 'Foul Committed', 'Foul Won',
       'Goal Keeper', 'Interception', 'Miscontrol', 'Pass', 'Pressure',
       'Shield', 'Shot', 'Substitution', 'shot_statsbomb_xg_Free Kick_Blocked',
       'shot_statsbomb_xg_Free Kick_Goal', 'shot_statsbomb_xg_Free Kick_Off T',
       'shot_statsbomb_xg_Free Kick_Saved',
       'shot_statsbomb_xg_Open Play_Blocked',
       'shot_statsbomb_xg_Open Play_Goal', 'shot_statsbomb_xg_Open Play_Off T',
       'shot_statsbomb_xg_Open Play_Saved',
       'shot_statsbomb_xg_Penalty_Blocked', 'shot_statsbomb_xg_Penalty_Goal',
       'shot_statsbomb_xg_Penalty_Off T