In [1]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import matplotlib.patches as pch
import random
import time
import matplotlib.cm as cm
import bokeh

In [2]:
# Current user's home directory ("~" expanded); the data folder paths are built from it.
homedir = os.path.expanduser("~")

In [3]:
# Locations of the data files, relative to the user's home directory
# (presumably a local checkout of the StatsBomb "open-data" repository - confirm).
#
# The paths are assembled with os.path.join instead of hard-coded "\\" so they
# are valid on every operating system. The trailing "" component makes each
# path end with a separator, because other cells build file names by plain
# string concatenation (e.g. dataFolder + "competitions.json").
dataFolder = os.path.join(homedir, "Documents", "GitHub", "open-data", "data", "")
matchesFolder = os.path.join(dataFolder, "matches", "")
lineupsFolder = os.path.join(dataFolder, "lineups", "")
eventsFolder = os.path.join(dataFolder, "events", "")

In [4]:
def json_loads(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed object."""
    with open(filepath, encoding="utf-8") as handle:
        return json.loads(handle.read())

In [5]:
def tts(timestamp):
    """Convert a colon-separated "hours:minutes:seconds" string to seconds.

    Only the first three colon-separated fields are read; each is parsed with
    ``float`` so fractional seconds are accepted.
    """
    fields = timestamp.split(":")
    hours, minutes, seconds = (float(fields[k]) for k in range(3))
    return 0 + hours * 3600 + minutes * 60 + seconds * 1

def ttm(timestamp):
    """Convert an "hours:minutes:seconds" string to minutes (``tts`` result divided by 60)."""
    return tts(timestamp) / 60

In [6]:
def graph(figdim):
    """Create a 150-dpi figure of size ``figdim`` whose axes have no visible spines.

    Returns the tuple ``(fig, ax, plt)``: the new figure, its single axes and
    the pyplot module itself.
    """
    fig, ax = plt.subplots(figsize=figdim, dpi=150)
    # Hide the frame on all four sides of the axes.
    for side in ("top", "bottom", "right", "left"):
        ax.spines[side].set_visible(False)

    return fig, ax, plt

In [7]:
# Parse competitions.json from the data folder; keep both the raw parsed
# object (comps) and a DataFrame built from it (competitions).
comps = json_loads(dataFolder + "competitions.json")
competitions = pd.DataFrame(comps)

In [8]:
# Reusable groups of column names used as groupby / merge keys.
matchteam = ["match_id", "team_name"]
matchteamplayer = ["match_id", "team_name", "player_name"]
playerteam = ["player_name", "team_name"]

# Flattened column names holding the competition and season labels.
competition = ["competition_competition_name"]
season = ["season_season_name"]

In [9]:
def df_json(filepath):
    """Load a JSON file holding a list of records and flatten it into a DataFrame.

    Nested dictionaries are flattened up to three levels deep; the column name
    is the chain of keys joined with "_" (e.g. ``{"a": {"b": 1}}`` becomes
    column ``a_b``). Dictionaries nested deeper than that are stored as-is.

    Records that contain an "index" key additionally get a "time" column: the
    record's timestamp in minutes plus an offset built from the timestamps of
    the "Half End" records seen so far.
    """
    records = json_loads(filepath)
    rows = []
    # Minutes at which "Half End" records occurred, seeded with 0 so that the
    # first period gets no offset.
    half_end_minutes = [0]

    for record in records:
        flat = {}
        for key, value in record.items():
            if not isinstance(value, dict):
                flat[key] = value
                continue
            for sub_key, sub_value in value.items():
                if not isinstance(sub_value, dict):
                    flat["_".join([key, sub_key])] = sub_value
                    continue
                for leaf_key, leaf_value in sub_value.items():
                    flat["_".join([key, sub_key, leaf_key])] = leaf_value

        if "index" in record:
            if record["type"]["name"] == "Half End":
                half_end_minutes.append(ttm(record["timestamp"]))
            # Only every second entry is used (presumably because each period
            # end is recorded twice, once per team - confirm against the data).
            offsets = half_end_minutes[::2]
            flat["time"] = ttm(record["timestamp"]) + sum(offsets[:(flat["period"])])
        rows.append(flat)

    return pd.DataFrame(rows)

In [10]:
# Load the match list of every competition/season entry in comps and stack
# them into a single DataFrame (matches).
match_list = []
for comp in comps:
    if comp["competition_name"] != "" and comp["season_name"] != "":
        # Files are laid out as <matchesFolder>/<competition_id>/<season_id>.json.
        # os.path.join picks the right separator for the running OS instead of
        # a hard-coded "\\", and copes with matchesFolder ending in a separator.
        fp = os.path.join(matchesFolder, str(comp["competition_id"]), "{}.json".format(comp["season_id"]))
        matches = df_json(fp)
        match_list.append(matches)

matches = pd.concat(match_list, sort = False)

In [11]:
# For every match, load its events file and attach the match-level columns to
# each event row. The per-match frames are collected in events_list.
events_list = []
for i, row in matches.iterrows():
    # Event files are named <match_id>.json inside eventsFolder.
    fp = eventsFolder + "{}.json".format(row.match_id)
    events = df_json(fp)
    # df2 repeats this match's row once per event so that it can be glued onto
    # the events frame column-wise (both frames share a 0..n-1 index).
    df2 = pd.DataFrame(index = range(len(events)), data = [row.values.tolist()] * len(events), columns = row.index)
    events = pd.concat([events, df2], axis = 1)
    events_list.append(events)

In [12]:
# Stack the per-match event frames into one frame. ignore_index is not used,
# so each match keeps its own 0..n-1 labels and index labels repeat across matches.
DF = pd.concat(events_list, sort = False)

In [16]:
def TeamTimesDF(DataFrame):
    """Return the largest ``time`` value per match and team.

    The result has one row per (match_id, team_name, competition, season)
    combination, with those keys as ordinary columns followed by ``time``.
    """
    group_keys = [
        "match_id",
        "team_name",
        "competition_competition_name",
        "season_season_name",
    ]
    latest = DataFrame.groupby(group_keys)["time"].max()
    return latest.reset_index()

In [17]:
def PlayerTimesDF(DataFrame):
    """Compute, per match, when each player entered and left and how long they played.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Event rows with the columns match_id, team_name, player_name,
        competition_competition_name, season_season_name, type_name,
        substitution_replacement_name and time.

    Returns
    -------
    pandas.DataFrame
        One row per (match, team, player, competition, season) with
        ``minute_max`` (largest event time of the team in that match),
        ``minute_start`` (earliest time the player is named as a substitution
        replacement, 0 otherwise), ``minute_end`` (latest time of a
        "Substitution" event recorded for the player, ``minute_max``
        otherwise) and ``minute_played`` (end minus start).

    Notes
    -----
    The per-team match length is computed here from ``DataFrame`` itself
    rather than read from a module-level name, so the function works on a
    fresh kernel and always describes the frame it was given.
    """
    matchteam = ["match_id", "team_name"]
    matchteamplayer = matchteam + ["player_name"]
    competition = ["competition_competition_name"]
    season = ["season_season_name"]

    team_keys = matchteam + competition + season
    player_keys = matchteamplayer + competition + season
    replacement_keys = matchteam + competition + ["substitution_replacement_name"] + season

    # One row for every player that has at least one event in a match.
    appearances = DataFrame.groupby(player_keys).time.max()
    timesdf = pd.DataFrame(index = appearances.index, data = 0, columns = ["minute_played"]).reset_index()

    # Largest event time per team and match.
    team_times = DataFrame.groupby(team_keys).time.max().reset_index()

    # Earliest time at which a player is named as the incoming replacement.
    subdf = (DataFrame.groupby(replacement_keys).time.min().reset_index()
             .rename(columns = {"substitution_replacement_name": "player_name"}))
    # Latest "Substitution" event recorded against the player.
    subbedf = (DataFrame.loc[DataFrame.type_name == "Substitution", :]
               .groupby(player_keys).time.max().reset_index())

    # After these merges: time_x = team maximum, time_y = entry time, time = exit time.
    df = timesdf.merge(team_times, on = team_keys)
    df = df.merge(subdf, on = player_keys, how = "outer")
    df = df.merge(subbedf, on = player_keys, how = "outer")

    # No entry time means the player started; no exit time means the player
    # stayed on until the team's last event.
    df["time_y"] = df["time_y"].fillna(0)
    df["time"] = df["time"].fillna(df["time_x"])
    df = df.rename(columns = {"time_x": "minute_max", "time_y": "minute_start", "time": "minute_end"})

    # Replace the placeholder column outright (it was created with integer zeros).
    df["minute_played"] = df["minute_end"] - df["minute_start"]

    return df

In [13]:
def DFtype(DataFrame, style = "all"):
    """Filter event rows by period and, optionally, by shot type.

    Rows with ``period`` of 5 or more are always dropped.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Event rows with a ``period`` column, plus ``shot_type_name`` for the
        "NP" and "OpenPlay" styles.
    style : {"all", "NP", "OpenPlay"}
        "all" applies the period filter only, "NP" also drops rows whose
        shot_type_name is "Penalty", and "OpenPlay" keeps only rows whose
        shot_type_name is missing or "Open Play".

    Raises
    ------
    ValueError
        If ``style`` is not one of the supported values.
    """
    in_period = (DataFrame.period < 5)
    if style == "all":
        # No additional condition: an empty tuple cannot be combined with a
        # boolean Series, so the period mask is applied on its own.
        return DataFrame.loc[in_period, :]
    if style == "NP":
        keep = (DataFrame.shot_type_name != "Penalty")
    elif style == "OpenPlay":
        # np.nan rather than np.NaN: the capitalised alias no longer exists in NumPy 2.
        keep = (DataFrame.shot_type_name.isin([np.nan, "Open Play"]))
    else:
        raise ValueError(
            "style must be 'all', 'NP' or 'OpenPlay', got {!r}".format(style)
        )
    return DataFrame.loc[in_period & keep, :]

In [23]:
def TimeTotaliser(DataFrame):
    """Total the playing time in ``DataFrame`` per competition and season.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Either a player-level frame (has a ``player_name`` column together
        with ``minute_played`` and ``minute_max``) or a team-level frame (has
        a ``time`` column). Both need match_id, team_name,
        competition_competition_name and season_season_name.

    Returns
    -------
    pandas.DataFrame
        Indexed by (player_name, team_name, competition, season) for a
        player-level input, or (team_name, competition, season) otherwise.
        The time columns are summed and ``match_id`` holds the row count.
    """
    competition = ["competition_competition_name"]
    season = ["season_season_name"]

    if "player_name" in DataFrame.columns:
        aggfunc = {"minute_played": "sum", "minute_max": "sum", "match_id": "count"}
        keys = ["player_name", "team_name"]
    else:
        aggfunc = {"time": "sum", "match_id": "count"}
        keys = ["team_name"]

    # Aggregate the frame that was passed in, not a module-level variable, so
    # the result always matches the branch chosen above.
    return DataFrame.groupby(keys + competition + season).agg(aggfunc)

In [None]:
def xGchainDF(DataFrame):
    """Credit every player involved in a possession with that possession's total xG.

    Events are grouped by (match_id, team_name, possession). For each group
    the ``shot_statsbomb_xg`` values are summed, and one output row is
    produced per distinct ``player_name`` value in the group (a missing
    player name counts as a value too).

    Returns
    -------
    pandas.DataFrame
        Columns player_name, team_name, competition_competition_name,
        season_season_name and shot_statsbomb_xg. The columns are present
        even when there are no rows.
    """
    # Defined locally so the function does not depend on notebook-level state.
    competition = ["competition_competition_name"]
    season = ["season_season_name"]
    carried = competition + season + ["shot_statsbomb_xg"]

    aggFunc = {"shot_statsbomb_xg": "sum", "player_name": "unique",
               # Competition and season are taken from the first row of each group.
               "season_season_name": (lambda x: x.values[0]),
               "competition_competition_name": (lambda x: x.values[0])}
    per_possession = DataFrame.groupby(["match_id", "team_name", "possession"]).agg(aggFunc)

    rows = []
    for key, possession in per_possession.iterrows():
        # key is the (match_id, team_name, possession) group label.
        for player in possession["player_name"]:
            row = {"player_name": player, "team_name": key[1]}
            for col in carried:
                row[col] = possession[col]
            rows.append(row)

    return pd.DataFrame(rows, columns = ["player_name", "team_name"] + carried)

In [None]:
def playerxgdf(DataFrame, TimesTotalDF):
    """Build per-player xG, xA, xG-chain and xG-buildup totals with per-90 rates.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Event rows with the columns player_name, team_name,
        competition_competition_name, season_season_name, id,
        shot_statsbomb_xg, pass_shot_assist and pass_assisted_shot_id, plus
        whatever ``xGchainDF`` needs.
    TimesTotalDF : pandas.DataFrame
        Must expose ``minute_played``; it is aligned on the
        (player_name, team_name, competition, season) index of the result.

    Returns
    -------
    pandas.DataFrame
        One row per (player, team, competition, season) with xG, xA, xGchain,
        xG+xA, xGbuildup, minute_played and a "<metric>p90" column for each
        metric. The grouping keys are ordinary columns.
    """
    playerteam = ["player_name", "team_name"]
    competition = ["competition_competition_name"]
    season = ["season_season_name"]
    group_keys = playerteam + competition + season

    #xA
    # NOTE(review): only passes flagged pass_shot_assist are counted. If the
    # data flags assists to goals in a separate column, those passes are not
    # included here - confirm that this is intended.
    assistDF = DataFrame.loc[DataFrame.pass_shot_assist == True,
                             group_keys + ["pass_assisted_shot_id"]]
    assisted_shots = DataFrame.loc[DataFrame.id.isin(assistDF.pass_assisted_shot_id),
                                   ["id", "shot_statsbomb_xg"]]
    # Look each assisted shot up by its id. Pairing passes and shots by row
    # position silently misaligns as soon as one assisted shot is missing
    # from DataFrame or the two selections are ordered differently.
    xg_by_shot_id = dict(zip(assisted_shots["id"], assisted_shots["shot_statsbomb_xg"]))
    xADF = assistDF.assign(
        shot_statsbomb_xg = assistDF["pass_assisted_shot_id"].map(xg_by_shot_id)
    )

    #xG chain
    chainDF = xGchainDF(DataFrame)

    xa_total = xADF.groupby(group_keys).shot_statsbomb_xg.sum()
    xg_total = DataFrame.groupby(group_keys).shot_statsbomb_xg.sum()
    chain_total = chainDF.groupby(group_keys).shot_statsbomb_xg.sum()

    # The first assigned column fixes the index; later columns are aligned to it.
    f = pd.DataFrame()
    f["xG"] = xg_total
    f["xA"] = xa_total
    f = f.fillna(0)
    f["xGchain"] = chain_total
    f["xG+xA"] = f["xG"] + f["xA"]
    f["xGbuildup"] = f["xGchain"] - f["xG+xA"]
    f["minute_played"] = TimesTotalDF.minute_played

    # Per-90 version of every metric column (everything except the minutes).
    for col in sorted(set(f.columns) - {"minute_played"}):
        f[col + "p90"] = 90 * f[col] / f["minute_played"]

    return f.reset_index()

In [None]:
def coordchanger(x, y, horizontal = True):
    """Return ``(x, y)`` for a horizontal layout, or the swapped pair ``(y, x)`` otherwise."""
    return (x, y) if horizontal else (y, x)

In [None]:
def draw_pitch(l = 12, linecolor="white", pitchcolor="seagreen", horizontal = True, half = False, attack = True):
    """Draw a football pitch on a new matplotlib figure.

    The markings are laid out in a 120 x 80 coordinate system and added to the
    axes as rectangle, circle and wedge patches.

    Parameters
    ----------
    l : number
        Size of the figure's long side; the short side is ``l * 2/3``. With
        ``half`` the long side is halved.
    linecolor : color
        Colour of the pitch markings.
    pitchcolor : color
        Background colour of the figure.
    horizontal : bool
        If False, every x/y pair is swapped (via ``coordchanger``) so the
        pitch is drawn upright.
    half : bool
        Draw only one half of the pitch.
    attack : bool
        With ``half``: True shows the half from x = 60 upwards, False the half
        up to x = 60.

    Returns
    -------
    tuple
        ``(fig, ax, plt)`` as returned by ``graph``, with the axis turned off.
    """
    # Figure size: long side l, short side two thirds of it.
    x, y = l, l*2/3
    
    # NOTE: `direction` is assigned here but never read again in this function.
    if horizontal:
        direction = "right"
    else:
        direction = "up"
    
    if half:
        x = l/2
    dim = coordchanger(x, y, horizontal=horizontal)
    
    # `plt` is rebound locally to the module object returned by graph().
    fig, ax, plt = graph((dim))
    fig.patch.set_facecolor(pitchcolor)
    
    # Visible data range: the 120 x 80 pitch plus a margin of 5 on every side.
    x1, x2 = -5, 125
    y1, y2 = -5, 85
    
    if half:
        if attack:
            x1 = 60
        else:
            x2 = 60
        
    lim1 = coordchanger(x1, y1, horizontal=horizontal)
    lim2 = coordchanger(x2, y2, horizontal=horizontal)
    xlim = lim1[0], lim2[0]
    ylim = lim1[1], lim2[1]
    plt.xlim(xlim)
    plt.ylim(ylim)
    
    # Rectangles as [[x, y of the anchor corner], [width, height]].
    # The first four belong to the x <= 60 half, the last four to the other half
    # (presumably: half outline, six-yard box, penalty area, goal - confirm).
    rect = [[[0, 0], [60, 80]],
            [[0, 30], [6, 20]],
           [[0, 18], [18, 44]],
           [[-2, 36], [2, 8]],
            [[60, 0], [60, 80]],
           [[114, 30], [6, 20]],
           [[102, 18], [18, 44]],
           [[120, 36], [2, 8]]]
    if half:
        if attack:
            rect = rect[len(rect)//2:]
        else:
            rect = rect[:len(rect)//2]
    for i in range(len(rect)):
        r = rect[i]
        start = list(coordchanger(r[0][0], r[0][1], horizontal=horizontal))
        width, height = coordchanger(r[1][0], r[1][1], horizontal=horizontal)
        patch = pch.Rectangle(start, height = height, width = width, fill = False, edgecolor = linecolor, linewidth = 2)
    
        ax.add_patch(patch)
    
    # Filled dots of radius 0.4 (presumably the penalty spots - confirm).
    circ = [(12, 40), (108, 40)]
    if half:
        if attack:
            circ = [circ[1]]
        else:
            circ = [circ[0]]
 
    for i in range(len(circ)):
        c = coordchanger(circ[i][0], circ[i][1], horizontal = horizontal)
        patch = pch.Circle(c, 0.4, fill = True, facecolor = linecolor)
        
        ax.add_patch(patch)
        
    # Wedges as [centre, radius, theta1, theta2] (angles in degrees).
    # The first three are used for the x <= 60 half, the last three for the other half.
    wedg = [[(60, 40), 10, 90, 270],
            [(60, 40), 0.4, 90, 270],
            [(12, 40), 10, 310, 50],
           [(60, 40), 10, 270, 90],
           [(60, 40), 0.4, 270, 90],
           [(108, 40), 10, 130, 230]]
    
    if half:
        if attack:
            wedg = wedg[3:]
        else:
            wedg = wedg[:3]
    
    for i in range(len(wedg)):
        w = wedg[i]
        if not horizontal:
            # Upright layout: shift both angles by 90 degrees to match the swapped axes.
            w[2] += 90
            w[3] += 90
        c = coordchanger(w[0][0], w[0][1], horizontal = horizontal)
        # The ring thickness (width) is derived from the wedge radius.
        patch = pch.Wedge(center = tuple(c), r = w[1], theta1 = w[2]%360, theta2 = w[3]%360, color = linecolor, width = (0.4/w[1])**0.5)
        ax.add_patch(patch)
    
    plt.axis('off')
    return fig, ax, plt   

In [None]:
def coordplot(x, y, horizontal=True, direction="right", half=False, attack = True):
    """Convert data coordinates (120 long, 80 wide) to plotting coordinates.

    Parameters
    ----------
    x, y : number or array-like
        Position along the pitch length (0-120) and width (0-80).
    horizontal : bool
        True when the pitch is drawn lengthwise along the plot's x axis.
    direction : {"right", "left", "up", "down"}
        "right"/"left" are handled for a horizontal pitch, "up"/"down" for an
        upright one. Any other combination returns the input unchanged.
    half, attack : bool
        When ``half`` is true and ``attack`` is false, the point is mirrored
        along the pitch length (``120 - value``). This is applied for
        "right", "up" and "down".

    Returns
    -------
    tuple
        ``(x, y)`` in plotting coordinates.
    """
    mirror = half and not attack

    if horizontal:
        if direction == "right":
            y = 80 - y
            if mirror:
                x = 120 - x
        elif direction == "left":
            x = 120 - x
    else:
        if direction == "up":
            x, y = y, x
            if mirror:
                # After the swap the pitch length runs along y, so that is the
                # axis to mirror. Mirroring x (range 0-80) around 120 would
                # move the point off the pitch.
                y = 120 - y
        elif direction == "down":
            x, y = 80 - y,  120 - x
            if mirror:
                y = 120 - y
    return x, y

In [15]:
# Events filtered with DFtype's "NP" style: period < 5 and shot_type_name != "Penalty".
NPDF = DFtype(DF, style = "NP")

In [20]:
# NOTE: this rebinds the name PlayerTimesDF from the function to its result.
# After this cell runs, PlayerTimesDF is no longer callable, so re-running the
# cell requires re-running the cell that defines the function first.
PlayerTimesDF = PlayerTimesDF(DF)