In [1]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import matplotlib.patches as pch
import random
import time
import matplotlib.cm as cm
import bokeh

In [2]:
# Current user's home directory; the data-folder and output paths used later are built from it.
homedir = os.path.expanduser("~")

In [3]:
# Folders of the local open-data checkout under <home>/Documents/GitHub.
# os.path.join picks the separator of the host OS; the trailing "" keeps a trailing
# separator because other cells append file names to these strings by concatenation.
dataFolder = os.path.join(homedir, "Documents", "GitHub", "open-data", "data", "")
matchesFolder = os.path.join(dataFolder, "matches", "")
lineupsFolder = os.path.join(dataFolder, "lineups", "")
eventsFolder = os.path.join(dataFolder, "events", "")

In [4]:
def json_loads(filepath):
    """Open ``filepath`` as UTF-8 text and return the object decoded from its JSON content."""
    with open(filepath, encoding="utf-8") as handle:
        return json.load(handle)

In [5]:
def coordplot(x, y, horizontal=True, direction="right", half=False, attack = True):
    """Map event coordinates onto plotting coordinates for a given pitch orientation.

    The arithmetic assumes a pitch that is 120 units long (x) and 80 units wide (y).
    ``x`` and ``y`` may be scalars or NumPy arrays; only ``-`` is applied to them.

    Parameters
    ----------
    x, y : number or array
        Coordinates along the length and the width of the pitch.
    horizontal : bool
        True when the pitch is drawn with its length along the plot's x axis.
    direction : str
        "right" or "left" when ``horizontal`` is true, "up" or "down" otherwise.
        Any other value leaves the coordinates unchanged.
    half, attack : bool
        When ``half`` is true and ``attack`` is false the length coordinate is
        mirrored (``120 - value``). This is not applied for ``direction="left"``.

    Returns
    -------
    tuple
        ``(x, y)`` in plotting coordinates.
    """
    mirror_length = half and not attack

    if horizontal:
        if direction == "right":
            y = 80 - y
            if mirror_length:
                x = 120 - x
        elif direction == "left":
            x = 120 - x
    else:
        if direction == "up":
            x, y = y, x
            if mirror_length:
                # After the swap the length coordinate lives in y, so that is the
                # axis to mirror; mirroring x here would push width values (0-80)
                # off the pitch.
                y = 120 - y
        elif direction == "down":
            x, y = 80 - y, 120 - x
            if mirror_length:
                # Same as above: the length axis is y in a vertical layout.
                y = 120 - y
    return x, y

In [6]:
def tts(timestamp):
    """Convert a colon-separated "hours:minutes:seconds" string into seconds.

    Only the first three fields are read; each is parsed with ``float``.
    """
    fields = timestamp.split(":")
    return sum(float(fields[pos]) * (60 ** (2 - pos)) for pos in range(3))

def ttm(timestamp):
    """Convert a timestamp string into minutes: the seconds from ``tts`` divided by 60."""
    return tts(timestamp) / 60

In [7]:
def barxgchart(DataFrame, depth = 3):
    """Draw a stacked horizontal bar chart of per-90 xG metrics for the top 30 rows.

    Rows are ranked by ``"xGp90"``, ``"xG+xAp90"`` or ``"xGchainp90"`` for
    ``depth`` 1, 2 or 3, and the first ``depth`` of ``xGp90``, ``xAp90``,
    ``xGbuilduup90``-style components (``xG``, ``xA``, ``xGbuildup``) are stacked.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Must contain ``player_name``, ``team_name``, ``competition_competition_name``,
        ``season_season_name``, ``minute_played`` and the ``...p90`` columns above.
    depth : int
        Number of stacked components, 1 to 3.

    Returns
    -------
    tuple or None
        ``(fig, ax, plt)`` as returned by ``graph``; ``None`` (after printing a
        message) when ``depth`` is not 1, 2 or 3.
    """
    # Explicit membership test instead of a bare ``except`` around list.index().
    if depth not in (1, 2, 3):
        print("depth must be in [1, 2, 3]")
        return

    fig, ax, plt = graph((10, 10))

    deep_list1 = ["xG", "xG+xA", "xGchain"]
    deep_list2 = ["xG", "xA", "xGbuildup"]

    df = DataFrame.sort_values((deep_list1[depth - 1] + "p90"), ascending = False)[:30]

    # Columns that vary across the input go into each bar's label; columns with a
    # single value go into the chart title instead.
    pieces = ["player_name"]
    rejects = []
    for r in ["team", "competition_competition", "season_season"]:
        r1 = r + "_name"
        if DataFrame[r1].nunique() > 1:
            pieces += [r1]
        else:
            rejects.append(r1)

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer available
    # in recent matplotlib releases.
    cmap = plt.get_cmap("viridis")

    y_pieces = df.loc[:, pieces].values
    y = [", ".join(n) for n in y_pieces]
    y_pos = np.arange(len(y), 0, -1)
    h = 0.5
    z = np.zeros(len(y))  # running left edge of the stacked segments
    for i in range(depth):
        x = df[deep_list2[i] + "p90"].values
        plt.barh(y_pos, x, height = h, left = z, tick_label = y, label = deep_list2[i], color = cmap((i+0)/max(1, depth-1)))
        z += x

    a = deep_list1[depth - 1] + " per 90: "
    s = a + ", ".join([DataFrame[r].values[0] for r in rejects])
    d = "Minimum {} minutes".format(round(df.minute_played.min(), 1))
    sx = sum(plt.xlim())/3
    sy = len(y)+2
    plt.text(x = sx, y = sy+1, s = s, fontsize=12, ha="center")
    plt.text(x = sx, y = sy, s = d, fontsize=10, ha="center")

    ax.legend(loc = "lower right")
    plt.grid(True)
    plt.tight_layout()
    return fig, ax, plt

In [8]:
def playerxgdf(DataFrame, TimesTotalDF):
    """Build a per-player table of xG, xA, xGchain, xGbuildup and their per-90 rates.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Event rows. Reads ``id``, ``pass_shot_assist``, ``pass_assisted_shot_id``,
        ``shot_statsbomb_xg``, ``shot_type_name``, ``shot_outcome_name``,
        ``location``, ``player_name`` and the columns named by the module-level
        lists ``matchteamplayer``, ``playerteam``, ``competition`` and ``season``.
    TimesTotalDF : pandas.DataFrame
        Provides ``minute_played``; it is aligned to the result on its index, so
        it should be indexed by ``playerteam + competition + season``.

    Returns
    -------
    pandas.DataFrame
        One row per group with ``xG``, ``xA``, ``xGchain``, ``xG+xA``,
        ``xGbuildup``, ``minute_played`` and a ``<metric>p90`` column for each
        metric; the grouping keys are regular columns.
    """
    keys = playerteam + competition + season

    # xA: credit each assisting pass with the xG of the shot it points at.
    # The join is made on the shot id rather than on row position, so a shot that
    # is missing from DataFrame cannot shift the pairing of every later assist.
    assist_cols = matchteamplayer + competition + season + ["location", "pass_assisted_shot_id"]
    assistDF = DataFrame.loc[DataFrame.pass_shot_assist == True, assist_cols]
    shot_cols = ["id", "shot_statsbomb_xg", "shot_type_name", "shot_outcome_name", "player_name"]
    shotDF = DataFrame.loc[DataFrame.id.isin(assistDF.pass_assisted_shot_id), shot_cols]
    shotDF = shotDF.rename(columns = {"id": "pass_assisted_shot_id", "player_name": "shot_player_name"})
    xADF = assistDF.merge(shotDF, on = "pass_assisted_shot_id", how = "left")

    #xG chain
    chainDF = xGchainDF(DataFrame)

    xg_total = DataFrame.groupby(keys).shot_statsbomb_xg.sum()
    xa_total = xADF.groupby(keys).shot_statsbomb_xg.sum()
    chain_total = chainDF.groupby(keys).shot_statsbomb_xg.sum()

    f = pd.DataFrame()
    f["xG"] = xg_total
    f["xA"] = xa_total
    f = f.fillna(0)  # players without any assist get xA = 0
    f["xGchain"] = chain_total
    f["xG+xA"] = f["xG"] + f["xA"]
    f["xGbuildup"] = f["xGchain"] - f["xG+xA"]
    f["minute_played"] = TimesTotalDF.minute_played

    cols = list(sorted((set(f.columns) - {"minute_played"})))

    for col in cols:
        f[col + "p90"] = 90 *f[col] / f["minute_played"]

    f = f.reset_index()

    return f

In [9]:
def graph(figdim):
    """Create a 150-dpi figure of size ``figdim`` with a single axes and no visible spines.

    Returns ``(fig, ax, plt)`` where ``plt`` is the pyplot module itself.
    """
    fig, ax = plt.subplots(figsize=figdim, dpi = 150)
    for side in ("top", "bottom", "right", "left"):
        ax.spines[side].set_visible(False)

    return fig, ax, plt

In [10]:
def coordchanger(x, y, horizontal = True):
    """Return ``(x, y)`` when ``horizontal`` is truthy, otherwise the swapped pair ``(y, x)``."""
    return (x, y) if horizontal else (y, x)

In [11]:
def draw_pitch(l = 12, linecolor="white", pitchcolor="seagreen", horizontal = True, half = False, attack = True):
    """Draw pitch markings on a new figure and return ``(fig, ax, plt)``.

    The markings are laid out on a 120 x 80 coordinate grid, with a 5-unit margin
    around it in the axis limits.

    Parameters
    ----------
    l : number
        Long side of the figure size passed to ``graph``; the short side is ``l*2/3``.
        With ``half`` the long side is halved.
    linecolor, pitchcolor : colour
        Colour of the markings and of the figure background.
    horizontal : bool
        When false, every x/y pair is swapped so the pitch is drawn vertically.
    half : bool
        Draw only one half of the pitch.
    attack : bool
        With ``half``: True keeps the half with x >= 60, False the half with x <= 60.
    """
    x, y = l, l*2/3
    
    # NOTE: `direction` is assigned here but not used anywhere else in this function.
    if horizontal:
        direction = "right"
    else:
        direction = "up"
    
    if half:
        x = l/2
    dim = coordchanger(x, y, horizontal=horizontal)
    
    fig, ax, plt = graph((dim))
    fig.patch.set_facecolor(pitchcolor)
    
    # Axis limits: the 120 x 80 grid plus 5 units of margin on every side.
    x1, x2 = -5, 125
    y1, y2 = -5, 85
    
    if half:
        if attack:
            x1 = 60
        else:
            x2 = 60
        
    lim1 = coordchanger(x1, y1, horizontal=horizontal)
    lim2 = coordchanger(x2, y2, horizontal=horizontal)
    xlim = lim1[0], lim2[0]
    ylim = lim1[1], lim2[1]
    plt.xlim(xlim)
    plt.ylim(ylim)
    
    # Rectangles as [[x, y of the anchor corner], [width, height]].
    # The first four belong to the x <= 60 half, the last four to the x >= 60 half.
    rect = [[[0, 0], [60, 80]],
            [[0, 30], [6, 20]],
           [[0, 18], [18, 44]],
           [[-2, 36], [2, 8]],
            [[60, 0], [60, 80]],
           [[114, 30], [6, 20]],
           [[102, 18], [18, 44]],
           [[120, 36], [2, 8]]]
    if half:
        if attack:
            rect = rect[len(rect)//2:]
        else:
            rect = rect[:len(rect)//2]
    for i in range(len(rect)):
        r = rect[i]
        start = list(coordchanger(r[0][0], r[0][1], horizontal=horizontal))
        width, height = coordchanger(r[1][0], r[1][1], horizontal=horizontal)
        patch = pch.Rectangle(start, height = height, width = width, fill = False, edgecolor = linecolor, linewidth = 2)
    
        ax.add_patch(patch)
    
    # Filled dots of radius 0.4 at (12, 40) and (108, 40).
    circ = [(12, 40), (108, 40)]
    if half:
        if attack:
            circ = [circ[1]]
        else:
            circ = [circ[0]]
 
    for i in range(len(circ)):
        c = coordchanger(circ[i][0], circ[i][1], horizontal = horizontal)
        patch = pch.Circle(c, 0.4, fill = True, facecolor = linecolor)
        
        ax.add_patch(patch)
        
    # Wedges as [centre, radius, theta1, theta2] (angles in degrees).
    # The first three are used for the x <= 60 half, the last three for the x >= 60 half.
    wedg = [[(60, 40), 10, 90, 270],
            [(60, 40), 0.4, 90, 270],
            [(12, 40), 10, 310, 50],
           [(60, 40), 10, 270, 90],
           [(60, 40), 0.4, 270, 90],
           [(108, 40), 10, 130, 230]]
    
    if half:
        if attack:
            wedg = wedg[3:]
        else:
            wedg = wedg[:3]
    
    for i in range(len(wedg)):
        w = wedg[i]
        if not horizontal:
            # Vertical layout: shift both angles by 90 degrees (modifies the local list entry in place).
            w[2] += 90
            w[3] += 90
        c = coordchanger(w[0][0], w[0][1], horizontal = horizontal)
        patch = pch.Wedge(center = tuple(c), r = w[1], theta1 = w[2]%360, theta2 = w[3]%360, color = linecolor, width = (0.4/w[1])**0.5)
        ax.add_patch(patch)
    
    plt.axis('off')
    return fig, ax, plt   

In [12]:
def df_json(filepath):
    """Load a JSON list of records and flatten it into a DataFrame, one row per record.

    Nested dictionaries are flattened up to two levels deep, joining the keys with
    ``"_"`` (``record["a"]["b"]["c"]`` becomes column ``"a_b_c"``). Values nested
    deeper than that are stored as they are.

    Records that carry an ``"index"`` key also get a ``"time"`` column: the
    record's timestamp in minutes plus an offset built from the timestamps of the
    "Half End" records seen so far, selected by the record's ``period``.
    """
    records = json_loads(filepath)
    rows = []
    half_end_minutes = [0]

    for record in records:
        flat = {}
        for key, value in record.items():
            if not isinstance(value, dict):
                flat[key] = value
                continue
            for subkey, subvalue in value.items():
                if isinstance(subvalue, dict):
                    for leafkey, leafvalue in subvalue.items():
                        flat["_".join([key, subkey, leafkey])] = leafvalue
                else:
                    flat["_".join([key, subkey])] = subvalue

        if "index" in record:
            if record["type"]["name"] == "Half End":
                half_end_minutes.append(ttm(record["timestamp"]))
            # Every second entry is kept (presumably because each half end is
            # recorded once per team - confirm against the data).
            offsets = half_end_minutes[::2]
            flat["time"] = ttm(record["timestamp"]) + sum(offsets[:(flat["period"])])
        rows.append(flat)

    return pd.DataFrame(rows)

In [13]:
def xGchainDF(DataFrame):
    """Give every player involved in a possession that possession's summed shot xG.

    Events are grouped by ``match_id``, ``team_name`` and ``possession``. For each
    group, one output row is produced per distinct ``player_name`` in it, carrying
    the group's total ``shot_statsbomb_xg`` plus the competition and season names
    (columns named by the module-level lists ``competition`` and ``season``).
    """
    aggFunc = {"shot_statsbomb_xg": "sum", "player_name": "unique", "id": "count",
              "season_season_name": (lambda x: x.values[0]),
              "competition_competition_name": (lambda x: x.values[0])}
    possessions = DataFrame.groupby(["match_id", "team_name", "possession"]).agg(aggFunc)

    carried = competition + season + ["shot_statsbomb_xg"]
    Y = [
        # key is the (match_id, team_name, possession) index tuple
        {"player_name": player, "team_name": key[1], **{col: row[col] for col in carried}}
        for key, row in possessions.iterrows()
        for player in row.player_name
    ]

    return pd.DataFrame(Y)

In [14]:
# Load competitions.json from the data folder and turn its records into a DataFrame;
# its competition_id / season_id columns drive the loading loop in a later cell.
comps = json_loads(dataFolder + "competitions.json")
competitions = pd.DataFrame(comps)

In [19]:
def SmallDF(competition_id, season_id):
    """Load every match of one competition season and return all its events in one frame.

    Reads the matches file for ``competition_id`` / ``season_id`` from
    ``matchesFolder`` and, for each match, its events file from ``eventsFolder``.
    Each event row is extended with all columns of its match row. The per-match
    frames are concatenated without resetting the index.
    """
    matches = df_json(matchesFolder + "{}\\{}.json".format(competition_id, season_id))

    per_match = []
    for _, match in matches.iterrows():
        events = df_json(eventsFolder + "{}.json".format(match["match_id"]))
        n_events = len(events)
        # Repeat the match row once per event so it can be glued on column-wise.
        match_cols = pd.DataFrame(index = range(n_events), data = [match.values.tolist()] * n_events, columns = match.index)
        per_match.append(pd.concat([events, match_cols], axis = 1))

    return pd.concat(per_match, sort=False)

In [20]:
def BigDF(competitions):
    """Load the events of every competition season and add split location columns.

    Parameters
    ----------
    competitions : pandas.DataFrame
        One row per competition season with ``competition_id`` and ``season_id``.

    Returns
    -------
    pandas.DataFrame
        All frames returned by ``SmallDF`` concatenated (index not reset). For each
        of the columns ``location`` and ``end_location`` that is present, two
        columns ``<name>_x`` and ``<name>_y`` hold the first and second element of
        the list stored in that column, and NaN where there is no list.
    """
    dataframe_list = [SmallDF(row.competition_id, row.season_id) for _, row in competitions.iterrows()]
    DF = pd.concat(dataframe_list, sort=False)

    for prefix in ["", "end_"]:
        source = prefix + "location"
        if source not in DF.columns:
            # Nothing to split; skipping avoids a KeyError when the data has no such column.
            continue
        for n, axis in enumerate(["x", "y"]):
            # Built as a plain list in row order: the concatenated index repeats
            # (each per-match frame starts at 0), so assigning a Series with a
            # different, also duplicated, index cannot be aligned.
            DF[source + "_" + axis] = [v[n] if isinstance(v, (list, tuple)) else np.nan for v in DF[source]]

    return DF

In [22]:
# Load the events of every competition season listed in `competitions` and stack them
# into one frame. This repeats the loading loop of the BigDF() function, without the
# location_x / location_y columns that function adds.
dataframe_list = []
for i,row in competitions.iterrows():
    DF = SmallDF(row.competition_id, row.season_id)
    dataframe_list.append(DF)
    
# DF is reused: inside the loop it holds one season, afterwards the full concatenation.
DF = pd.concat(dataframe_list, sort=False) 

In [23]:
# Preview only the first rows instead of displaying the whole events frame.
DF.head(10)

Unnamed: 0,ball_receipt_outcome_id,ball_receipt_outcome_name,ball_recovery_recovery_failure,block_deflection,block_offensive,carry_end_location,clearance_aerial_won,clearance_body_part_id,clearance_body_part_name,clearance_head,...,dribble_no_touch,goalkeeper_lost_in_play,goalkeeper_success_out,shot_follows_dribble,goalkeeper_success_in_play,half_start_late_video_start,half_end_early_video_end,goalkeeper_saved_to_post,shot_kick_off,goalkeeper_penalty_saved_to_post
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,"[47.0, 42.0]",,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,"[51.0, 6.0]",,,,,...,,,,,,,,,,


In [25]:
# Keep events from periods 1-4 only (presumably this leaves out penalty shoot-outs - confirm).
# NOTE: this rebinds the name BigDF, which an earlier cell defined as a function; once this
# cell has run, BigDF refers to the filtered DataFrame and the function can no longer be called.
BigDF = DF.loc[DF.period < 5, :]
# Everything except penalty shots.
NPDF = BigDF.loc[BigDF.shot_type_name != "Penalty", :]
# Non-shot events (missing shot type) plus open-play shots.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
OpenPlayDF = BigDF.loc[BigDF.shot_type_name.isin([np.nan, "Open Play"]), :]

KeyError: MemoryError()

In [None]:
# Column-name lists reused as groupby / merge keys in the cells below; playerxgdf() and
# xGchainDF() also read matchteamplayer, playerteam, competition and season as globals.
matchteam = ["match_id", "team_name"]
matchteamplayer = matchteam + ["player_name"]
playerteam = ["player_name", "team_name"]

competition = ["competition_competition_name"]
season = ["season_season_name"]

In [None]:
# Aggregate events per match, team, player and shot type: total xG, number of events,
# lists of locations / times / related events, and the first value of the match-level
# name columns (home team, away team, season, competition).
aggfunc = {"shot_statsbomb_xg": "sum", "id": "count", "location": (lambda x: list(x)),
          "home_team_home_team_name": (lambda x: x.values[0]), "away_team_away_team_name": (lambda x: x.values[0]),
           "season_season_name": (lambda x: x.values[0]), "time": (lambda x: list(x)),
          "competition_competition_name": (lambda x: x.values[0]), "related_events": (lambda x: list(x))}
groupings = matchteamplayer + ["shot_type_name"]
grouped = BigDF.groupby(groupings).agg(aggfunc)
# Same table with the grouping keys as regular columns.
g = grouped.reset_index()

In [None]:
# Latest event time (the `time` column, in minutes) per match and team, used below as the
# end of the match for players who were not substituted off.
TeamTimesDF = BigDF.groupby(matchteam+competition +season).time.max().reset_index()

In [None]:
# Minutes played per player and match.
# holdingnamedf: one entry per (match, team, player, competition, season); only its index is used.
holdingnamedf = BigDF.groupby(matchteamplayer+competition+season).time.max()
# timesdf: one row per such player-match with a placeholder minute_played of 0.
timesdf = pd.DataFrame(index = holdingnamedf.index, data = 0, columns = ["minute_played"]).reset_index()
# subdf: earliest time at which each player is named as a substitution replacement
# (presumably the minute they came on - confirm).
subdf = BigDF.groupby(matchteam + competition + ["substitution_replacement_name"] + season).time.min().reset_index().rename(columns = {"substitution_replacement_name":"player_name"})
# subbedf: latest "Substitution" event time recorded against each player
# (presumably the minute they went off - confirm).
subbedf = BigDF.loc[BigDF.type_name == "Substitution", :].groupby(matchteamplayer+competition+season).time.max().reset_index()

# After the three merges: time_x comes from TeamTimesDF, time_y from subdf, time from subbedf.
df = timesdf.merge(TeamTimesDF, on = matchteam + competition + season)
df = df.merge(subdf, on = matchteamplayer+ competition + season, how = "outer")
df = df.merge(subbedf, on = matchteamplayer+ competition + season, how = "outer")
# No replacement time -> start at minute 0; no substitution time -> end at the team's last event.
df = df.fillna(value = {"time_y": 0, "time": df.time_x})
df = df.rename(columns = {"time_x": "minute_max", "time_y": "minute_start", "time": "minute_end"})

df.loc[:, "minute_played"] = df.loc[:, "minute_end"] - df.loc[:, "minute_start"]

PlayerTimesDF = df

In [None]:
# Total minutes per player, team, competition and season, plus the number of match rows.
# The string "sum" is used instead of np.sum: passing NumPy reducers to agg() is
# deprecated in recent pandas, and the string selects the same groupby sum.
aggfunc = {"minute_played": "sum", "minute_max": "sum", "match_id": "count"}
PlayerTimesTotalDF = PlayerTimesDF.groupby(playerteam+competition + season).agg(aggfunc)

In [None]:
# Per-player xG / xA / xGchain table built from the open-play events, with per-90 rates
# based on the minutes in PlayerTimesTotalDF.
f = playerxgdf(OpenPlayDF, TimesTotalDF = PlayerTimesTotalDF)

In [None]:
# La Liga players only, keeping those above the larger of 270 minutes and the median minutes.
bc = f.loc[f[competition[0]] == "La Liga"]
bct = max(270, bc.minute_played.median())
bcm = bc.loc[bc.minute_played > bct, :]

fig, ax, plt = barxgchart(bcm, depth = 3)
# os.path.join builds the output path with the separator of the host OS.
plt.savefig(os.path.join(homedir, "Documents", "visualisations", "xgchain.png"))

In [None]:
def matchloc(matchDataFrame):
    """Plot the shots of one match on a pitch, goals as squares and other shots as circles.

    Markers are coloured by ``shot_statsbomb_xg`` and a title block per team (name,
    goals including own goals against the opponent, total xG) is drawn via ``titling``.

    NOTE: a later cell defines ``matchloc(DataFrame, match_id)``, which replaces this
    function once that cell has run. ``titling`` is not defined in the visible part
    of the notebook and must exist before this function is called.

    Parameters
    ----------
    matchDataFrame : pandas.DataFrame
        Events of a single match. Reads ``possession_team_name``, ``team_name``,
        ``type_name``, ``shot_outcome_name``, ``period``, ``location``,
        ``shot_statsbomb_xg`` and ``home_team``.

    Returns
    -------
    tuple
        ``(fig, ax, plt)`` from ``draw_pitch``.
    """
    DataFrame = matchDataFrame
    fig, ax, plt = draw_pitch(linecolor="grey", pitchcolor="white")

    textx = sum(plt.xlim())/2
    deltax = textx/10
    ylim = plt.ylim()[1]

    texty = 135 if ylim > 100 else 87
    deltay = texty/30
    text = "k"
    xg = "shot_statsbomb_xg"
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer available
    # in recent matplotlib releases.
    cmap = plt.get_cmap("Spectral")
    size = 100

    for team in DataFrame.possession_team_name.unique():
        shotdf = DataFrame.loc[(DataFrame.team_name == team) & (DataFrame.type_name == "Shot") & (DataFrame.shot_outcome_name != "Goal") & (DataFrame.period < 5)]
        goaldf = DataFrame.loc[(DataFrame.team_name == team) & (DataFrame.type_name == "Shot") & (DataFrame.shot_outcome_name == "Goal") & (DataFrame.period < 5)]
        ogdf = DataFrame.loc[(DataFrame.possession_team_name == team) & (DataFrame.type_name == "Own Goal Against")]

        # NOTE(review): relies on a `home_team` column holding a single team name for
        # the non-goal shots; other cells use `home_team_home_team_name` - confirm.
        if team == shotdf.home_team.unique():
            direction, col, piece = "right", "C1", -1
        else:
            direction, col, piece = "left", "C0", 1

        # The marker is tied to the frame being drawn. Testing
        # `Series.any() == "Goal"` is unreliable because any() may return a bool.
        for df, marker in [(shotdf, "o"), (goaldf, "s")]:
            x1 = np.array([n[0] for n in df.location.values])
            y1 = np.array([n[1] for n in df.location.values])

            x, y = coordplot(x1, y1, direction = direction)

            plt.scatter(x, y, s = size, edgecolors=text, linewidths=0.9, marker = marker, c = cmap(df[xg]/0.5), alpha= 0.5)

        titling(fig, ax, plt, col, textx + piece*deltax, texty, deltay, [team, len(goaldf)+len(ogdf), str(shotdf[xg].sum()+goaldf[xg].sum())[:4]], direction)

    central = [" v ", " Score ", " xG Total "]
    titling(fig, ax, plt, "black", textx, texty, deltay, central, "center")

    plt.tight_layout()
    return fig, ax, plt

In [None]:
def matchloc(DataFrame, match_id):
    """Draw an empty pitch and return one match's events grouped by team and shot outcome.

    The rows of ``DataFrame`` whose ``match_id`` equals ``match_id`` are grouped by
    ``team_name`` and ``shot_outcome_name``; the groupby object is returned. The
    figure created by ``draw_pitch`` is not returned.
    """
    match_events = DataFrame.loc[DataFrame.match_id == match_id, :]
    fig, ax, plt = draw_pitch(linecolor="grey", pitchcolor="white")

    return match_events.groupby(["team_name", "shot_outcome_name"])

In [None]:
# Group the shots of match 22014; without the function name this line only built a tuple.
shots = matchloc(BigDF, 22014)

In [None]:
# One shot map per match: goals are drawn as squares, other shots as circles, coloured by
# xG, with team names, score, total xG and shot counts written above the pitch.
for match in BigDF.match_id.unique():
    df = BigDF.loc[BigDF.match_id == match, :]
    fig, ax, plt = draw_pitch(linecolor="grey", pitchcolor="white")

    aggfunc = {"shot_statsbomb_xg": (lambda x: list(x)), "location": (lambda x: list(x))}
    shots = df.groupby(["team_name", "shot_outcome_name"]).agg(aggfunc).reset_index()

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer available
    # in recent matplotlib releases.
    cmap = plt.get_cmap("Spectral")

    teams = [df[(n + "_team_" + n + "_team_name")].values[0] for n in ["home", "away"]]
    team_xg = {}
    for team in teams:
        team_rows = shots.team_name == team
        # Flat list of the team's shot xG values; stays empty for a team without
        # shots, so the totals below do not fail on it.
        team_xg[team] = [v for values in shots.loc[team_rows, "shot_statsbomb_xg"] for v in values]

        if team == teams[0]:
            direction = "left"
        else:
            direction = "right"

        # The marker is paired with its filter. Testing `Series.any() == "Goal"` is
        # unreliable because any() may return a bool.
        things = [((shots.shot_outcome_name == "Goal"), "s"), ((shots.shot_outcome_name != "Goal"), "o")]
        for thing, marker in things:
            d = shots.loc[team_rows & thing, :]
            if len(d) > 0:
                locations = [loc for locs in d.location for loc in locs]
                x = np.array([n[0] for n in locations])
                y = np.array([n[1] for n in locations])

                x, y = coordplot(x, y, direction = direction)
                s = np.array([v for values in d.shot_statsbomb_xg for v in values])
                color = cmap(s/0.8)

                plt.scatter(x, y, s = 100, marker = marker, color = color, alpha = 0.5, edgecolors = "black")

    plt.text(x = 60, y = 90, s = "v", ha = "center")
    plt.text(x = 55, y = 90, s = teams[0], ha = "right")
    plt.text(x = 65, y = 90, s = teams[1], ha = "left")

    plt.text(x = 60, y = 87, s = "Score", ha = "center")
    plt.text(x = 55, y = 87, s = df["home_score"].values[0], ha = "right")
    plt.text(x = 65, y = 87, s = df["away_score"].values[0], ha = "left")

    plt.text(x = 60, y = 84, s = "xG", ha = "center")
    plt.text(x = 55, y = 84, s = round(sum(team_xg[teams[0]]), 2), ha = "right")
    plt.text(x = 65, y = 84, s = round(sum(team_xg[teams[1]]), 2), ha = "left")

    plt.text(x = 60, y = 81, s = "Shots", ha = "center")
    plt.text(x = 55, y = 81, s = len(team_xg[teams[0]]), ha = "right")
    plt.text(x = 65, y = 81, s = len(team_xg[teams[1]]), ha = "left")
    plt.show()
    # Release the figure so that looping over many matches does not accumulate open figures.
    plt.close(fig)

In [None]:
# Per match, team and shot outcome: the list of shot xG values and the list of locations.
aggfunc = {"shot_statsbomb_xg": (lambda x: list(x)), "location": (lambda x: list(x))}
x = BigDF.groupby(["match_id", "team_name", "shot_outcome_name"]).agg(aggfunc)

In [None]:
# Preview the first rows of the grouped shot table.
x.head(10)

In [None]:
# List every column name of the filtered events frame, one per line.
for col in BigDF.columns:
    print(col)