In [None]:
import os
import json

import math
import numpy as np
import pandas as pd
import numba
import dask

import sklearn

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as pch
import matplotlib.cm as cm
import matplotlib.image as mpimg

import time
import random
from PIL import Image

In [None]:
# Base folders: the user's home directory and the "data" folder of the
# open-data checkout expected under ~/Documents/Github (machine-specific layout).
homedir = os.path.expanduser("~")
dataFolder = os.path.join(homedir, "Documents",  "Github", "open-data", "data")

In [None]:
# Locations inside dataFolder: one competitions file plus the events, lineups
# and matches folders (later cells join "<id>.json" file names onto these).
competitions = os.path.join(dataFolder, "competitions.json")
events = os.path.join(dataFolder, "events")
lineups = os.path.join(dataFolder, "lineups")
matches = os.path.join(dataFolder, "matches")
# Logo image read into an array from the sibling "img" folder of the checkout.
logo = mpimg.imread(os.path.join(homedir, "Documents", "Github", "open-data", "img", "statsbomb-logo.jpg"))

In [None]:
def tts(timestamp):
    """Convert a colon-separated timestamp string into seconds.

    The first field is weighted by 60**2, the second by 60**1 and the third
    by 60**0 (an "HH:MM:SS[.fff]" layout); every field is parsed with float().
    """
    fields = timestamp.split(":")
    weighted = (float(field) * (60 ** (2 - pos)) for pos, field in enumerate(fields))
    return sum(weighted)

def ttm(timestamp):
    """Convert a colon-separated timestamp string into minutes (tts seconds / 60)."""
    return tts(timestamp) / 60

In [None]:
def positivenegative(a):
    """Return the sign of ``a``: 1 if positive, -1 if negative, otherwise 0."""
    if a > 0:
        return 1
    if a < 0:
        return -1
    # Reached for zero and for values that compare False both ways (e.g. NaN).
    return 0

In [None]:
def json_loads(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed object."""
    with open(filepath, encoding="utf-8") as handle:
        return json.load(handle)

In [None]:
def json_dataFrame(filepath):
    """Load a JSON file into a flattened DataFrame and enrich event-style frames.

    The file is read with json_loads and flattened with pd.json_normalize.
    When the result has "location", "period" and "timestamp" columns, the
    following derived columns are added:

    * "time": minutes since the start of the match (ttm of the row timestamp
      plus the summed last timestamps of the earlier periods).
    * "end_location": non-null values of every column whose name contains
      "end_location", merged into one column.
    * "location_x"/"location_y" and "end_location_x"/"_y"/"_z": list-valued
      location columns split into scalars.
    * "pressure_regain", "inthebox", "Pass_*"/"Carry_*" progress flags and
      "touch": set to 1 on qualifying rows, NaN elsewhere.
    * "match_id": the file name without ".json" (a string).

    Frames without those three columns are returned as normalised.

    Parameters
    ----------
    filepath : str
        Path of the JSON file to load.

    Returns
    -------
    pandas.DataFrame
    """
    json_file = json_loads(filepath)
    df = pd.json_normalize(json_file)
    
    check_columns = {"location", "period", "timestamp"}
    if check_columns.issubset(df.columns): 
        # Last timestamp seen in each period; label 0 is added with a zero
        # timestamp so the cumulative sum below starts at 0 for period 1.
        x = df.groupby(["period"])["timestamp"].last()
        x[0] = "00:00:00"
        vfunc = np.vectorize(ttm)
        # y[k] = minutes elapsed before period k+1 starts
        y = vfunc(x.sort_index().values).cumsum()
        # Skips the last label of x (presumably the appended 0 — confirm index order)
        for i in x.index[:(-1)]:
            df.loc[df["period"] == i, "time"] = [ttm(n) + y[i-1] for n in df.loc[df["period"] == i, "timestamp"].tolist()]
        
        # Merge all "*end_location*" columns into a single "end_location" column
        elCols = [n for n in df.columns if "end_location" in n]
        els = [df[col].dropna() for col in elCols]
        df["end_location"] = pd.concat(els)
        
        # Split the list-valued location columns into one scalar column per coordinate
        for pre in ["end_", ""]:
            if pre == "end_":
                suffs = ["x", "y", "z"]
            else:
                suffs = ["x", "y"]
            cols = ["{}location_{}".format(pre, suff) for suff in suffs]
            lox = df["{}location".format(pre)].dropna()
            
            df[cols] = pd.DataFrame(lox.tolist(), columns = cols, index = lox.index)
            
        # On type.id 30 rows (labelled "Pass" further down) fill missing pass fields with defaults
        fna = {"pass.outcome.name": "Complete", "pass.cross": False, "pass.type.name": "Regular"}
        df.loc[df["type.id"] == 30, :] = df.loc[df["type.id"] == 30, :].fillna(fna)
        
        # Pressures: flag type.id 17 rows whose time lies within 1/12 minute
        # (5 seconds) before, or exactly at, the first event time of any possession.
        pressures = df["type.id"] == 17
        poss2 = df.groupby("possession")["time"].first().tolist()
        # All-False starting mask (assuming no row has time == -1)
        timetest = df["time"] == -1
        for t in poss2:
            timetest |= (df["time"] >= t - 1/12) & (df["time"] <= t)
        
        df.loc[pressures & timetest, "pressure_regain"] = 1
        
        # Box: rows starting inside x > 102, 18 < y < 62
        inthebox = (df["location_x"] > 102) & (df["location_y"] > 18) & (df["location_y"] < 62)
        outthebox = ~inthebox
        df.loc[inthebox, "inthebox"] = 1
        
        # Progress: three location-based tests, each applied to passes (type.id 30,
        # outcome "Complete") and carries (type.id 43), giving "<Pass|Carry>_<test>" flags.
        tests = {}
        tests["progress1"] = (df["location_x"] >= 60) & (df["end_location_x"] - df["location_x"] >= 10)
        tests["deepprogress"] = (df["location_x"] <= 80) & (df["end_location_x"] > 80)
        tests["boxentry"] = (df["end_location_x"] > 102) & (df["end_location_y"] > 18) & (df["end_location_y"] < 62) & outthebox
        for prog in [30, 43]:
            if prog == 30:
                prg = "Pass"
                test1 = (df["type.id"] == prog) & (df["pass.outcome.name"] == "Complete")
            else:
                prg = "Carry"
                test1 = df["type.id"] == prog
                
            for key in tests.keys():
                test2 = test1 & tests[key]

                df.loc[test2, "_".join([prg, key])] = 1
                
        # Touches: a fixed list of type ids always counts; other types count only
        # for the listed outcome ids, and only when the outcome column exists.
        # NOTE(review): the meaning of the individual ids is not visible here — confirm against the data spec.
        touches = df["type.id"].isin([16, 6, 3, 30, 38, 9])
        if "dribble.outcome.id" in df.columns:
            touches |= (df["type.id"] == 14) & (df["dribble.outcome.id"] == 9)
        intlist = [13, 14, 16, 17]
        if "interception.outcome.id" in df.columns:
            touches |= (df["type.id"] == 10) & (df["interception.outcome.id"].isin(intlist))
        if "duel.outcome.id" in df.columns:
            touches |= (df["type.id"] == 4) & (df["duel.outcome.id"].isin(intlist))
        if "foul_won.defensive" in df.columns:
            touches |= (df["type.id"] == 21) & (df["foul_won.defensive"] != True)
        else:
            touches |= (df["type.id"] == 21)
        if "foul_committed.offensive" in df.columns:
            touches |= (df["type.id"] == 22) & (df["foul_committed.offensive"] == True)
        fiftylist = [147, 148]
        if "50_50.outcome.id" in df.columns:
            touches |= (df["type.id"] == 33) & (df["50_50.outcome.id"].isin(fiftylist))
        gklist = [48, 51, 52, 53, 59, 16, 17, 13, 14, 117]
        if "goalkeeper.outcome.id" in df.columns:
            touches |= (df["type.id"] == 23) & (df["goalkeeper.outcome.id"].isin(gklist))
            
        df.loc[touches, "touch"] = 1
                           
        # match_id: file name without the ".json" extension, stored as a string
        match_id = os.path.basename(filepath).split(".json")[0]
        df.loc[:, "match_id"] = match_id
        
    return df

In [None]:
def json_dataFrame1(filepath):
    """Variant of json_dataFrame that computes "time" with column arithmetic.

    Loads and flattens the JSON file at ``filepath``. When the frame has
    "location", "period" and "timestamp" columns it adds "period_time"
    (minutes within the period), "half_start" (minutes elapsed before the
    period began) and "time" (their sum), followed by the same end-location,
    coordinate, pass-default, pressure-regain, box and progress columns as
    json_dataFrame. Unlike json_dataFrame it does not add a "touch" column.

    Parameters
    ----------
    filepath : str
        Path of the JSON file to load.

    Returns
    -------
    pandas.DataFrame
    """
    json_file = json_loads(filepath)
    df = pd.json_normalize(json_file)
    
    check_columns = {"location", "period", "timestamp"}
    if check_columns.issubset(df.columns): 
        df.loc[:, "period_time"] = [ttm(n) for n in df.loc[:, "timestamp"].tolist()]
        # Last period_time of each period, with a zero entry under label 0,
        # so that y[k] is the number of minutes played before period k+1.
        x = df.groupby(["period"])["period_time"].last()
        x[0] = 0        
        y = x.sort_index().cumsum()

        # The entry for the final period targets a period number that has no rows, so it assigns nothing
        for i in y.index:
            df.loc[df["period"] == (i+1), "half_start"] = y[i]
        df.loc[:, "time"] = df.loc[:, "period_time"] + df.loc[:, "half_start"]
        
        # Merge all "*end_location*" columns into a single "end_location" column
        elCols = [n for n in df.columns if "end_location" in n]
        els = [df[col].dropna() for col in elCols]
        df["end_location"] = pd.concat(els)
        
        # Split the list-valued location columns into one scalar column per coordinate
        for pre in ["end_", ""]:
            if pre == "end_":
                suffs = ["x", "y", "z"]
            else:
                suffs = ["x", "y"]
            cols = ["{}location_{}".format(pre, suff) for suff in suffs]
            lox = df["{}location".format(pre)].dropna()
            
            df[cols] = pd.DataFrame(lox.tolist(), columns = cols, index = lox.index)
            
        # On type.id 30 rows (labelled "Pass" further down) fill missing pass fields with defaults
        fna = {"pass.outcome.name": "Complete", "pass.cross": False, "pass.type.name": "Regular"}
        df.loc[df["type.id"] == 30, :] = df.loc[df["type.id"] == 30, :].fillna(fna)
        
        # Pressures: flag type.id 17 rows whose time lies within 1/12 minute
        # (5 seconds) before, or exactly at, the first event time of any possession.
        pressures = df["type.id"] == 17
        poss2 = df.groupby("possession")["time"].first().tolist()
        # All-False starting mask (assuming no row has time == -1)
        timetest = df["time"] == -1
        for t in poss2:
            timetest |= (df["time"] >= t - 1/12) & (df["time"] <= t)
        
        df.loc[pressures & timetest, "pressure_regain"] = 1
        
        # Box: rows starting inside x > 102, 18 < y < 62
        inthebox = (df["location_x"] > 102) & (df["location_y"] > 18) & (df["location_y"] < 62)
        outthebox = ~inthebox
        df.loc[inthebox, "inthebox"] = 1
        
        # Progress: three location-based tests, each applied to passes (type.id 30,
        # outcome "Complete") and carries (type.id 43), giving "<Pass|Carry>_<test>" flags.
        tests = {}
        tests["progress1"] = (df["location_x"] >= 60) & (df["end_location_x"] - df["location_x"] >= 10)
        tests["deepprogress"] = (df["location_x"] <= 80) & (df["end_location_x"] > 80)
        tests["boxentry"] = (df["end_location_x"] > 102) & (df["end_location_y"] > 18) & (df["end_location_y"] < 62) & outthebox
        for prog in [30, 43]:
            if prog == 30:
                prg = "Pass"
                test1 = (df["type.id"] == prog) & (df["pass.outcome.name"] == "Complete")
            else:
                prg = "Carry"
                test1 = df["type.id"] == prog
                
            for key in tests.keys():
                test2 = test1 & tests[key]

                df.loc[test2, "_".join([prg, key])] = 1
         
        # match_id: file name without the ".json" extension, stored as a string
        match_id = os.path.basename(filepath).split(".json")[0]
        df.loc[:, "match_id"] = match_id
        
    return df

In [None]:
# One row per entry of competitions.json; the loop over matches below reads
# "competition_id", "season_id", "competition_name" and "season_name" from it.
competitionsDataFrame = json_dataFrame(competitions)

In [None]:
%timeit pd.json_normalize(json_loads(events + "\\{}.json".format(18243)))

In [None]:
%timeit json_dataFrame1("{}\\{}.json".format(events, 18243))

In [None]:
%timeit json_dataFrame("{}\\{}.json".format(events, 18243))

In [None]:
# Build one overview table of all matches: for every competition/season row,
# load matches/<competition_id>/<season_id>.json and collect the frames.
match_list = []
for comp_index, comp_row in competitionsDataFrame.iterrows():

    path = os.path.join(matches, str(comp_row["competition_id"]), str(comp_row["season_id"]) + ".json")
    matchesDataFrame = json_dataFrame(path)
    match_list.append(matchesDataFrame)
    
    # Progress output: one line per competition/season loaded
    print(comp_row["competition_name"], comp_row["season_name"])
        
# Concatenate once after the loop and renumber the rows 0..n-1
allmatchesDataFrame = pd.concat(match_list)
allmatchesDataFrame.index = range(len(allmatchesDataFrame))

In [None]:
def NameColumn(row):
    """Return the display name for one lineup row.

    Uses ``row["player_nickname"]`` when it is a string and falls back to
    ``row["player_name"]`` otherwise. The fallback covers both ``None`` and
    NaN nicknames; NaN previously reached ``.strip()`` and raised
    AttributeError on a float.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "player_nickname" and "player_name".

    Returns
    -------
    str
        The chosen name with surrounding whitespace removed.
    """
    nickname = row["player_nickname"]
    name = nickname if isinstance(nickname, str) else row["player_name"]
    return name.strip()

In [None]:
def ShortName(name):
    """Abbreviate a full name to "<first initial>. <surname>".

    Single-word names are returned unchanged. The surname is the last word,
    or everything from the first "van" token onwards when one is present.
    """
    parts = name.split(" ")
    if len(parts) == 1:
        return name
    surname_start = parts.index("van") if "van" in parts else -1
    return parts[0][0] + ". " + " ".join(parts[surname_start:])

In [None]:
def draw_pitch(pitch_color = "white", line_color = "black", y_size = 10):
    """Create a figure with a 120 x 80 pitch outline drawn as patches.

    Parameters
    ----------
    pitch_color : figure background colour
    line_color : colour of every pitch marking
    y_size : figure height in inches; the width is 1.5 times this

    Returns
    -------
    (fig, ax) with axes hidden and limits of -5..125 (x) and -5..85 (y).
    """
    fig, ax = plt.subplots(figsize = (y_size * 1.5, y_size))
    ax.axis("off")
    fig.patch.set_facecolor(pitch_color)
    for side in ("top", "bottom", "left", "right"):
        ax.spines[side].set_visible(False)

    ax.set_xlim(-5, 125)
    ax.set_ylim(-5, 85)

    # Shared keyword sets: outlined shapes, filled spots and plain lines/arcs
    outline = {"color": line_color, "fill": False, "zorder": 1}
    filled = {"color": line_color, "fill": True, "zorder": 1}
    stroke = {"color": line_color, "zorder": 1}

    markings = [
        pch.Rectangle([0, 0], width = 120, height = 80, **outline),                # pitch boundary
        pch.ConnectionPatch([60, 0], [60, 80], "data", "data", **stroke),          # halfway line
        pch.Circle((60, 40), 10, **outline),                                       # centre circle
        pch.Circle((60, 40), 0.3, **filled),                                       # centre spot
        pch.Rectangle([0, 18], width = 18, height = 44, **outline),                # left penalty area
        pch.Rectangle([102, 18], width = 18, height = 44, **outline),              # right penalty area
        pch.Circle((12, 40), 0.3, **filled),                                       # left penalty spot
        pch.Circle((108, 40), 0.3, **filled),                                      # right penalty spot
        pch.Rectangle([0, 30], width = 6, height = 20, **outline),                 # left six-yard box
        pch.Rectangle([114, 30], width = 6, height = 20, **outline),               # right six-yard box
        pch.Rectangle([-3, 36], width = 3, height = 8, **outline),                 # left goal
        pch.Rectangle([120, 36], width = 3, height = 8, **outline),                # right goal
        pch.Arc((12, 40), height = 17.8, width = 17.8, angle = 0,
                theta1 = 310, theta2 = 50, **stroke),                              # left penalty arc
        pch.Arc((108, 40), height = 17.8, width = 17.8, angle = 0,
                theta1 = 130, theta2 = 230, **stroke),                             # right penalty arc
    ]
    for marking in markings:
        ax.add_patch(marking)

    return fig, ax

In [None]:
def LineupDataFrame(match_id):
    """Load lineups/<match_id>.json and return one row per listed player.

    Each team's "lineup" list is flattened with pd.json_normalize and tagged
    with "team.name" and "team.id"; the frames are concatenated (row labels
    restart per team) and "Name" (NameColumn) and "shortName" (ShortName)
    columns are added.
    """
    squads = json_loads(os.path.join(lineups, str(match_id) + ".json"))

    frames = []
    for squad in squads:
        players = pd.json_normalize(squad["lineup"])
        players.loc[:, "team.name"] = squad["team_name"]
        players.loc[:, "team.id"] = squad["team_id"]
        frames.append(players)
    ldf = pd.concat(frames)

    ldf.loc[:, "Name"] = ldf.apply(NameColumn, axis = 1)
    ldf.loc[:, "shortName"] = list(map(ShortName, ldf.loc[:, "Name"]))
    return ldf

In [None]:
def TwoTeamOneGraph(matchdf, lineupdf,  overdf, graph):
    """Draw one figure comparing both teams of a match.

    Parameters
    ----------
    matchdf : pandas.DataFrame
        Event frame for a single match as produced by json_dataFrame (uses
        "period", "time", "match_id", "team.name", "type.id", "type.name",
        "player.id", the shot columns and "location_x"/"location_y").
    lineupdf : pandas.DataFrame
        Lineup frame with "player_id" and "shortName" columns.
    overdf : pandas.DataFrame
        Match overview frame; the row whose "match_id" equals the match id of
        ``matchdf`` supplies team names, score, date and competition.
    graph : str
        "shotmap" (shots on a pitch, coloured by xG) or "step" (cumulative xG
        over time). Case-insensitive.

    Returns
    -------
    (fig, ax)

    Raises
    ------
    ValueError
        If ``graph`` is not one of the supported names.
    """
    kind = graph.lower()
    if kind not in ("shotmap", "step"):
        raise ValueError('graph must be "shotmap" or "step", got {!r}'.format(graph))

    # Periods 1-4 only; period 5 (presumably the penalty shoot-out) is left out
    df = matchdf.loc[matchdf["period"] < 5, :].copy()
    ldf = lineupdf

    # Positional lookup: does not rely on a row labelled 0 surviving the filter
    match_id = int(df["match_id"].iloc[0])
    overview = overdf.loc[overdf["match_id"] == match_id, :]
    overviewdict = overview.to_dict(orient = "records")[0]

    home_team = overviewdict["home_team.home_team_name"]
    away_team = overviewdict["away_team.away_team_name"]
    teams = [home_team, away_team]

    hat = {}
    hat["home"] = df["team.name"] == home_team
    hat["away"] = ~(hat["home"])
    shots = df["type.id"] == 16
    goals = df["shot.outcome.id"] == 97
    # Compare the name column with the event name. The previous comparison of
    # "type.name" with the integer 20 could never match, so own goals were
    # silently dropped from both graphs.
    ogs = df["type.name"] == "Own Goal Against"

    cmap = "RdYlGn"

    if kind == "shotmap":
        fig, ax = draw_pitch()
        scatter = {"size": 100, "alpha": 0.6}
        # Reference corner per side; |corner - location| mirrors the away team
        # so the two teams shoot towards opposite ends.
        lox = {"home": [0, 80], "away": [120, 0]}
        marks = {"shots": "o", "goals": "s", "ogs": "x"}

        things = {}
        for ha in hat.keys():
            t = hat[ha]
            things["{}_shots".format(ha)] = df.loc[t & shots & ~goals, :]
            things["{}_goals".format(ha)] = df.loc[t & goals, :]
            # Own goals are recorded against the other team
            things["{}_ogs".format(ha)] = df.loc[~t & ogs, :]

        for ha in ["home", "away"]:
            for sg in ["shots", "goals"]:
                thing = things["_".join([ha, sg])]
                # Scale so that 0.8 xG maps to the top of the colour range
                color = thing["shot.statsbomb_xg"] / 0.8
                ax.scatter((lox[ha][0] - thing["location_x"]).abs(), (lox[ha][1] - thing["location_y"]).abs(),
                           s = scatter["size"], marker = marks[sg], zorder = 2, alpha = scatter["alpha"], c = color,
                           cmap = cmap, edgecolor = "black", vmin = 0, vmax = 1)

            thing = things["_".join([ha, "ogs"])]
            notha = "away" if ha == "home" else "home"
            # Own-goal locations are in the conceding team's frame, hence lox[notha]
            ax.scatter((lox[notha][0] - thing["location_x"]).abs(), (lox[notha][1] - thing["location_y"]).abs(),
                       s = scatter["size"], marker = marks["ogs"], zorder = 2, alpha = scatter["alpha"])

        norm = mpl.colors.Normalize(vmin = 0, vmax = 0.8)
        fig.colorbar(ax = ax, mappable = cm.ScalarMappable(cmap = cmap, norm = norm), shrink = 0.7)

        textLoc = {"start_y": 83, "inc_y": 3, "start_x": 60, "inc_x": 5}

    else:
        halves = df.loc[df["type.id"] == 34, "time"].unique()

        # The card column only exists when at least one foul in the match drew a card
        card_col = "foul_committed.card.name"
        if card_col in df.columns:
            sent_off = df[card_col].isin(["Second Yellow", "Red Card"])
        else:
            sent_off = pd.Series(False, index = df.index)
        redcards = df.loc[sent_off, ["time", "player.id", "team.name"]]

        tests2 = goals | ogs

        fig, ax = plt.subplots(figsize = (12, 9))

        homeaway = ["home", "away"]
        start = 0
        for team_no, ha in enumerate(homeaway):
            color = "C{}".format(team_no)
            other = homeaway[1 - team_no]

            # A team's line is built from its own shots plus the opponent's own goals
            tdf = df.loc[(shots & hat[ha]) | (ogs & hat[other]), :].copy()
            # Own-goal rows carry no xG; count them as 0 so the running total stays defined
            tdf["cumsum"] = tdf["shot.statsbomb_xg"].fillna(0).cumsum()

            x = [0] + tdf["time"].tolist() + [halves.max()]
            y = [0] + tdf["cumsum"].tolist()
            # Extend the last level to full time (the running total never decreases)
            y.append(y[-1])

            ax.step(x, y, where = "post", color = color)

            gdf = tdf.loc[tests2, :]
            ax.scatter(gdf.time, gdf["cumsum"], color = color)
            for _, row in gdf.iterrows():
                name = ldf.loc[ldf["player_id"] == row["player.id"], "shortName"].tolist()[0]
                if row["shot.type.name"] == "Penalty":
                    name += " (P)"
                if row["type.name"] == "Own Goal Against":
                    name += " (OG)"
                # Label passed positionally: the keyword was renamed from "s" to "text" across matplotlib versions
                ax.annotate(name, xy = (row["time"] - 0.5 , row["cumsum"] * 1.02), horizontalalignment = "right")

            if max(y) > start:
                start = max(y)

        for side in ["top", "bottom", "left", "right"]:
            ax.spines[side].set_visible(False)

        for half in halves:
            ax.axvline(half, color = "C2")

        for _, row in redcards.iterrows():
            ax.axvline(row["time"], color = "red")
            name = "{} (RC)".format(ldf.loc[ldf["player_id"] == row["player.id"], "shortName"].tolist()[0])
            color = "C{}".format(teams.index(row["team.name"]))
            ax.annotate(name, xy = (row["time"] - 0.5, start * 0.9), color = color, horizontalalignment = "right")

        ax.axvline(color = "k")
        ax.axhline(color = "k")

        xticks = range(0, int(halves.max()), 15)
        ax.set_xticks(xticks)
        ax.grid(True)

        textLoc = {"start_y": start * 1.1, "inc_y": start/12, "start_x": df.time.max()/2,
                   "inc_x": df.time.max()/24, "hteamc": "C0", "ateamc": "C1"}

    # Header block shared by both graphs: xG, shots, score and team names
    grouped = df.groupby("team.name")
    xgs = grouped["shot.statsbomb_xg"].sum()
    shts = grouped["type.name"].value_counts().unstack().loc[:, "Shot"]

    texts = [[str(round(xgs[home_team], 2)), "xG", str(round(xgs[away_team], 2))],
            [int(shts[home_team]), "Shots", int(shts[away_team])],
            [overviewdict["home_score"], "Score", overviewdict["away_score"]],
            [home_team, "v", away_team]]

    for text in texts:
        # The team-name row is drawn slightly larger than the statistic rows
        f = 12 if text[0] == home_team else 10
        ax.text(textLoc["start_x"] - textLoc["inc_x"], textLoc["start_y"], text[0], horizontalalignment = "right", fontsize = f, weight = "bold")
        ax.text(textLoc["start_x"], textLoc["start_y"], text[1], horizontalalignment = "center", fontsize = f)
        ax.text(textLoc["start_x"] + textLoc["inc_x"], textLoc["start_y"], text[2], horizontalalignment = "left", fontsize = f, weight = "bold")
        textLoc["start_y"] += textLoc["inc_y"]

    ax.text(textLoc["start_x"], textLoc["start_y"], overviewdict["match_date"], horizontalalignment = "center")
    ax.text(textLoc["start_x"], textLoc["start_y"] + textLoc["inc_y"], overviewdict["competition.competition_name"], horizontalalignment = "center")

    fig.tight_layout()

    return fig, ax
        

In [None]:
def OneTeamOneGraph(team_name, matchdf, lineupdf,  overdf, graph):
    """Draw one figure for a single team in a match.

    Parameters
    ----------
    team_name : str
        Value of "team.name" selecting the team to plot.
    matchdf : pandas.DataFrame
        Event frame for a single match as produced by json_dataFrame.
    lineupdf : pandas.DataFrame
        Lineup frame with "player_id" and "shortName" columns.
    overdf : pandas.DataFrame
        Match overview frame; the row whose "match_id" equals the match id of
        ``matchdf`` supplies team names, score, date and competition.
    graph : str
        "passmap" (median receipt positions joined by pass arrows) or
        "pressureheatmap" (binned pressure counts). Case-insensitive.

    Returns
    -------
    (fig, ax)

    Raises
    ------
    ValueError
        If ``graph`` is not one of the supported names.
    """
    kind = graph.lower()
    if kind not in ("passmap", "pressureheatmap"):
        raise ValueError('graph must be "passmap" or "pressureheatmap", got {!r}'.format(graph))

    # Periods 1-4 only; period 5 (presumably the penalty shoot-out) is left out
    df = matchdf.loc[matchdf["period"] < 5, :].copy()

    # Take the match id from the data itself. This function used to read a
    # notebook-level `match_id` variable, so it only worked after some other
    # cell had happened to assign it.
    match_id = int(df["match_id"].iloc[0])
    overview = overdf.loc[overdf["match_id"] == match_id, :]
    overviewdict = overview.to_dict(orient = "records")[0]

    home_team = overviewdict["home_team.home_team_name"]
    away_team = overviewdict["away_team.away_team_name"]

    ldf = lineupdf

    cmap_name = "RdYlGn"
    # pyplot.get_cmap is available across matplotlib versions; cm.get_cmap was removed in recent ones
    cmap = plt.get_cmap(cmap_name)

    teamtest = (df["team.name"] == team_name)

    if kind == "passmap":
        fig, ax = draw_pitch(pitch_color = "white", line_color = "black", y_size = 8)

        # Type 42 rows with outcome id other than 9 (presumably successful ball receipts) give
        # positions; complete passes give node sizes and links.
        brdf = df.loc[teamtest & (df["type.id"] == 42) & (df["ball_receipt.outcome.id"] != 9), : ]
        pdf = df.loc[teamtest & (df["type.id"] == 30) & (df["pass.outcome.name"] == "Complete"), : ]

        location = brdf.groupby("player.id")[["location_x", "location_y"]].median()
        size = pdf.groupby("player.id")["id"].count()

        newdf = pd.concat([location, size], axis =1 )

        # Home team is drawn attacking left-to-right, away team right-to-left
        if team_name == home_team:
            newdf["x"] = newdf["location_x"]
            newdf["y"] = 80 - newdf["location_y"]

            ax.arrow(20, 40, 80, 0, width = 10, facecolor = "white", alpha = 0.25,
                     zorder = 1, length_includes_head = True, head_width = 20, head_length = 22, edgecolor = "k")
        else:
            newdf["x"] = 120 - newdf["location_x"]
            newdf["y"] = newdf["location_y"]

            ax.arrow(100, 40, -80, 0, width = 10, facecolor = "white", alpha = 0.25,
                     zorder = 1, length_includes_head = True, head_width = 20, head_length = 22, edgecolor = "k")

        ax.scatter(newdf["x"], newdf["y"], s = 15 * newdf["id"], c = "white", edgecolor = "k", zorder = 2)

        for player_id, row in newdf.iterrows():
            # Players without a complete pass have a NaN count and get no label
            if row["id"] > 0:
                name = ldf.loc[ldf["player_id"] == player_id, "shortName"].values[0]
                # Label passed positionally: the keyword was renamed from "s" to "text" across matplotlib versions
                ax.annotate(name, xy = (row["x"], row["y"] - 4), horizontalalignment = "center", zorder = 3)

        pairings = pdf.groupby(["player.id", "pass.recipient.id"])["id"].count().reset_index()
        for _, row in pairings.iterrows():
            # Only draw links with at least four completed passes
            if row["id"] >= 4:
                start = newdf.loc[row["player.id"], ["x", "y"]]
                end = newdf.loc[row["pass.recipient.id"], ["x", "y"]]
                diff = (end - start)
                angle1 = (math.atan2(diff["y"], diff["x"]))
                # Offset perpendicular to the pass direction so A->B and B->A do not overlap
                angle2 = (angle1  + 0.5 * math.pi) % (2 * math.pi)
                dz = [0.4 * math.cos(angle2), 0.3 * math.sin(angle2)]

                alpha = min(1, row["id"] / 30)
                ax.arrow(start["x"] + dz[0], start["y"] + dz[1], diff["x"]* 0.9, diff["y"] * 0.86, width = 0.9,
                           alpha = alpha, zorder = 1, shape = "right", color = cmap(row["id"]/30))

    else:
        no_bins_x, no_bins_y = 6, 4
        no_bins = no_bins_x * no_bins_y
        # Named match_minutes so the imported `time` module is not shadowed
        match_minutes = df.time.max()

        press = df["type.id"] == 17
        # NOTE(review): 174 looks like a reference number of pressures per 90 minutes — confirm its source
        presses = 174/no_bins
        v = presses

        df["x_bin"] = pd.cut(df.location_x, bins = np.linspace(0, 120, 1 + no_bins_x), right = True)
        df["y_bin"] = pd.cut(df.location_y, bins = np.linspace(0, 80, 1 + no_bins_y), right = True)

        fig, ax = draw_pitch(line_color = "k", pitch_color = "white")
        pressures = df.loc[teamtest & press, :]
        # observed=False keeps empty bins so the unstacked grid is always no_bins_y x no_bins_x
        counts = pressures.groupby(["y_bin", "x_bin"], observed = False)["id"].count()
        # Difference from the per-bin reference, scaled to 90 minutes
        showim = ((counts - presses) * 90 / match_minutes).unstack()

        # Both teams are drawn in the same orientation; only the arrow colour differs
        x = pressures["location_x"]
        y = 80 - pressures["location_y"]
        color = "C0" if team_name == home_team else "C1"

        ax.scatter(x, y, c = "k")
        ax.imshow(showim, extent = [0, 120, 0, 80], vmin = -1 * v, vmax = v, cmap = cmap_name, alpha = 0.5)

        norm = mpl.colors.Normalize(vmin = -1* presses, vmax = presses)
        fig.colorbar(cm.ScalarMappable(cmap=cmap_name, norm = norm), ax=ax, shrink = 0.7)

        ax.arrow(20, -4, 80, 0, width = 0.75, color = color)

    # Header block: score row, then team-name row with the plotted team highlighted
    texts = [overviewdict["home_score"], "Score", overviewdict["away_score"]], [home_team, "v", away_team]
    start = 83
    inc = 3
    for text in texts:
        f = 12 if text[0] == home_team else 10
        if text[0] == team_name:
            color = ["C0", "k", "k"]
        elif text[2] == team_name:
            color = ["k", "k", "C1"]
        else:
            color = 3 * ["k"]
        ax.text(55, start, text[0], horizontalalignment = "right", fontsize = f, weight = "bold", color = color[0])
        ax.text(60, start, text[1], horizontalalignment = "center", fontsize = f, color = color[1])
        ax.text(65, start, text[2], horizontalalignment = "left", fontsize = f, weight = "bold", color = color[2])
        start += inc

    ax.text(60, start, overviewdict["match_date"], horizontalalignment = "center")
    ax.text(60, start + inc, overviewdict["competition.competition_name"], horizontalalignment = "center")

    fig.tight_layout()

    return fig, ax
 

In [None]:
# TODO: extract the header-text drawing shared by the plotting functions into a helper,
# e.g. textAdd(fig, ax, matchdf, overdf, textLoc)
    

In [None]:
def MatchPlots(match_id, overdf):
    """Load one match and show all four plots for it.

    Shows the two-team "shotmap" and "step" figures, then a "passmap" and a
    "pressureheatmap" for each team that appears in the event data.

    Parameters
    ----------
    match_id : int or str
        Match identifier; events/<match_id>.json and lineups/<match_id>.json are read.
    overdf : pandas.DataFrame
        Match overview frame passed through to the plotting functions.
    """
    # os.path.join instead of a hand-written "//" separator, matching the other loaders
    df = json_dataFrame(os.path.join(events, "{}.json".format(match_id)))
    ldf = LineupDataFrame(match_id)

    for g in ["shotmap", "step"]:
        fig, ax = TwoTeamOneGraph(matchdf = df, lineupdf = ldf, overdf = overdf, graph = g)
        plt.show()

    # The team list does not depend on the graph type, so compute it once
    teams = df["team.name"].unique()
    for g in ["passmap", "pressureheatmap"]:
        for team in teams:
            fig, ax = OneTeamOneGraph(team_name = team, matchdf = df, lineupdf = ldf, overdf = overdf, graph = g)
            plt.show()

In [None]:
# Display the match_id of every row in the overview table (the ids accepted by MatchPlots)
allmatchesDataFrame["match_id"].values

In [None]:
def gtz(number):
    """Return 1 when ``number`` is greater than zero, otherwise 0."""
    return 1 if number > 0 else 0

In [None]:
def results(a, b):
    """Return the outcome labels for two scores as ``[label_for_a, label_for_b]``."""
    if a > b:
        return ["win", "loss"]
    if b > a:
        return ["loss", "win"]
    # Neither side is ahead
    return ["draw", "draw"]

In [None]:
def simulate_match2(df, iterations = 1000):
    """Simulate a match many times from its shots' xG values.

    The two teams are the "team.id" values of the rows labelled 0 and 1. In
    every iteration each shot (type "Shot", period < 5) becomes a goal when
    its xG is >= a fresh ``random.random()`` draw; the goal totals decide a
    win (3 points) or a draw (1 point each).

    Returns
    -------
    pandas.DataFrame
        Indexed by team id with columns "sim_win", "sim_draw", "sim_loss"
        (shares of iterations) and "xpoints" (3 * win share + draw share).
    """
    in_play = df["period"] < 5
    is_shot = df["type.name"] == "Shot"
    teams = df.loc[[0, 1], "team.id"].tolist()
    first, second = teams[0], teams[1]

    shot_xg = df.loc[in_play & is_shot, ["shot.statsbomb_xg", "team.id"]]
    xg_by_team = {}
    for team in (first, second):
        xg_by_team[team] = shot_xg.loc[df["team.id"] == team, "shot.statsbomb_xg"].tolist()

    wins = {first: 0, second: 0}
    draws = {first: 0, second: 0}

    for _ in range(iterations):
        scored = {}
        for team in teams:
            # One random draw per shot, consumed in shot order
            scored[team] = sum([xg >= random.random() for xg in xg_by_team[team]])

        if scored[first] > scored[second]:
            wins[first] += 1
        elif scored[second] > scored[first]:
            wins[second] += 1
        else:
            draws[first] += 1
            draws[second] += 1

    rows = []
    for team in teams:
        win_share = wins[team] / iterations
        draw_share = draws[team] / iterations
        rows.append({"sim_win": win_share,
                     "sim_draw": draw_share,
                     "sim_loss": 1 - win_share - draw_share,
                     "xpoints": 3 * win_share + draw_share})

    return pd.DataFrame(rows, index = teams)
    

In [None]:
# Plot one randomly chosen Champions League match: shuffle the ids, draw the
# plots for the first one and stop.
# NOTE: the loop variable `match_id` stays defined at notebook level afterwards,
# and MatchSummary reads that global rather than taking it as a parameter.
clgames = allmatchesDataFrame.loc[allmatchesDataFrame["competition.competition_name"] == "Champions League", "match_id"].tolist()
np.random.shuffle(clgames)
for match_id in clgames:
    print(match_id)
    MatchPlots(match_id, overdf = allmatchesDataFrame) 
    break
   

In [None]:
# Plot one match drawn at random from every competition in the overview table.
# No random seed is set, so the chosen match differs between runs.
match_ids = allmatchesDataFrame["match_id"].values
match_choices = np.random.choice(match_ids, size = 1)

# NOTE: `match_id` stays defined at notebook level after the loop, and
# MatchSummary reads that global rather than taking it as a parameter.
for match_id in match_choices:
    print(match_id)
    MatchPlots(match_id, overdf = allmatchesDataFrame)

In [None]:
def xGchain(df):
    """Sum, per player, the best shot xG of every possession the player was part of.

    Rows with "shot.type.id" equal to 88 are excluded first (presumably
    penalties — confirm). A player is credited with a possession when they
    have at least one event in it for the team in possession, and only
    possessions whose maximum "shot.statsbomb_xg" is above zero count.

    Returns
    -------
    pandas.DataFrame
        Indexed by "player.id" with a single "xGchain" column.
    """
    open_play = df.loc[df["shot.type.id"] != 88, :]

    involvement_keys = ["possession_team.id", "possession", "team.id", "player.id"]
    involvement = open_play.groupby(involvement_keys)["index"].count().reset_index()
    # Each possession is valued by its single best shot, not the sum of its shots
    best_xg = open_play.groupby(["possession"])["shot.statsbomb_xg"].max()

    merged = involvement.merge(best_xg, on = "possession", how = "outer")
    own_possession = merged["possession_team.id"] == merged["team.id"]
    has_shot = merged["shot.statsbomb_xg"] > 0
    credited = merged.loc[own_possession & has_shot, :]

    chain = credited.groupby("player.id")["shot.statsbomb_xg"].sum()
    return pd.DataFrame(chain).rename(columns = {"shot.statsbomb_xg": "xGchain"})

In [None]:
def xGchain1(df):
    """Unfinished variant of xGchain that filters penalties by name.

    NOTE(review): the body only builds a grouped selection and stops; nothing
    is aggregated or returned, so every call returns None.
    """
    noPen = df.loc[df["shot.type.name"] != "Penalty", :]
    # Grouped "id" selection per possession team / possession / team / player; not used further
    a = noPen.groupby(["possession_team.id", "possession", "team.id", "player.id"])["id"]

In [None]:
def ratiodf(df):
    """Return a copy of ``df`` with ratio columns added.

    The ratios are possession%, xG/shot, NPxG/shot, aerialwin%, dribble%,
    NPshot%, pass%, cross% and shottouch%. When more than ten column names
    contain "Against", the same ratios are also computed for the
    ".Against"-suffixed columns. The input frame is not modified.
    """
    df = df.copy()
    against_columns = [name for name in df.columns if "Against" in name]
    suffixes = ["", ".Against"] if len(against_columns) > 10 else [""]

    for sfx in suffixes:
        def col(name):
            # Column ``name`` for the current suffix
            return df.loc[:, name + sfx]

        def present_total(names):
            # Row-wise sum over whichever of the suffixed columns exist
            wanted = [name + sfx for name in names]
            return df.loc[:, df.columns.intersection(wanted)].sum(axis = 1)

        # The denominator "possessions" is deliberately taken without the suffix, as in the source data layout
        df.loc[:, "possession%" + sfx] = col("possession") / df.loc[:, "possessions"]
        df.loc[:, "xG/shot" + sfx] = col("shot.statsbomb_xg") / col("type.Shot")
        df.loc[:, "NPxG/shot" + sfx] = col("shot.statsbomb_xg.Non Penalty") / col("shot.Non Penalty")
        df.loc[:, "aerialwin%" + sfx] = col("aerialwin") / present_total(["duel.Aerial Lost", "aerialwin"])
        df.loc[:, "dribble%" + sfx] = col("dribble.Complete") / col("type.Dribble")
        df.loc[:, "NPshot%" + sfx] = present_total(["shot.Non Penalty.Goal", "shot.On T.No Goal"]) / col("shot.Non Penalty")
        df.loc[:, "pass%" + sfx] = col("pass.Complete") / col("type.Pass")
        df.loc[:, "cross%" + sfx] = col("pass.Cross.Complete") / col("pass.cross")
        df.loc[:, "shottouch%" + sfx] = col("type.Shot") / col("touch")

    return df

In [None]:
def columnFlatten(cols, prefix ):
    """Join multi-level column labels into dotted strings.

    Every level is converted with str(). Within each label the first "index"
    level becomes ``prefix``, the first "True" becomes "Cross" and the first
    "False" becomes "Non Cross", applied in that order.

    Returns a list with one flattened name per entry of ``cols``.
    """
    substitutions = (("index", prefix), ("True", "Cross"), ("False", "Non Cross"))
    flattened = []
    for label in cols:
        levels = list(map(str, label))
        for old, new in substitutions:
            try:
                levels[levels.index(old)] = new
            except ValueError:
                # This label has no level equal to ``old``
                pass
        flattened.append(".".join(levels))
    return flattened
        

In [None]:
def PossessionAdjust(df, cols):
    """Add possession-adjusted copies of the given columns to ``df``.

    Each column in ``cols`` is multiplied by
    ``2 / (1 + e ** (-10 * (possession / possessions - 0.5)))`` and stored as
    "<column>.PAdj". The frame is modified in place and also returned.
    """
    for name in cols:
        share = df.loc[:, "possession"] / df.loc[:, "possessions"]
        # Logistic weight: exactly 1 at a 50 % share, approaching 2 (or 0) at the extremes
        weight = 2 / (1 + np.e ** (-10 * (share - 0.5)))
        df.loc[:, name + ".PAdj"] = df.loc[:, name] * weight
    return df

In [None]:
def fix(item):
    """Wrap a string in a one-element array; pass plain ndarrays through.

    Anything that is neither a str nor exactly an ``np.ndarray`` yields None.
    """
    if isinstance(item, str):
        return np.array([item])
    if type(item) == np.ndarray:
        return item
    return None

def PosStencil(position):
    """Map a numeric position id to a role name used as a stencil key.

    Returns "goalkeeper", "fullback", "centreback", "midfielder", "winger"
    or "striker"; ids outside the listed groups give None.
    """
    role_groups = (
        ("goalkeeper", (1,)),
        ("fullback", (2, 6, 7, 8)),
        ("centreback", (3, 4, 5)),
        ("midfielder", (9, 10, 11, 13, 14, 15)),
        ("winger", (18, 19, 20, 12, 16, 17, 21)),
        ("striker", (25, 22, 23, 24)),
    )
    for role, ids in role_groups:
        if position in ids:
            return role
    return None
    

In [None]:
def templates():
    """Return the metric lists ("stencils") used per role and per team view.

    Keys are the five role names "striker", "winger", "midfielder",
    "fullback" and "centreback", plus the team-level "attacking" and
    "defending" views. Each value is a list of column names.

    The locally built category lists that were never referenced have been
    removed; the returned dictionary is unchanged.
    """
    stencils = {"striker":  ["shot.statsbomb_xg.Non Penalty", "shot.Non Penalty", "shottouch%", "touch.inthebox",
                             "pass.xA", "pressure_regain", "type.Pressure", "aerialwin",
                             "turnover", "dribble.Complete", "NPxG/shot"],

               "winger": ["shot.statsbomb_xg.Non Penalty", "shot.Non Penalty", "touch.inthebox",
                          "pass%", "pass.xA", "Pass_boxentry", "Pass_progress1",
                          "Carry_progress1", "dribble.Complete", "Carry_boxentry", "type.Foul Won",
                          "turnover", "pressure_regain", "NPxG/shot"],

               "midfielder": ["pass%", "deepprogress", "pass.xA", "pass.xGbuildup", "Pass_progress1", "Pass_boxentry",
                              "Carry_progress1", "dribble.Complete", "type.Foul Won", "turnover",
                              "pressure_regain", "type.Pressure",
                              "type.Interception.PAdj", "duel.Tackle.PAdj"],

               "fullback": ["duel.Tackle.PAdj", "type.Interception.PAdj", "type.Pressure", "cross%",
                            "Pass_progress1", "deepprogress", "pass%", "pass.xGbuildup", "dribble.Complete",
                            "turnover", "aerialwin", "type.Foul Committed"],

               "centreback": ["pass%", "type.Pressure", "type.Foul Committed", "duel.Tackle.PAdj",
                              "type.Interception.PAdj", "aerialwin", "aerialwin%", "type.Clearance",
                              "pass.xGbuildup", "Pass_progress1"],

               "attacking": ["shot.statsbomb_xg.Non Penalty", "NPxG/shot", "shot.Non Penalty"],

               "defending": ["shot.statsbomb_xg.Non Penalty.Against", "NPxG/shot.Against", "shot.Non Penalty.Against",
                             "pass%.Against"]}
    return stencils

In [None]:
def MatchSummary(matchdf, lineupdf, overdf, player_or_team, ratio = False):
    """Summarise one match's events per player or per team.

    Parameters
    ----------
    matchdf : pandas.DataFrame
        Event table for a single match. The columns referenced in the body
        (e.g. "period", "time", "touch", "inthebox", "index", "possession",
        "tactics.lineup", "type.id") must be present.
    lineupdf : pandas.DataFrame
        Lineup table. In "player" mode its "player_id" column is renamed to
        "player.id" and used as the index so it can be concatenated with the
        other per-player tables.
    overdf : pandas.DataFrame
        Match overview table; the row whose "match_id" equals the current
        match supplies team names and competition/season metadata.
    player_or_team : str
        "player" to group by "player.id", "team" to group by "team.id".
    ratio : bool
        Currently unused; kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame or str
        One row per player/team, or the string "fail" when ``player_or_team``
        is neither "player" nor "team".
    """
    if player_or_team not in ["player", "team"]:
        return "fail"

    if player_or_team == "player":
        grouper = "player.id"
    elif player_or_team == "team":
        grouper = "team.id"

    # NOTE(review): `match_id` is not a parameter. It is read from the
    # enclosing notebook scope, so the caller must have set it to the match
    # being summarised before calling this function.
    overdict = overdf.loc[overdf["match_id"] == match_id, :].to_dict(orient = "records")[0]
    teams = {}
    teams[overdict["home_team.home_team_id"]] = overdict["home_team.home_team_name"]
    teams[overdict["away_team.away_team_id"]] = overdict["away_team.away_team_name"]

    ldf = lineupdf
    if player_or_team == "player":
        # rename() returns a new frame, so the in-place set_index below does
        # not touch the caller's lineupdf.
        ldf = ldf.rename(columns = {"player_id": "player.id"})
        ldf.set_index("player.id", inplace = True)

    # Keep only periods below 5 (presumably this drops penalty shoot-outs - confirm).
    df = matchdf.loc[matchdf["period"] < 5, :].copy()
    df.loc[:, "touch.inthebox"] = df.loc[:, "touch"] * df.loc[:, "inthebox"]

    #xA DataFrame
    for col in ["pass.shot_assist", "pass.goal_assist"]:
        if col not in df.columns:
            # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
            df.loc[:, col] = np.nan
    assists = df.loc[:, "pass.assisted_shot_id"].dropna()
    # Attach the xG of each assisted shot to the pass that created it.
    shots = df.loc[(df["id"]).isin(assists), ["id", "shot.statsbomb_xg", "shot.type.name"]].rename(columns = {"shot.statsbomb_xg": "pass.xA", "shot.type.name": "pass.assist.name"})
    df = df.merge(right = shots, left_on = "pass.assisted_shot_id", right_on = "id", how = "outer")

    grouped = df.groupby(grouper)

    conc_list = []

    # Event counts per category; shot categories also carry summed xG.
    for t in ["type.name", "dribble.outcome.name", "shot.type.name", "shot.outcome.name",
              "pass.outcome.name", "duel.type.name"]:
        pref = t.split(".")[0]
        if pref == "shot":
            # The string "sum" keeps the groupby sum that pandas currently
            # substitutes for the builtin `sum` (passing the builtin is deprecated).
            temp = df.groupby([grouper, t]).agg({"index": "count", "shot.statsbomb_xg": "sum"}).unstack()
            temp.columns = columnFlatten(temp.columns, prefix = pref)
        else:
            temp = grouped[t].value_counts().unstack().add_prefix(pref + ".")
        conc_list.append(temp)

    # Two-level breakdowns (e.g. shot outcome by shot type).
    for t in [["pass.outcome.name", "pass.cross"], ["shot.outcome.name", "shot.type.name"],
             ["duel.outcome.name", "duel.type.name"]]:
        gb = [grouper] + t
        pref = t[0].split(".")[0]
        if pref == "shot":
            aggFunc = {"index": "count", "shot.statsbomb_xg": "sum"}
        else:
            aggFunc = {"index": "count"}
        temp = df.groupby(gb).agg(aggFunc).unstack().unstack()
        temp.columns = columnFlatten(temp.columns, prefix = pref)
        conc_list.append(temp)

    #temp = df.groupby([grouper, "pass.assist.name"])["pass.xA"].sum().unstack()
    #temp.columns = ["pass.xA.{}".format(n) for n in temp.columns]
    #conc_list.append(temp)

    #time & possession DataFrame

    match_time = df.time.max()
    tlist = []
    # Rows labelled 0 and 1 are expected to hold each team's starting lineup
    # in "tactics.lineup".
    for i in range(2):
        lineup = df.loc[i, ["tactics.lineup", "team.id"]]
        lu = pd.json_normalize(lineup["tactics.lineup"])
        lu.loc[:, "team.id"] = lineup["team.id"]
        tlist.append(lu)

    tdf = pd.concat(tlist)[["player.id", "team.id", "position.id"]]
    tdf.loc[:, "match.start"] = 1
    # type.id 19 rows carry "substitution.replacement.id", i.e. substitutions.
    subs = df.loc[df["type.id"] == 19, ["time", "player.id", "team.id", "substitution.replacement.id"]]
    subs_off = subs.loc[:, ["time", "player.id", "team.id"]].rename(columns={"time": "time.end"})
    subs_on = subs.loc[:, ["time", "substitution.replacement.id", "team.id"]].rename(columns={"time": "time.start", "substitution.replacement.id": "player.id"})

    tdf = pd.concat([tdf, subs_on])
    tdf = tdf.merge(subs_off, how = "outer")

    # Starters begin at 0, players never substituted end at the final whistle.
    tdf.fillna({"time.start": 0, "time.end": match_time, "match.start": 0, "position.id": 0}, inplace = True)
    Stencils = np.vectorize(PosStencil)
    tdf.loc[:, "stencil"] = Stencils(tdf.loc[:, "position.id"])

    if player_or_team == "player":
        # Count possessions only inside each player's time on the pitch;
        # players sharing the same (start, end) window share one computation.
        tdf["checks"] = list(zip(*[tdf["time.start"], tdf["time.end"]]))

        for check in tdf.checks.unique():
            pdf = df.loc[(df["time"] >= check[0]) & (df["time"] <= check[1]), ["possession_team.id", "possession"]]
            gdf = pdf.groupby("possession_team.id")["possession"].nunique()
            tdf.loc[tdf["checks"] == check, "possessions"] = gdf.sum()
            for team in gdf.index:
                tdf.loc[(tdf["team.id"] == team) & (tdf["checks"] == check), "possession"] = gdf[team]
        tdf.drop(labels = ["team.id", "checks"], axis = 1, inplace = True)
        tdf.set_index(grouper, inplace = True)

    else:
        tdf = pd.DataFrame(grouped["possession"].nunique())
        tdf["possessions"] = sum(tdf["possession"])
        tdf["time.end"] = df["time"].max()
        tdf["time.start"] = 0

    tdf.loc[:, "match.played"] = 1
    tdf.loc[:, "time.played"] = tdf.loc[:, "time.end"] - tdf.loc[:, "time.start"]

    #xGchain
    if player_or_team == "player":
        conc_list += [ldf, xGchain(df)]

    awcol = [n for n in df.columns if "aerial_won" in n]
    progcol = [n for n in df.columns if "progress" in n]
    becol = [n for n in df.columns if "boxentry" in n]
    toucol = ["touch", "touch.inthebox"]
    passcol = ["pass.xA", "pass.shot_assist", "pass.goal_assist"]
    sumcols = ["pressure_regain", "pass.cross", "shot.statsbomb_xg"] + awcol + progcol + passcol + becol + toucol
    pdf = grouped[sumcols].apply(lambda x: x.sum())

    conc_list += [tdf, pdf]

    sdf = pd.concat(conc_list, axis = 1)
    # Descriptive columns keep their NaNs; every other column is a count/sum.
    nofill = ["player_nickname", "jersey_number", "stencil"]
    fill = [n for n in sdf.columns if n not in nofill]
    sdf[fill] = sdf[fill].fillna(0)
    #sdf["position.name"] = sdf["position.name"].fillna("Unused Sub")

    ngotcols = [n for n in sdf.columns if all(i in n for i in ["shot", "Saved"]) and not all(j in n for j in ["statsbomb_xg", "Penalty"])]
    # Guarantee the columns used in the arithmetic below exist even when the
    # match had no such events.
    check_cols = ["shot.statsbomb_xg.Penalty", "shot.Penalty", "shot.Goal", "shot.Penalty.Goal"] + passcol + awcol
    for col in check_cols:
        if col not in sdf.columns:
            sdf.loc[:, col] = 0

    sdf.loc[:, "shot.statsbomb_xg.Non Penalty"] = sdf.loc[:, "shot.statsbomb_xg"] - sdf.loc[:, "shot.statsbomb_xg.Penalty"]
    sdf.loc[:, "shot.Non Penalty"] = sdf.loc[:, "type.Shot"] - sdf.loc[:, "shot.Penalty"]
    sdf.loc[:, "shot.Non Penalty.Goal"] = sdf.loc[:, "shot.Goal"] - sdf.loc[:, "shot.Penalty.Goal"]
    sdf.loc[:, "shot.On T.Non Penalty"] = sdf.loc[:, ngotcols].sum(axis = 1)

    sdf.loc[:, "progress1"] = sdf.loc[:, ["Pass_progress1", "Carry_progress1"]].sum(axis = 1)
    sdf.loc[:, "deepprogress"] = sdf.loc[:, ["Pass_deepprogress", "Carry_deepprogress"]].sum(axis = 1)
    sdf.loc[:, "boxentry"] = sdf.loc[:, ["Pass_boxentry", "Carry_boxentry"]].sum(axis = 1)
    sdf.loc[:, "turnover"] = sdf.loc[:, ["type.Miscontrol", "dribble.Incomplete"]].sum(axis = 1)

    sdf.loc[:, "aerialwin"] = sdf.loc[:, sdf.columns.intersection(awcol)].sum(axis = 1)

    sdf = PossessionAdjust(sdf, ["type.Interception", "duel.Tackle"])

    if player_or_team == "team":
        # With exactly two rows, each team's opposition is the other row.
        sdf.loc[:, "opposition.id"] = [sdf.index[i] for i in [1, 0]]
        sdf.loc[:, "opposition.name"] = [teams[i] for i in sdf.loc[:, "opposition.id"].values.tolist()]
        sdf.loc[:, "team.name"] = [teams[i] for i in sdf.index]

        # Re-index every stat by the opponent to obtain "<stat>.Against".
        adf = sdf.groupby("opposition.id").sum().add_suffix(".Against")

        simdf = simulate_match2(df, iterations = 10000)
        sdf = pd.concat([sdf, adf, simdf], axis = 1)


    else:
        # Build-up xG = chain xG minus own shots and own assists, floored at 0.
        xgb = (sdf.loc[:, "xGchain"] - sdf.loc[:, "shot.statsbomb_xg.Non Penalty"] - sdf.loc[:, "pass.xA"]).values
        sdf.loc[:, "pass.xGbuildup"] = [max(0, n) for n in xgb]

    for col in ["match_id", "match_date", "competition.competition_id", "competition.country_name",
                "competition.competition_name", "season.season_id", "season.season_name"]:
        sdf.loc[:, col] = overdict[col]
    return sdf
        

In [None]:
def per90(df):
    """Rescale the stat columns of ``df`` to per-90-minute rates, in place.

    Every column except the identifying/time columns listed below is replaced
    by ``value * 90 / time.played``. The frame passed in is mutated and also
    returned.
    """
    keep_as_is = ("Name", "team.name", "competition.competition_name", "season.season_name",
                  "time.played", "time.start", "time.end")
    # "time.played" is never rescaled itself, so it can be read once up front.
    minutes = df["time.played"]
    for column in list(filter(lambda c: c not in keep_as_is, df.columns)):
        df[column] = df[column] * 90 / minutes
    return df

In [None]:
# Identifier / descriptive columns that must not be summed when match
# summaries are aggregated per season. MatchSummary names its opponent
# columns "opposition.id" / "opposition.name", so those are listed alongside
# the older "opponent.*" spellings; otherwise team ids and names get summed.
unsums = ["player_id", "player_name", "player_nickname", "jersey_number", "country.id", "country.name", "team.name",
          "team.id", "Name", "shortName", "player.id", "time.start", "time.end", "position.name",
          "opponent.id", "opponent.name", "opposition.id", "opposition.name",
          "match_id", "match_date", "competition.competition_id",
          "competition.competition_name", "competition.country_name", "season.season_id", "season.season_name"]

# Match filters; only test1 (women's matches) and test4 (Premier League) are used below.
test1 = allmatchesDataFrame["home_team.home_team_gender"] == "female"
test2 = allmatchesDataFrame["competition.competition_name"] == "NWSL"
test3 = allmatchesDataFrame["competition.competition_name"] == "FIFA World Cup"
test4 = allmatchesDataFrame["competition.competition_name"] == "Premier League"
match_ids = allmatchesDataFrame.loc[test1 | test4, "match_id"].values.tolist()
Summary = {"player": [], "team": []}


tList = [time.time()]
# The loop variable must stay named `match_id`: MatchSummary reads that
# notebook-level name to locate the match in allmatchesDataFrame.
# enumerate() replaces match_ids.index(match_id), which rescanned the list on
# every iteration and reported the wrong position for a repeated id.
for match_no, match_id in enumerate(match_ids, start = 1):
    path = os.path.join(events, "{}.json".format(match_id))

    temp = json_dataFrame(path)
    ldf = LineupDataFrame(match_id)

    for pot in ["player", "team"]:
        thedf = MatchSummary(matchdf = temp, lineupdf = ldf, player_or_team = pot, overdf = allmatchesDataFrame)
        Summary[pot].append(thedf)

    tList.append(time.time())
    print("{} of {}; {}s".format(match_no, len(match_ids), round(tList[-1] - tList[-2], 2)))



In [None]:
# Summaries: all per-match rows stacked; Summarised: season totals passed through ratiodf.
Summaries = {}
Summarised = {}
gb1 = ["team.name", "competition.competition_name", "season.season_name"]
for pot in ["player", "team"]:
    Summaries[pot] = pd.concat(Summary[pot])
    sums = [n for n in Summaries[pot].columns if n not in unsums]

    # Players are additionally keyed by name; teams use the shared keys as they are.
    gb = ["Name"] + gb1 if pot == "player" else gb1
    spgb = Summaries[pot].groupby(gb)
    gdf = spgb[sums].sum()

    if pot == "player":
        adf = gdf
        # Share of a player's appearances spent in each positional stencil.
        posdf = spgb["stencil"].value_counts(normalize = True).unstack()
        gdf = pd.concat([adf, posdf], axis = 1)

    Summarised[pot] = ratiodf(gdf).reset_index()

In [None]:
def Bar(df, stats, bar_style, stat_type = "", minutes = 0):
    """Draw a horizontal bar chart of the top 30 rows of ``df`` for ``stats``.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated player or team table; must contain "time.played" and, for
        ``stat_type == "perMatch"``, "match.played".
    stats : list of str
        Columns to plot. With "plus" they are stacked and ranked by their sum;
        with "minus" the first two are drawn side by side and ranked by
        ``stats[0] - stats[1]``.
    bar_style : str
        "plus" or "minus".
    stat_type : str
        "" (raw values), "per90" or "perMatch".
    minutes : number
        Only rows with "time.played" strictly greater than this are plotted.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)

    Raises
    ------
    ValueError
        If ``bar_style`` is unknown, or "minus" is requested with fewer than
        two stats. Checked before any figure is created so that a bad call
        does not leave an empty figure open.
    """
    if bar_style not in ("plus", "minus"):
        raise ValueError('bar_style must be "plus" or "minus", got {!r}'.format(bar_style))
    if bar_style == "minus" and len(stats) < 2:
        raise ValueError('bar_style "minus" needs two stats to compare')

    bardf = df.copy()

    fig, ax = plt.subplots(figsize = (9, 12))

    for side in ["top", "bottom", "right"]:
        ax.spines[side].set_visible(False)

    if stat_type == "per90":
        for stat in stats:
            bardf.loc[:, "{} {}".format(stat, stat_type)] = bardf.loc[:, stat] * 90 / bardf.loc[:, "time.played"]
    elif stat_type == "perMatch":
        for stat in stats:
            bardf.loc[:, "{} {}".format(stat, stat_type)] = bardf.loc[:, stat]/bardf.loc[:, "match.played"]

    # From here on `stats` names the (possibly rescaled) columns to plot.
    stats = [("{} {}".format(n, stat_type)).strip() for n in stats]
    if bar_style == "plus":
        bardf.loc[:, "stats"] = bardf.loc[:, stats].sum(axis = 1)
        bardf.loc[:, "zeroes"] = 0
    elif bar_style == "minus":
        bardf.loc[:, "stats"] = bardf.loc[:, stats[0]] - bardf.loc[:, stats[1]]

    gdf = bardf.loc[bardf["time.played"] > minutes, :]

    cvalue = min([30, len(gdf)])
    # Copy the slice: a "labels" column is added below and must not be
    # written into a view of bardf (SettingWithCopyWarning).
    cdf = gdf.sort_values("stats", ascending = False)[:cvalue].copy()

    # Columns with a single value go into the subtitle; the rest label the bars.
    sub, lab = [], []
    cols = ["Name", "team.name", "competition.competition_name", "season.season_name"]
    for col in cols:
        if col in cdf.columns:
            values = cdf.loc[:, col].unique()
            if len(values) == 1:
                sub.append(values[0])
            else:
                lab.append(col)

    if len(lab) == 0:
        cdf["labels"] = list(zip(*[cdf[n] for n in cols if n not in ["Name", "team.name"]]))
    else:
        cdf["labels"] = list(zip(*[cdf[n] for n in lab]))

    y = np.arange(len(cdf), 0, -1)
    if bar_style == "plus":
        # Each segment starts where the running total of the previous ones ends.
        x = [cdf.loc[:, "zeroes"]] + [cdf.loc[:, stat] for stat in stats]
        for i in range(len(stats)):
            ax.barh(y = y, width = x[i+1], height = 0.5, left = sum(x[:(i+1)]),
                     tick_label = cdf["labels"], label = stats[i])
    elif bar_style == "minus":
        x1 = cdf.loc[:, stats[0]]
        x2 = cdf.loc[:, stats[1]]

        adj = 0.2
        ax.barh(y = y + adj, width = x1, height = adj * 2 - 0.05,
                     tick_label = cdf["labels"], label = stats[0])
        ax.barh(y = y - adj, width = x2, height = adj * 2 - 0.05,
                     tick_label = cdf["labels"], label = stats[1])


    # Header text is stacked above the top bar, one line per 0.5 units.
    x, new_y = cdf["stats"].max()/2, max(y) + 1
    if minutes > 0:
        new_y += 0.5
        ax.text(x = x, y = new_y, s = "Minimum: {} minutes played".format(minutes), horizontalalignment = "center")
    if len(sub) > 0:
        new_y += 0.5
        ax.text(x = x, y = new_y, s = ", ".join(sub), horizontalalignment = "center")

    title = " {} ".format(bar_style)
    ax.text(x = x, y = new_y + 0.5 , s = title.join(stats),
             horizontalalignment = "center", fontsize = 12)

    ax.grid(True)
    ax.legend(loc = "lower right")
    fig.tight_layout()

    return fig, ax

In [None]:
def Scatter(df, stats, quintile, stat_type = "", minutes = 0):
    """Scatter two stats against each other and annotate the outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated player or team table; must contain "time.played",
        "team.name", "competition.competition_name" and "season.season_name".
        A "Name" column marks it as a player table.
    stats : list of str
        Two column names: x-axis first, y-axis second.
    quintile : float
        Quantile in [0, 1]; rows above it on either stat, or on the combined
        normalised score, are annotated with their name.
    stat_type : str
        "" (raw values), "per90" or "perMatch". Columns whose name contains
        "%" or "/" are ratios and are never rescaled.
    minutes : number
        Only rows with "time.played" >= this value are plotted.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
    """
    fig, ax = plt.subplots(figsize = (10, 10))

    scatterdf = df.copy()

    for side in ["top", "bottom", "right", "left"]:
        ax.spines[side].set_visible(False)

    for stat in stats:
        if any([n in stat for n in ["%", "/"]]):
            scatterdf.loc[:, "{} {}".format(stat, stat_type)] = scatterdf.loc[:, stat]
        else:
            if stat_type == "per90":
                scatterdf.loc[:, "{} {}".format(stat, stat_type)] = scatterdf.loc[:, stat] * 90 / scatterdf.loc[:, "time.played"]
            elif stat_type == "perMatch":
                scatterdf.loc[:, "{} {}".format(stat, stat_type)] = scatterdf.loc[:, stat]/scatterdf.loc[:, "match.played"]


    # From here on `stats` names the (possibly rescaled) columns to plot.
    stats = [("{} {}".format(n, stat_type)).strip() for n in stats]
    # Combined score: each stat scaled by its own maximum, then added.
    scatterdf["stats"] = scatterdf[stats[0]]/scatterdf[stats[0]].max() + scatterdf[stats[1]]/scatterdf[stats[1]].max()

    # Build the mask from the working copy rather than the caller's frame.
    gdf = scatterdf.loc[scatterdf["time.played"] >= minutes].copy()

    # Columns with a single value go into the subtitle; the rest (except
    # Name/team.name) define the colour groups.
    sub, lab = [], []
    cols = ["team.name", "competition.competition_name", "season.season_name"]
    # Decide player vs team from the data itself; this used to read the
    # notebook-level loop variable `pot`, i.e. whatever cell ran last.
    if "Name" in gdf.columns:
        cols = ["Name"] + cols
    for col in cols:
        values = gdf.loc[:, col].unique()
        if len(values) == 1:
            sub.append(values[0])
        else:
            if col not in ["Name", "team.name"]:
                lab.append(col)

    if len(lab) == 0:
        gdf["labels"] = list(zip(*[gdf[n] for n in cols if n not in ["Name", "team.name"]]))
    else:
        gdf["labels"] = list(zip(*[gdf[n] for n in lab]))

    for label in gdf["labels"].unique():
        x = gdf.loc[gdf["labels"] == label, stats[0]]
        y = gdf.loc[gdf["labels"] == label, stats[1]]
        ax.scatter(x = x, y = y, s = 30, label = label, edgecolor = "k", alpha = 0.8, linewidth = 0.5)

    q = quintile
    x = gdf.loc[:, stats[0]]
    y = gdf.loc[:, stats[1]]

    adf = gdf.loc[(gdf[stats[0]] > x.quantile(q))|(gdf[stats[1]] > y.quantile(q))|(gdf["stats"] > gdf["stats"].quantile(q)), :]
    if "Name" in scatterdf.columns:
        names = adf.loc[:, "Name"].apply(ShortName)
    else:
        names = adf.loc[:, "team.name"]
    for i, name in enumerate(names):
        # Nudge the label slightly right of and below its marker.
        xa = adf.loc[:, stats[0]].values[i] + x.max()/100
        ya = adf.loc[:, stats[1]].values[i] - y.max()/60
        # Positional text: the `s=` keyword was renamed to `text=` in
        # Matplotlib 3.3 and later removed.
        ax.annotate(name, xy = (xa, ya))

    ax.axvline(c = "k")
    ax.axhline(c = "k")

    ax.set_xlabel(stats[0])
    ax.set_ylabel(stats[1])

    # Header text is stacked above the highest point, one `elev` per line.
    x, new_y, elev = x.max()/2, max(y) * 1.1, max(y)/25
    if minutes > 0:
        new_y += elev
        ax.text(x = x, y = new_y, s = "Minimum: {} minutes played".format(minutes), horizontalalignment = "center")
    if len(sub) > 0:
        new_y += elev
        ax.text(x = x, y = new_y, s = ", ".join(sub), horizontalalignment = "center")

    ax.text(x = x, y = new_y + elev , s = " against ".join(stats), horizontalalignment = "center", fontsize = 12)

    ax.grid(True)
    ax.legend(loc = "upper right")
    fig.tight_layout()

    return fig, ax

In [None]:
# Minimum "time.played" threshold passed as `minutes` to the player Scatter and Bar charts below.
mins = 200

In [None]:
# Player scatter plots: each pair is [x-axis stat, y-axis stat], shown per 90 minutes.
statList = [
    ["pass.xA", "shot.statsbomb_xg.Non Penalty"],
    ["pass.xGbuildup", "shot.statsbomb_xg.Non Penalty"],
    ["shot.Non Penalty", "NPxG/shot"],
    ["type.Foul Won", "dribble.Complete"],
    ["type.Interception.PAdj", "duel.Tackle.PAdj"],
]
for sl in statList:
    fig, ax = Scatter(Summarised["player"], sl, 0.95, stat_type = "per90", minutes = mins)
    plt.show()

In [None]:
# Player bar charts: the stats in each group are stacked ("plus") per 90 minutes.
statList = [
    ["shot.statsbomb_xg.Non Penalty", "pass.xA"],
    ["Pass_progress1", "Carry_progress1"],
    ["shot.statsbomb_xg.Non Penalty", "pass.xA", "pass.xGbuildup"],
    ["shot.Non Penalty.Goal", "pass.goal_assist"],
    ["pass.shot_assist", "pass.goal_assist"],
    ["type.Pressure"],
    ["pressure_regain"],
    ["type.Miscontrol", "dribble.Incomplete"],
    ["type.Interception.PAdj", "duel.Tackle.PAdj"],
]

for sl in statList:
    fig, ax = Bar(Summarised["player"], sl, "plus", stat_type = "per90", minutes = mins)
    plt.show()


In [None]:
# Team bar charts: "for" against "against" ("minus" style), averaged per match.
statList = [
    ["shot.Non Penalty.Goal", "shot.Non Penalty.Goal.Against"],
    ["shot.statsbomb_xg.Non Penalty", "shot.statsbomb_xg.Non Penalty.Against"],
]

for sl in statList:
    fig, ax = Bar(Summarised["team"], sl, "minus", stat_type = "perMatch", minutes = 0)
    plt.show()

In [None]:
# Build the path with os.path.join (as the match loop does) instead of a
# hard-coded Windows "\\" separator, so the cell also runs on Linux/macOS.
df = json_dataFrame(os.path.join(events, "{}.json".format(18243)))
simulate_match2(df)

In [None]:
# Team bar chart of the "xpoints" column, averaged per match.
statList = [
    ["xpoints"],
]

for sl in statList:
    fig, ax = Bar(Summarised["team"], sl, "plus", stat_type = "perMatch", minutes = 0)
    plt.show()

In [None]:
# Team scatter plots, per match; quintile 0 annotates every point above the minimum.
statList = [
    ["shot.statsbomb_xg.Non Penalty.Against", "shot.statsbomb_xg.Non Penalty"],
    ["type.Pressure.Against", "type.Pressure"],
    ["shot.Non Penalty", "NPxG/shot"],
    ["deepprogress.Against", "deepprogress"],
]
for sl in statList:
    fig, ax = Scatter(Summarised["team"], sl, 0, stat_type = "perMatch", minutes = 0)
    plt.show()

In [None]:
def Radar(row, **kwargs):
    """Draw a polar bar ("radar") chart of one player's percentile ranks.

    Parameters
    ----------
    row : pandas.Series
        One row of the rank table. It must hold a value in [0, 1] for every
        stat of the chosen stencil, plus "time.played", "season.season_name",
        "competition.competition_name", "team.name", "Name" and a column named
        after the stencil (the share of games played in that position).
    stencil : str (keyword, required)
        Key of the stat template to plot, e.g. "winger". It is lower-cased for
        the lookup in ``templates()`` but used unchanged to index ``row``.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)

    Raises
    ------
    TypeError
        If the ``stencil`` keyword is missing (previously this surfaced later
        as an UnboundLocalError).
    """
    if "stencil" not in kwargs:
        raise TypeError("Radar() missing required keyword argument: 'stencil'")
    stencil_name = kwargs["stencil"]

    stencils = templates()
    stencil = stencils[stencil_name.lower()]

    fig = plt.figure(figsize = (8, 8))
    ax = fig.add_subplot(111, projection='polar')
    # plt.get_cmap works across Matplotlib versions; cm.get_cmap was removed in 3.9.
    cmap = plt.get_cmap("RdYlGn")

    N = len(stencil)
    theta = np.linspace(0, 2 * math.pi, N, endpoint = False)
    width = math.pi * 2 / (N + 1)

    radii = row[stencil].astype("float64")
    # Ranks double as colour-map positions (low = red, high = green).
    colors = cmap(radii)

    ax.set_theta_zero_location("N")
    ax.spines['polar'].set_visible(True)

    ax.set_ylim([0, 1])
    ax.bar(theta, radii, width=width*0.95, bottom=0.0, alpha=0.4,
            edgecolor = "black", color = colors, tick_label = stencil)

    # Radial positions outside the unit circle for the caption lines.
    ytext = [1.2] + [1.2 + 0.1*n for n in range(5)]

    # The 2 now goes to round(); a misplaced parenthesis used to hand it to
    # format(), so the share was silently rounded to a whole number.
    share = round(100 * row[stencil_name], 2)
    ax.text(x = math.pi, y = ytext[2],
            s = "Proportion of games played in {} position(s): {}%".format(stencil_name, share),
            horizontalalignment = "center", fontsize = 12)
    ax.text(x = math.pi, y = ytext[0], s = "plot type: {}".format(stencil_name),
            horizontalalignment = "center", fontsize = 12)

    timetext = "Minutes: {}".format( int(round(row["time.played"], 0)) )
    ax.text(x = 0, y = ytext[1], s = timetext, horizontalalignment = "center", fontsize = 12)
    ax.text(x = 0, y = ytext[2], s = row["season.season_name"], horizontalalignment = "center", fontsize = 12)
    ax.text(x = 0, y = ytext[3], s = row["competition.competition_name"], horizontalalignment = "center", fontsize = 12)
    ax.text(x = 0, y = ytext[4], s = row["team.name"], horizontalalignment = "center", fontsize = 12)
    ax.text(x = 0, y = ytext[5], s = row["Name"], horizontalalignment = "center", fontsize = 14)

    ax.grid(True)
    #newax = fig.add_axes([1, 0.8, 0.2, 0.2], zorder=-1)
    #newax.imshow(logo)
    #newax.axis('off')

    return fig, ax
    

In [None]:
# Premier League players only; the last expression previews those with more than 500 minutes.
pldf = Summarised["player"][Summarised["player"]["competition.competition_name"].eq("Premier League")]
pldf[pldf["time.played"].gt(500)]

In [None]:
# Radar driver cell. Either profile one named player (`player`) and the most
# similar players per position, or list the top players for one positional
# template (`stencil_choice`).
stencils = templates()

# Configuration: leave `player` empty to rank by `stencil_choice` instead.
player = ""
stencil_choice = "winger"
# Minimum value of a stencil column (share of appearances in that position)
# for a player to count as playing there.
min_pos = 0.3
min_minutes = 500

# NOTE(review): if both `player` and `stencil_choice` are "", `sdf` is never
# assigned and the cell fails further down - confirm whether that is intended.
if player != "":
    sdf  = Summarised["player"].loc[(Summarised["player"]["time.played"] >= min_minutes), :].copy()
elif stencil_choice != "":
    sdf = Summarised["player"].loc[(Summarised["player"]["time.played"] >= min_minutes) & 
                                   (Summarised["player"][stencil_choice] >= min_pos) , :].copy()


idx = ["Name", "team.name", "competition.competition_name", "season.season_name", "time.played"]
# Positional templates only; "attacking" and "defending" are excluded here.
stencil_keys = [n for n in stencils.keys() if n not in ["attacking", "defending"]]
# Columns with "%" or "/" in the name are ratios and are not rescaled per 90.
pccols = [n for n in sdf.columns if "%" in n or "/" in n]

# ndf: per-90 values (ratios copied unchanged); rdf: percentile rank of each ndf column.
ndf = sdf[idx + stencil_keys].copy()
rdf = sdf[idx + stencil_keys].copy()
for col in [n for n in sdf.select_dtypes(include=np.number).columns if n != "time.played" and n not in stencil_keys]:
    if col in pccols:
        ndf[col] = sdf[col]
    else:
        ndf[col] = 90 * sdf[col] / sdf["time.played"]
    # Lower is better for these two, so their ranking is inverted.
    if col in ["turnover", "type.Foul Committed"]:
        rdf[col] = ndf[col].rank(pct = True, ascending = False)
    else:
        rdf[col] = ndf[col].rank(pct = True)



# radf: the rows that get a radar - the named player, or the top five for the stencil.
if player != "":
    radf = rdf.loc[rdf["Name"] == player, :]

elif stencil_choice != "":
    # Candidate overall scores across the stencil's stats: mean, median,
    # lower quartile, and mean of the five best ranks. `best` selects the
    # one used for sorting.
    best = "best4"
    sten = stencils[stencil_choice]
    rdf["best1"] = rdf[sten].mean(axis = 1)
    rdf["best2"] = rdf[sten].median(axis = 1)
    rdf["best3"] = rdf[sten].quantile(0.25, axis = 1)
    rdf["best4"] = rdf[sten].apply(lambda x: x.nlargest(n = 5).mean(), axis = 1)
    radf = rdf.loc[rdf[stencil_choice] >= min_pos, :].sort_values(best, ascending = False).head()

for i, row in radf.iterrows():
    # Which templates to draw for this row: every position the named player
    # meets `min_pos` for, or just the chosen stencil.
    stencil_range = []
    if player != "":
        for key in stencil_keys:
            if row[key] >= min_pos:
                stencil_range.append(key)

    elif stencil_choice != "":
        stencil_range.append(stencil_choice)

    for piece in stencil_range:
        #print("Proportion of games played as {}:".format(piece), "{}%".format(round(100*row[piece]), 2))
        fig, ax = Radar(row, stencil = piece)

        sten = stencils[piece]
        radii = row[sten]
        N = len(sten)

        # Table printed under the chart: per-90 value and percentile rank per stat.
        stats = pd.DataFrame()
        stats["value"] = [round(n, 2) for n in ndf.loc[i, sten].tolist()]
        stats["rank"] = [round(100 * n, 2) for n in radii.tolist()]
        stats.index = sten

        plt.show()

        print(stats)

        if player != "":
            # Distance between every player's rank profile and this player's:
            # diff1 is the root-mean-square difference, diff2 the mean absolute
            # difference. `diff` selects the one used for sorting.
            rdf["diff1"] = (((rdf[sten] - radii) ** 2).sum(axis = 1)/N) ** (1/2)
            rdf["diff2"] = ((rdf[sten] - radii).abs().sum(axis = 1))/N
            diff = "diff2"
            cdf = rdf.loc[(rdf.Name != player) & (rdf[piece] >= min_pos), :].sort_values(diff)


            # Radars for the five most similar players in this position.
            for j, row1 in cdf.head().iterrows():

                fig, ax = Radar(row1, stencil = piece)
                plt.show()

                stats = pd.DataFrame()
                stats["value"] = [round(n, 2) for n in ndf.loc[j, sten].tolist()]
                stats["rank"] = [round(100 * n, 2) for n in row1[sten].tolist()]
                stats.index = sten

                # NOTE(review): second plt.show() for the same figure; looks redundant - confirm.
                plt.show()

                print(stats)
                print("Stat difference: {}".format((round(100 * row1[diff], 2))))

        elif stencil_choice != "":
            best_text = "Rank Average: {}%".format(round(100 * row[best], 2))
            print(best_text)

            
    



In [None]:
def Rolling(name, df, stats, roll = 5):
    """Plot the rolling mean of two per-match stats for one team or player.

    Parameters
    ----------
    name : str
        A value of "team.name" or, failing that, of "Name" in ``df``.
    df : pandas.DataFrame
        One row per match with "match_date", "season.season_name" and the
        columns named in ``stats``. Either of "team.name" / "Name" may be
        absent (team tables have no "Name" column).
    stats : list of str
        Two column names; both lines are drawn and the gap between them shaded.
    roll : int
        Rolling window length in matches.

    Returns
    -------
    (Figure, Axes), or the tuple ("fail", "fail") when ``name`` is not found
    or has fewer than ``2 * roll`` matches.
    """
    # Look the name up only in columns that exist, so an unknown name on a
    # team table returns "fail" instead of raising KeyError on df["Name"].
    rdf = None
    for key in ["team.name", "Name"]:
        if key in df.columns and name in df[key].unique():
            rdf = df.loc[df[key] == name, :].copy()
            break

    if rdf is None or len(rdf) < roll * 2:
        return "fail", "fail"

    # A fresh 0..n-1 index keeps the column assignments below from aligning
    # on repeated index labels (player tables are indexed by player id).
    rdf = rdf.sort_values("match_date").reset_index(drop = True)

    # First and last match date of each season, used to number matches within it.
    seasons = rdf.groupby("season.season_name")
    seasondf = pd.DataFrame({"start": seasons["match_date"].first(),
                             "end": seasons["match_date"].last()})

    rolldf = rdf.loc[:, stats].rolling(roll).mean()
    rolldf["match_date"] = rdf["match_date"]
    for season, start, end in zip(seasondf.index, seasondf["start"], seasondf["end"]):
        test = (rolldf["match_date"] >= start) & (rolldf["match_date"] <= end)
        rolldf.loc[test, "match_no"] = range(1, int(test.sum()) + 1)
        rolldf.loc[test, "season.season_name"] = season
    # x-axis: running match count across all seasons, starting at 1.
    rolldf.index = range(1, len(rolldf) + 1)

    fig, ax = plt.subplots(figsize = (10, 5))

    for side in ["top", "bottom", "left", "right"]:
        ax.spines[side].set_visible(False)

    x = rolldf.index
    y1 = rolldf[stats[0]]
    y2 = rolldf[stats[1]]

    ax.plot(x, y1, label = stats[0])
    ax.plot(x, y2, label = stats[1])

    # Two fills so the shading changes colour depending on which line is on top.
    ax.fill_between(x, y1, y2, alpha = 0.4, where = y1 > y2, interpolate = True)
    ax.fill_between(x, y2, y1, alpha = 0.4, where = y2 > y1, interpolate = True)

    ax.grid(True)
    ax.axhline(0, c="k")

    # Vertical marker and season label at the first match of every season.
    vlines = rolldf.loc[rolldf["match_no"] == 1, :].index
    for vline in vlines:

        ax.axvline(vline - 0.4, c = "C2")
        ax.text(x = vline, y = ax.get_ylim()[1] * 0.95, s = rolldf.loc[vline, "season.season_name"])

    ax.legend(loc = 4)

    # Tick every `roll` matches, labelled with the within-season match number.
    xlocs = x.tolist()[::roll]
    xlabs = rolldf["match_no"].tolist()[::roll]
    ax.set_xticks(xlocs)
    ax.set_xticklabels(xlabs)

    ax.text(x = ax.get_xlim()[1]/2, y = 0.3 + ax.get_ylim()[1],
            s = "{} Rolling {} Game Average".format(name, roll),
            horizontalalignment = "center", fontsize = 14)

    return fig, ax

In [None]:
# Per-match team table in date order, with non-penalty xG and goal differences added.
tdf = Summaries["team"].sort_values("match_date").reset_index()
tdf["NPxGD"] = tdf["shot.statsbomb_xg.Non Penalty"] - tdf["shot.statsbomb_xg.Non Penalty.Against"]
tdf["NPGoalD"] = tdf["shot.Non Penalty.Goal"] - tdf["shot.Non Penalty.Goal.Against"]
stats = [
    ["shot.statsbomb_xg.Non Penalty", "shot.statsbomb_xg.Non Penalty.Against"],
    ["NPxGD", "NPGoalD"],
]

for team in tdf["team.name"].unique():
    for stat in stats:
        fig, ax = Rolling(team, tdf, stat)
        # Rolling returns ("fail", "fail") when there is nothing to draw.
        if not isinstance(fig, str):
            plt.show()

In [None]:
# Rolling non-penalty xG against non-penalty goals for a single player.
pdf = Summaries["player"]
stats = [
    ["shot.statsbomb_xg.Non Penalty", "shot.Non Penalty.Goal"],
]

player = "Thierry Henry"

for stat in stats:
    fig, ax = Rolling(player, pdf, stat)
    # Rolling returns ("fail", "fail") when there is nothing to draw.
    if not isinstance(fig, str):
        plt.show()