Prerequisites

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
from matplotlib.patches import Arc
import seaborn as sns
import numpy as np
import json

Reading the data and doing some analysis

In [None]:
# Input locations: both paths must be filled in before the notebook is run.
ADV_STATS_DATA_PATH = ''  # insert path here
EVENT_DATA_PATH = '' # insert path here

# Load the advanced-stats table first, then the event table.
df_adv_stats = pd.read_csv(filepath_or_buffer=ADV_STATS_DATA_PATH)
df_event = pd.read_csv(filepath_or_buffer=EVENT_DATA_PATH)

In [None]:
# Count identical rows. dropna=False keeps rows that contain missing values;
# with the default (dropna=True) every row holding a NaN is excluded from the
# count, which would hide all rows if a column is entirely NaN.
df_adv_stats.value_counts(dropna=False)

In [None]:
# Dimensions of the advanced-stats table as a (rows, columns) tuple.
(len(df_adv_stats.index), len(df_adv_stats.columns))

In [None]:
# First five rows of the advanced-stats table.
df_adv_stats.iloc[:5]

In [None]:
# Last five rows of the advanced-stats table.
df_adv_stats.iloc[-5:]

In [None]:
# Number of missing values in each column.
df_adv_stats.isna().sum()

Here we notice that the 'roundId' column of the dataframe contains only NaN values, so we remove it: it is not needed for the rest of the analysis.

In [None]:
# Work on a version of the table without the 'roundId' column.
df_adv_stats_modif = df_adv_stats.drop(columns=['roundId'])

The next step is to build a correlation matrix to get a better overview of which features are strongly correlated, in order to select a number of features that will help with the defensive analysis.

I have set a threshold of 0.6 as we are working with a lot of data and it can potentially capture useful information

In [None]:
# Pairwise correlations between the numeric columns only.
corrMatrix = df_adv_stats_modif.corr(numeric_only=True)
threshold = 0.6

# For every feature, list the other features whose correlation with it is
# above the threshold (the comparison is signed, so only positive
# correlations can qualify).
correlationsMappings = defaultdict(list)
for feature, correlations in corrMatrix.items():
  for otherFeature, value in correlations.items():
    if feature == otherFeature or not value > threshold:
      continue
    correlationsMappings[feature].append(otherFeature)

The project relies on data analysis, more precisely on defensive data analysis for each team, so for the moment these are the selected features that I consider the most important. A sanity check is also performed to see whether the dataset contains noisy or invalid values.

In [None]:
# Features kept for the defensive analysis. The subset is copied explicitly so
# that the columns added to it later are written to an independent frame
# rather than to a slice of df_adv_stats_modif.
selectedStatsFeatures = df_adv_stats_modif[
    ['team.name', 'teamId', 'total.concededGoals', 'total.successfulDefensiveActions', 'total.losses', 'total.dangerousOwnHalfLosses', 'total.interceptions', 'total.recoveries', 'total.duels', 'total.pressingDuels', 'percent.defensiveDuelsWon', 'average.opponentOffsides']
].copy()

# Sanity check: every selected feature except the team name must be >= 0.
# Missing values fail the comparison as well. The error is raised explicitly
# (instead of via an assert statement) so the check also runs under `python -O`.
for ft in selectedStatsFeatures.columns.drop('team.name'):
  invalidRows = np.flatnonzero(~(selectedStatsFeatures[ft] >= 0).to_numpy())
  if invalidRows.size:
    # Report the position of the first offending row of this feature.
    raise AssertionError(f'Invalid value on row {invalidRows[0]} for feature {ft}')

Normalizing the team names by removing diacritics so that they are easier to find if necessary

In [None]:
# Translation table mapping Romanian diacritics to their plain ASCII letters.
# Both the cedilla and the comma-below encodings of t/s are covered, in lower
# and upper case, so names are normalised whichever variant the data uses.
romanianDiacritics = str.maketrans({
    "\u0163": "t", "\u021b": "t",                  # t with cedilla / comma below
    "\u015f": "s", "\u0219": "s",                  # s with cedilla / comma below
    "\u00e2": "a", "\u0103": "a", "\u00ee": "i",  # a circumflex, a breve, i circumflex
    "\u0162": "T", "\u021a": "T",
    "\u015e": "S", "\u0218": "S",
    "\u00c2": "A", "\u0102": "A", "\u00ce": "I",
})

# .str.translate replaces all characters in one pass and leaves missing values
# untouched instead of raising on them.
selectedStatsFeatures['team.name'] = selectedStatsFeatures['team.name'].str.translate(romanianDiacritics)

Introducing 4 new metrics:
 - Team defensive error rate: It takes into account the number of goals that a team conceded and its total number of successful defensive actions across the season. If the rate is high, the team is inefficient in defense and can be forced into mistakes. On the other hand, if the rate is low, the team is efficient in defense and mistakes are very rare with them

 - Dangerous own half losses rate: This metric tells us what percentage of a team's ball losses were dangerous losses in its own half. Combined with the defensive error rate metric, it can also be used to deduce a team's performance in transition: for example, if a team has a high dangerous own half loss rate and a low defensive error rate, it means that the respective team is good in transition

 - Intercept recovery rate: This metric allows us to see the percentage of ball recoveries made through interceptions only. If a team has a high intercept recovery rate, the opponent needs to take that team's pressure into account. Conversely, if a team has a low intercept recovery rate, the opponent can use link-up plays to make them vulnerable.

 - Pressing duels intensity rate: This metric can tell us how many duels were created through pressing, giving an overview of which team is a high pressure one and which team is a laid back patient one.

In [None]:
# Derived defensive metrics. Three of them are percentages rounded to two
# decimals; the defensive error rate is stored as a raw, unrounded ratio.
ownHalfLossShare = selectedStatsFeatures['total.dangerousOwnHalfLosses'] / selectedStatsFeatures['total.losses']
selectedStatsFeatures['ownHalfLossRate'] = (ownHalfLossShare * 100).round(2)

selectedStatsFeatures['defensiveErrorRates'] = selectedStatsFeatures['total.concededGoals'].div(
    selectedStatsFeatures['total.successfulDefensiveActions']
)

interceptionShare = selectedStatsFeatures['total.interceptions'] / selectedStatsFeatures['total.recoveries']
selectedStatsFeatures['interceptionRecoveryRate'] = (interceptionShare * 100).round(2)

pressingShare = selectedStatsFeatures['total.pressingDuels'] / selectedStatsFeatures['total.duels']
selectedStatsFeatures['pressingIntensityRate'] = (pressingShare * 100).round(2)

The next step for the moment is to try and create team profiles based on the insights and metrics obtained.

In [None]:
def generateTeamProfiles(stats=None):
  """Build a textual defensive profile for every team and store it in place.

  Each team's metrics are compared with the league averages (the column
  means) and one sentence per metric is added to that team's profile. The
  sentences are joined and written to a new 'teamProfile' column.

  Parameters
  ----------
  stats : pandas.DataFrame, optional
      Table holding the columns 'ownHalfLossRate', 'defensiveErrorRates',
      'percent.defensiveDuelsWon', 'interceptionRecoveryRate' and
      'pressingIntensityRate'. When omitted, the notebook-level
      ``selectedStatsFeatures`` frame is used.

  Returns
  -------
  None
      The table is modified in place.
  """
  if stats is None:
    stats = selectedStatsFeatures

  leagueAvg_ownHalf_Loss_Rate = stats['ownHalfLossRate'].mean()
  # Compare against the exact mean. The error rate is a small ratio, so
  # rounding the average to two decimals would shift the threshold noticeably
  # and misclassify teams that sit close to the league average.
  leagueAvg_Def_Errors = stats['defensiveErrorRates'].mean()
  leagueAvg_Interception_Recovery_Rate = stats['interceptionRecoveryRate'].mean()
  leagueAvg_Def_Duels_Won = stats['percent.defensiveDuelsWon'].mean()
  leagueAvg_Pressing_Intensity = stats['pressingIntensityRate'].mean()
  allNotes = []

  teamMetrics = zip(
      stats['ownHalfLossRate'],
      stats['defensiveErrorRates'],
      stats['percent.defensiveDuelsWon'],
      stats['interceptionRecoveryRate'],
      stats['pressingIntensityRate'],
  )

  for ownHalf_Loss_Rate, defError_Rate, def_duelsWon_Rate, intercept_Recovery_Rate, pressingIntensity_Rate in teamMetrics:
    team_profile_notes = []

    if defError_Rate > leagueAvg_Def_Errors:
      team_profile_notes.append("This team concedes goals in spite of their succesful defensive actions. It can be attempted to force them to make mistakes" + '\n')
    else:
      # defError_Rate is a ratio, so it is formatted as a percentage here
      # rather than printed raw with a '%' suffix.
      team_profile_notes.append(f"This team is not defensive error prone. They have an error rate of {defError_Rate:.2%} and mistakes are rare with them" + '\n')

    if ownHalf_Loss_Rate > leagueAvg_ownHalf_Loss_Rate:
      team_profile_notes.append(f"This team tends to lose the ball frequently in its own half, with a loss percentage of {ownHalf_Loss_Rate}% which is above the league average" + '\n')
    else:
      team_profile_notes.append(f"This team does not lose the ball that often in its own half, with a loss percentage of {ownHalf_Loss_Rate}%, proving that the team has a high control in its own half" + '\n')

    if def_duelsWon_Rate > leagueAvg_Def_Duels_Won:
      team_profile_notes.append(f"This team is very good in 1 on 1 defensive situations, with a {def_duelsWon_Rate}% of won duels" + '\n')
    else:
      team_profile_notes.append("This team is below the league average on 1 on 1 defensive situations, they can be vulnerable there" + '\n')

    if intercept_Recovery_Rate > leagueAvg_Interception_Recovery_Rate:
      team_profile_notes.append("This team performs well when recovering the posssesion by interceptions" + '\n')
    else:
      team_profile_notes.append(f"This team is below the league average when recovering the posssesion by interceptions, with {intercept_Recovery_Rate}%. They can be vulnerable against successful link up plays" + '\n')

    if pressingIntensity_Rate > leagueAvg_Pressing_Intensity:
      team_profile_notes.append("This team is a high pressing one. Watch out for their intensity and try to not lose the ball. Try to force them into a mistake and catch them off guard" + '\n')
    else:
      team_profile_notes.append("This team is a laid back one in terms of pressing. They will be patient when recovering the ball" + '\n')

    # NOTE(review): every note already ends with a newline, so this separator
    # lands at the start of the following line; kept as-is so the exported
    # profile format does not change.
    allNotes.append(" ,".join(team_profile_notes))

  stats['teamProfile'] = allNotes

In [None]:
# Adds the 'teamProfile' column to selectedStatsFeatures.
generateTeamProfiles()

In [None]:
# Write the metrics and profiles to disk, leaving out the row index.
selectedStatsFeatures.to_csv(path_or_buf='Team_DefenseMetrics_Profiles.csv', index=False)

Defining a plotting method in order to get a first visualization of our obtained results

In [None]:
def plotTeamsData(teams, plottingValues, plotTitle, yLabel):
    """Show a bar chart with one bar per entry of ``teams``.

    teams          -- labels placed along the x axis
    plottingValues -- bar heights, in the same order as ``teams``
    plotTitle      -- title drawn above the chart
    yLabel         -- label of the y axis
    """
    barStyle = {'color': 'blue', 'width': 0.5}
    # Slanted, right-aligned tick labels.
    tickStyle = {'rotation': 55, 'ha': 'right', 'fontsize': 10}

    plt.figure(figsize=(10, 5))
    plt.bar(teams, plottingValues, **barStyle)
    plt.xticks(**tickStyle)
    plt.ylabel(yLabel)
    plt.title(plotTitle)
    plt.tight_layout()
    plt.show()

In [None]:
teams = selectedStatsFeatures['team.name']

# One bar chart per derived metric: (column, chart title, y-axis label).
metricCharts = (
    ('ownHalfLossRate', 'Own half loss rate per team', 'Own half loss rates (%)'),
    ('interceptionRecoveryRate', 'Interception recovery rates per team', 'Interception recovery rates(%)'),
    ('pressingIntensityRate', 'Pressing intensity rate per team', 'Pressing intensity rate(%)'),
)
for metricColumn, chartTitle, axisLabel in metricCharts:
    plotTeamsData(teams, selectedStatsFeatures[metricColumn], chartTitle, axisLabel)

Now, let's also look at the event data for the team Dinamo Bucuresti and try to observe the defense plays.

In [None]:
# Dimensions of the event table as a (rows, columns) tuple.
(len(df_event.index), len(df_event.columns))

In [None]:
# First five rows of the event table.
df_event.iloc[:5]

In [None]:
# Last five rows of the event table.
df_event.iloc[-5:]

Since this project is focused on analyzing the defensive phase of the match, we filter the dataframe so that all the events are centered on Dinamo Bucuresti's defensive moments: only the events whose opponent team name is "Dinamo Bucuresti" are kept.

Afterwards, in the same manner as the advanced stats dataframe feature selection, we have selected a number of features that would be helpful in the event data analysis.

In [None]:
# Keep only the events whose opponent team is Dinamo Bucureşti.
oppPlaysFilter = df_event['opponentTeam.name'] == "Dinamo Bucureşti"

filteredEventDf = df_event[oppPlaysFilter]

# Columns used by the event analysis. The subset is copied explicitly because
# its text columns are overwritten later; without the copy those assignments
# would target a slice of df_event.
selectedEventFeatures = filteredEventDf[
    ['id', 'matchId', 'type.primary', 'type.secondary', 'location.x', 'location.y', 'player.name', 'team.name', 'label', 'points']
].copy()

selectedEventFeatures

Normalizing the team names once again

In [None]:
# Translation table mapping Romanian diacritics to their plain ASCII letters.
# It covers the same letters as the normalisation applied to the advanced
# stats (including the circumflex vowels), both the cedilla and comma-below
# encodings of t/s, and upper as well as lower case.
romanianDiacritics = str.maketrans({
    "\u0163": "t", "\u021b": "t",                  # t with cedilla / comma below
    "\u015f": "s", "\u0219": "s",                  # s with cedilla / comma below
    "\u00e2": "a", "\u0103": "a", "\u00ee": "i",  # a circumflex, a breve, i circumflex
    "\u0162": "T", "\u021a": "T",
    "\u015e": "S", "\u0218": "S",
    "\u00c2": "A", "\u0102": "A", "\u00ce": "I",
})

# .str.translate leaves missing values untouched instead of raising on them.
for textColumn in ('team.name', 'label'):
  selectedEventFeatures[textColumn] = selectedEventFeatures[textColumn].str.translate(romanianDiacritics)

Mapping each team to the match results

In [None]:
allTeams = set(selectedEventFeatures['team.name'].dropna())
allMatches = set(selectedEventFeatures['label'].dropna())

# Every event row already pairs a team with the label of the match it belongs
# to, so the mapping is read directly from those pairs. This avoids testing
# whether a team name occurs as a substring of a label (which mis-assigns
# matches when one name is contained in another), and sorting the pairs makes
# the order of teams and matches identical on every run.
teamMatchPairs = (
    selectedEventFeatures[['team.name', 'label']]
    .dropna()
    .drop_duplicates()
    .sort_values(['team.name', 'label'])
)

allMatchesResultsMapping = defaultdict(list)
for team, mtch in zip(teamMatchPairs['team.name'], teamMatchPairs['label']):
  allMatchesResultsMapping[team].append(mtch)

Saving this dictionary as a json file so that it can be easily used locally

In [None]:
# Serialise the team -> matches mapping and write it out as a JSON document.
with open('matchResultsMappings.json', mode='w') as fp:
    fp.write(json.dumps(allMatchesResultsMapping))

Visualizing the number of points Dinamo Bucuresti has obtained against each team in the league. After visualization we find out that FCSB is Dinamo's best opponent

In [None]:
# Points per team: for each team, add up the 'points' value found on the first
# event row of every match mapped to that team.
allPoints = [
    sum(
        int(selectedEventFeatures.loc[selectedEventFeatures['label'] == matchResult, 'points'].iloc[0])
        for matchResult in results
    )
    for results in allMatchesResultsMapping.values()
]

plotTeamsData(allMatchesResultsMapping.keys(), allPoints, 'Dinamo Bucuresti points against each team in the league', 'Points')

Next, let's see the events that lead to ball turnovers and ball recoveries from each team. This helps the analysis by looking at weak areas when the ball is turned over by the opponent and strong areas when the ball is recovered

In [None]:
def get_Turnover_Recovery_Data(matchResultData):
  """Plot, per team, how often each primary event type lost or recovered the ball.

  matchResultData maps a team name to the labels of its matches. For every
  team the events of those matches are read from selectedEventFeatures and
  tallied by primary type: an event whose secondary types contain 'loss'
  counts as a turnover; otherwise, one containing 'recovery' counts as a
  recovery. Two bar charts are shown per team.
  """
  for team in matchResultData:
    turnoverCounts = defaultdict(int)
    recoveryCounts = defaultdict(int)

    for result in matchResultData[team]:
      matchEvents = selectedEventFeatures.loc[selectedEventFeatures['label'] == result]
      eventTypes = zip(matchEvents['type.primary'], matchEvents['type.secondary'])

      for primaryEvent, secondaryEvents in eventTypes:
        if 'loss' in secondaryEvents:
          turnoverCounts[primaryEvent] += 1
        elif 'recovery' in secondaryEvents:
          recoveryCounts[primaryEvent] += 1

    plotTeamsData(turnoverCounts.keys(), turnoverCounts.values(), f'{team} ball turnover events against Dinamo Bucuresti across all matches played', 'Ball turnover events')
    plotTeamsData(recoveryCounts.keys(), recoveryCounts.values(), f'{team} ball recovery against Dinamo Bucuresti across all matches played', 'Ball recovery events')

In [None]:
# Show the turnover and recovery bar charts for every team in the mapping.
get_Turnover_Recovery_Data(allMatchesResultsMapping)


To get an even better overview, a heat map of ball turnovers and ball recoveries is created for each team

In [None]:
def get_Loss_Recovery_Coordinates(team_Name):
  """Return the locations of a team's ball losses and ball recoveries.

  Rows of selectedEventFeatures whose 'team.name' equals team_Name are
  scanned; an event counts as a loss / recovery when 'loss' / 'recovery'
  occurs in its 'type.secondary' value (an event can count as both).

  Returns four lists: (X_loss, Y_loss, X_recovery, Y_recovery).
  """
  teamRows = selectedEventFeatures.loc[
      selectedEventFeatures['team.name'] == team_Name,
      ['location.x', 'location.y', 'type.secondary'],
  ]
  X_loss, Y_loss = [], []
  X_recovery, Y_recovery = [], []

  for loc_x, loc_y, secondaryEvents in zip(teamRows['location.x'], teamRows['location.y'], teamRows['type.secondary']):
    # The two checks are independent on purpose: one event may feed both lists.
    for keyword, xs, ys in (('loss', X_loss, Y_loss), ('recovery', X_recovery, Y_recovery)):
      if keyword in secondaryEvents:
        xs.append(loc_x)
        ys.append(loc_y)

  return X_loss, Y_loss, X_recovery, Y_recovery


In [None]:
# Method taken from https://fcpython.com/visualisation/football-heatmaps-seaborn

def plot_Loss_Recovery_HeatMap(draw_ax):
  """Draw the outline of a football pitch on ``draw_ax``.

  Only the pitch markings are drawn (outline, centre line, penalty areas,
  6-yard boxes, centre circle, spots and penalty arcs) on a 130 x 90
  coordinate system; any density plot has to be added to the same axes by
  the caller. The axes frame is hidden and the limits are fixed to the pitch.

  draw_ax -- matplotlib axes to draw on
  """
  lineColor = "black"

  # Straight markings as ((x_start, x_end), (y_start, y_end)) pairs.
  pitchLines = (
      # Pitch outline & centre line
      ((0, 0), (0, 90)),
      ((0, 130), (90, 90)),
      ((130, 130), (90, 0)),
      ((130, 0), (0, 0)),
      ((65, 65), (0, 90)),
      # Left penalty area
      ((16.5, 16.5), (65, 25)),
      ((0, 16.5), (65, 65)),
      ((16.5, 0), (25, 25)),
      # Right penalty area
      ((130, 113.5), (65, 65)),
      ((113.5, 113.5), (65, 25)),
      ((113.5, 130), (25, 25)),
      # Left 6-yard box; the bottom edge runs all the way to the goal line
      # (x = 0), mirroring the right-hand box.
      ((0, 5.5), (54, 54)),
      ((5.5, 5.5), (54, 36)),
      ((5.5, 0), (36, 36)),
      # Right 6-yard box
      ((130, 124.5), (54, 54)),
      ((124.5, 124.5), (54, 36)),
      ((124.5, 130), (36, 36)),
  )
  for xs, ys in pitchLines:
    draw_ax.plot(xs, ys, color=lineColor)

  # Centre circle, centre spot and the two penalty spots.
  pitchCircles = (
      plt.Circle((65, 45), 9.15, color=lineColor, fill=False),
      plt.Circle((65, 45), 0.8, color=lineColor),
      plt.Circle((11, 45), 0.8, color=lineColor),
      plt.Circle((119, 45), 0.8, color=lineColor),
  )

  # Arcs centred on the left and right penalty spots.
  penaltyArcs = (
      Arc((11, 45), height=18.3, width=18.3, angle=0, theta1=310, theta2=50, color=lineColor),
      Arc((119, 45), height=18.3, width=18.3, angle=0, theta1=130, theta2=230, color=lineColor),
  )

  for patch in pitchCircles + penaltyArcs:
    draw_ax.add_patch(patch)

  # Tidy axes: hide the frame and clamp the view to the pitch.
  draw_ax.axis('off')
  draw_ax.set_ylim(0, 90)
  draw_ax.set_xlim(0, 130)

In [None]:
# One figure per team: turnovers on the left, recoveries on the right.
# Iterating in sorted order keeps the sequence of figures identical between
# runs (the iteration order of a set of strings is not stable across sessions).
# NOTE(review): the pitch is drawn on a 130 x 90 grid; confirm that
# 'location.x' / 'location.y' use the same scale and orientation, otherwise
# the densities will not line up with the pitch markings.
for team_Name in sorted(allTeams):
  fig, axs = plt.subplots(1, 2, figsize=(12, 5))

  axs[0].set_title(f'{team_Name} ball turnover heat map')
  axs[1].set_title(f'{team_Name} ball recovery heat map')

  X_loss, Y_loss, X_recovery, Y_recovery = get_Loss_Recovery_Coordinates(team_Name)

  # `levels` is the documented kdeplot parameter for the number of contour
  # levels.
  plot_Loss_Recovery_HeatMap(axs[0])
  sns.kdeplot(x=X_loss, y=Y_loss, fill=True, levels=50, ax=axs[0])

  plot_Loss_Recovery_HeatMap(axs[1])
  sns.kdeplot(x=X_recovery, y=Y_recovery, fill=True, levels=50, ax=axs[1])

  plt.tight_layout()
  plt.show()

In [None]:
# Boolean mask selecting the row(s) of the team named 'FCS Bucuresti'.
filtered_Team_Stats = selectedStatsFeatures['team.name'].eq('FCS Bucuresti')

selectedStatsFeatures.loc[filtered_Team_Stats]