### Clean Data

- Rename columns so they are shorter and easier to reference in code
- Filter values to only include positions within valid regions of the pitch
- Sort the values for more efficient grouping

In [None]:
import pandas as pd
import numpy as np
# Load the raw tracking export and give the verbose headers short aliases.
df = pd.read_csv('match_data.csv').rename(
    columns={
        'Pitch_x': 'x',
        'Pitch_y': 'y',
        'participation_id': 'id',
        'Time (s)': 'time',
        'Speed (m/s)': 'speed',
    }
)

# Keep only samples strictly inside the pitch rectangle
# (-52.5 < x < 52.5 and -34 < y < 34); the boundary itself is excluded.
on_pitch = (
    df['x'].between(-52.5, 52.5, inclusive='neither')
    & df['y'].between(-34, 34, inclusive='neither')
)

# Order rows by participant and then by time so that consecutive rows of one
# id are consecutive samples of that participant.
df = df.loc[on_pitch].sort_values(by=['id', 'time'])

### Leaderboard Generation
Create a leaderboard ranking athletes based on the three key metrics:
1. Total Distance – The total distance covered by each athlete.
2. Distance at Speed Zone 5 – The cumulative distance an athlete covers while moving at a speed
between 19.8 km/h and 25.1 km/h (approximately 5.5–6.97 m/s, the unit used in the data).
3. Top Speed – The highest speed recorded for each athlete.

In [None]:
def _path_length(track):
    """Return the summed straight-line distance between consecutive rows.

    `track` holds the 'x' and 'y' columns of a single id. The first row has
    no predecessor, so its step is NaN and is skipped by `sum()`.
    """
    dx = track['x'].diff()
    dy = track['y'].diff()
    return np.sqrt(dx**2 + dy**2).sum()


# Every participant except the ball, restricted to the columns needed here.
lboard1 = df.loc[df['id'] != 'ball', ['id', 'x', 'y']]

# One path length per id.
per_athlete = lboard1.groupby('id').apply(_path_length, include_groups=False)

# Leaderboard: longest distance first, with a fresh 0..n-1 index.
total_distances = (
    per_athlete.reset_index(name='total_distance')
    .sort_values(by='total_distance', ascending=False)
    .reset_index(drop=True)
)
print(total_distances)

                                              x          y
id                                                        
019d8fa9-3cbd-42f2-bfcf-1bc676d13e0f -18.130694   1.660379
019d8fa9-3cbd-42f2-bfcf-1bc676d13e0f -18.133425   1.645227
019d8fa9-3cbd-42f2-bfcf-1bc676d13e0f -18.136155   1.630076
019d8fa9-3cbd-42f2-bfcf-1bc676d13e0f -18.136155   1.630076
019d8fa9-3cbd-42f2-bfcf-1bc676d13e0f -18.136155   1.630076
...                                         ...        ...
d56ca4de-1be4-499a-8242-b219e051f33c  26.924231 -17.270563
d56ca4de-1be4-499a-8242-b219e051f33c  27.382061 -17.219883
d56ca4de-1be4-499a-8242-b219e051f33c  27.398311 -17.129853
d56ca4de-1be4-499a-8242-b219e051f33c  27.414560 -17.039822
d56ca4de-1be4-499a-8242-b219e051f33c  27.430810 -16.949792

[985274 rows x 2 columns]


In [None]:
# Speed zone 5 bounds in m/s (the 'speed' column comes from 'Speed (m/s)').
# 5.5 m/s = 19.8 km/h and 6.97 m/s is approximately 25.1 km/h.
ZONE5_MIN_SPEED = 5.5
ZONE5_MAX_SPEED = 6.97

# Step distances must be measured between *consecutive* samples of the full
# track. Filtering by speed first and diffing afterwards would measure the
# straight line between samples that are not adjacent in time, so every jump
# between two separate sprints would be counted as zone-5 distance.
# Relies on df already being ordered by ['id', 'time'] (done in the cleaning
# cell), so that within each id the rows are in time order.
players = df.loc[df['id'] != 'ball', ['id', 'x', 'y', 'speed']]
by_player = players.groupby('id')
players = players.assign(
    step_distance=np.sqrt(by_player['x'].diff()**2 + by_player['y'].diff()**2)
)

# A step is attributed to zone 5 when the speed recorded at its end sample is
# strictly inside the zone (same strict bounds as before).
# NOTE(review): assumes the recorded speed describes the movement leading up
# to that sample - confirm against the tracking provider's definition.
in_zone5 = (
    (players['speed'] > ZONE5_MIN_SPEED)
    & (players['speed'] < ZONE5_MAX_SPEED)
)
lboard2 = players.loc[in_zone5, ['id', 'x', 'y', 'step_distance']]

# Leaderboard: most zone-5 distance first. Athletes with no zone-5 samples do
# not appear, as before. NaN steps (first sample of an id) are skipped by sum().
zone5_distances = (
    lboard2.groupby('id')['step_distance']
    .sum()
    .reset_index(name='total_distance')
    .sort_values('total_distance', ascending=False)
    .reset_index(drop=True)
)

# NOTE: this rebinds `total_distances`, which the Total Distance cell also
# assigns. The name is kept so existing references keep working; prefer
# `zone5_distances` in new code.
total_distances = zone5_distances




#### Max Speed
- Filter for relevant columns
- Perform the groupby, take the maximum speed per athlete, and sort descending (fastest first)

In [None]:
# Speed samples for every participant except the ball.
lboard3 = df.loc[df['id'] != 'ball', ['id', 'speed']]

# Highest recorded speed per id, fastest first; the result stays indexed by id.
max_speed = (
    lboard3.groupby('id')
    .max()
    .sort_values(by='speed', ascending=False)
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.ndimage import gaussian_filter

# Positional heatmap: bin every tracked position into a 60 x 40 grid, count
# the samples per cell, smooth the counts and plot them.
N_X_BINS = 60
N_Y_BINS = 40

# Bin a local copy of the coordinates. Writing the bin indices back into
# df['x'] / df['y'] would destroy the real positions and make this cell give
# a different result every time it is re-run.
coords = df[['x', 'y']].round(1)
binned = pd.DataFrame({
    'x': pd.cut(coords['x'], bins=N_X_BINS, labels=False),
    'y': pd.cut(coords['y'], bins=N_Y_BINS, labels=False),
})

# Number of samples per (x bin, y bin), most populated cell first.
position_counts = (
    binned.groupby(['x', 'y'])
    .size()
    .sort_values(ascending=False)
    .reset_index(name='counts')
)

# The single most populated cell is left out, as in the original analysis.
# NOTE(review): presumably it is an outlier that would dominate the colour
# scale - confirm this exclusion is still wanted.
# Cells without data (including the excluded one) are set to 0 and the grid
# is reindexed to its full 60 x 40 shape: a NaN would otherwise be spread by
# gaussian_filter into a blank patch, and bins with no samples at all would
# silently drop out of the pivot and shift the picture.
count_grid = (
    position_counts.iloc[1:]
    .pivot(index='x', columns='y', values='counts')
    .reindex(index=range(N_X_BINS), columns=range(N_Y_BINS))
    .fillna(0)
)
pos_smooth = gaussian_filter(count_grid.to_numpy(dtype=float), sigma=1)

# Rows of the image are x bins and columns are y bins (same orientation as
# the original plot).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pos_smooth, cmap='coolwarm', cbar=False, ax=ax)
ax.set(title='Smoothed position density', xlabel='y bin', ylabel='x bin')
plt.show()