# Average calculator:
This notebook is part of the offline and online program of the master's thesis of Theo Vandeportaele. It calculates the average values that are used to draw the average lines on the distance and velocity graphs. These values are calculated from the previous games of a season. The notebook consists of two main parts:
- Average Distance Calculator
- Average Velocity Calculator

In [None]:
import os
import floodlight.io.statsperform
import json
import pandas as pd
import numpy as np 
import re

from tqdm import tqdm
from floodlight.core.xy import XY
from floodlight.models.kinematics import DistanceModel
from floodlight.models.kinematics import VelocityModel

In [None]:
# Folder whose entries are read as the tracking-data files of the season's games.
# The trailing slash matters to code that builds file paths by string concatenation.
path = "data/"

In [None]:
def numerical_sort(value):
    """Return a sort key that orders embedded numbers by numeric value.

    The string is split on runs of digits. Text fragments are kept as
    strings and digit runs are converted to ``int``, so that e.g. ``"g2"``
    sorts before ``"g10"``.

    Args:
        value: The string to build a key for (used here for file names).

    Returns:
        A list alternating text fragments (even positions) and integers
        (odd positions).
    """
    # With a capturing group, re.split keeps the digit runs; they always land
    # on the odd positions of the resulting list.
    fragments = re.split(r'(\d+)', value)
    return [
        int(fragment) if position % 2 else fragment
        for position, fragment in enumerate(fragments)
    ]

## Average Distance Calculator

In [None]:
# Maps a player's jID to the running average of that player's smoothed
# per-minute distance value; filled by the distance loop in this notebook.
dist_dict = dict()

In [None]:
# Iterate over all entries in the path folder, in natural (numeric) order.
# Each entry is read as the tracking data of a different game played this season.

# Settings that do not change between players/games, so they are set up once.
# 25 * 60 samples per minute: the tracking data is taken to be sampled every 0.04 seconds.
data_points_per_minute = 25 * 60
# Moving-average kernel used to smoothen the per-minute distance graph.
kernel_size = 20
kernel = np.ones(kernel_size) / kernel_size

for file in tqdm(sorted(os.listdir(path), key=numerical_sort)):
    # Create tracking file full path
    filename = os.fsdecode(file)
    filename_tracking_data = os.path.join(path, filename)
    print(filename_tracking_data)

    # Get teamsheets of tracking file
    teamsheets = floodlight.io.statsperform.read_teamsheets_from_position_data_txt(filename_tracking_data)

    # Find out if Club Brugge is the home or away team:
    # the part of the file name after the first "-" starts with the home team.
    truncated_filename = filename.split("-", 1)[-1]
    print(truncated_filename)
    team = 'Home' if truncated_filename.startswith("Club Brugge") else 'Away'
    print(team)

    # Get tracking data
    data = floodlight.io.statsperform.read_position_data_txt(filename_tracking_data)

    #
    #
    # CALCULATION OF AVERAGE DISTANCE
    #
    #

    # Create distance model and get cumulative distance of first half
    xy_values = data[0][1][team]

    dm = DistanceModel()
    dm.fit(xy_values)
    cumulative_distance_covered = dm.cumulative_distance_covered()

    # Create distance model and get cumulative distance of second half
    xy_values_second = data[0][2][team]

    dm_2 = DistanceModel()
    dm_2.fit(xy_values_second)
    cumulative_distance_covered_2 = dm_2.cumulative_distance_covered()

    # Create id_mapping table to map a player's jID to the xID used in the tracking data
    id_mapping = data[1][team]

    # Iterate over all the players listed in the teamsheet of that specific game
    for player_id in teamsheets[team]['jID']:

        # Use the id_mapping table to get the data column of the current player.
        # NOTE(review): "xID - 1" assumes xIDs are 1-based - confirm against the floodlight reader.
        mapped_index = id_mapping[id_mapping['jID'] == player_id]['xID'].values.tolist()[0]-1

        # Get the cumulative distance data of only that specific player
        cumulative_distance_data = cumulative_distance_covered.property[:, mapped_index]
        cumulative_distance_data_2 = cumulative_distance_covered_2.property[:, mapped_index]

        # Add the last element of the cumulative model of the first half to all the values of the second half.
        # Otherwise the values of the second half start again from 0.
        last_element_data_1 = cumulative_distance_data[-1]
        result_array = cumulative_distance_data_2 + last_element_data_1

        # Concatenate the cumulative data from the first and the second half
        total_cum_data = np.concatenate((cumulative_distance_data, result_array), axis=0)

        # Keep one cumulative sample per minute and differentiate it to get a distance value per minute
        downsampled_data = total_cum_data[::data_points_per_minute]
        cumulative_distance_per_minute = np.gradient(downsampled_data)

        # Drop the minutes without any distance. This only happens after a player is
        # substituted out or before a player is substituted in.
        cumulative_distance_per_minute = cumulative_distance_per_minute[cumulative_distance_per_minute != 0]

        # A player without a single non-zero minute has nothing to average;
        # np.pad(..., mode='symmetric') would also raise on an empty array.
        if cumulative_distance_per_minute.size == 0:
            continue

        # Use a kernel to smoothen the graph.
        padded_counts_array = np.pad(cumulative_distance_per_minute, (kernel_size // 2, kernel_size // 2), mode='symmetric')
        smoothed_array = np.convolve(padded_counts_array, kernel, mode='valid')

        # Only games with at least 45 smoothed values count towards the average.
        if smoothed_array.size < 45:
            continue

        # Calculate the average value of the smoothed array
        average_value = np.mean(smoothed_array)

        # Save this average value in a dictionary. If the player_id isn't yet present, just save it.
        # If the player_id is already present, save the average of the stored value and average_value.
        # This makes sure that recent games have a bigger influence on the stored value than the
        # first couple of games of the season.
        if player_id in dist_dict:
            dist_dict[player_id] = (dist_dict[player_id] + average_value) / 2
        else:
            dist_dict[player_id] = average_value

In [None]:
# Write the per-player distance averages collected in dist_dict to a JSON file.
file_path = "average_distance_05.json"

with open(file_path, mode='w') as json_file:
    json.dump(dist_dict, fp=json_file)

In [None]:
# Show the collected per-player distance averages as the cell output.
dist_dict

## Average Velocity Calculator

In [None]:
# vel_dict:     jID -> running average of the per-minute count of frames above the speed threshold.
# vel_avg_dict: jID -> running average of the player's mean velocity.
# Both are filled by the velocity loop in this notebook.
vel_dict, vel_avg_dict = dict(), dict()

In [None]:
# Size of one bucket in frames: `minutes` minute(s) at 25 frames per second.
minutes = 1
frame_size = 25*minutes*60

# Iterate over all entries in the path folder, in natural (numeric) order.
for file in tqdm(sorted(os.listdir(path), key=numerical_sort)):
    # Create tracking file full path
    filename = os.fsdecode(file)
    filename_tracking_data = os.path.join(path, filename)
    print(filename_tracking_data)

    # Get teamsheets of tracking file
    teamsheets = floodlight.io.statsperform.read_teamsheets_from_position_data_txt(filename_tracking_data)

    # Find out if Club Brugge is the home or away team:
    # the part of the file name after the first "-" starts with the home team.
    truncated_filename = filename.split("-", 1)[-1]
    print(truncated_filename)
    team = 'Home' if truncated_filename.startswith("Club Brugge") else 'Away'
    print(team)

    # Get tracking data
    data = floodlight.io.statsperform.read_position_data_txt(filename_tracking_data)

    #
    #
    # CALCULATION OF AVERAGE VELOCITY
    #
    #

    # Create velocity model and get velocity of first half.
    # The velocities are fetched once per game instead of once per player.
    xy_values = data[0][1][team]

    vm = VelocityModel()
    vm.fit(xy_values)
    velocity_first_half = vm.velocity()

    # Create velocity model and get velocity of second half
    xy_values_second = data[0][2][team]

    vm_2 = VelocityModel()
    vm_2.fit(xy_values_second)
    velocity_second_half = vm_2.velocity()

    # Create id_mapping table to map a player's jID to the xID used in the tracking data
    id_mapping = data[1][team]

    for player_id in teamsheets[team]['jID']:

        # Use the id_mapping table to get the data column of the current player.
        # NOTE(review): "xID - 1" assumes xIDs are 1-based - confirm against the floodlight reader.
        mapped_index = id_mapping[id_mapping['jID'] == player_id]['xID'].values.tolist()[0]-1

        # Get the velocity of the specific player
        velocity_1 = velocity_first_half[:,mapped_index]
        velocity_2 = velocity_second_half[:,mapped_index]

        # Concatenate velocity of first and second half
        total_velocity = np.concatenate((velocity_1, velocity_2), axis=0)

        # Without a single valid velocity sample there is nothing to average. Skipping here keeps
        # NaN out of vel_avg_dict, where it would otherwise poison every later running average.
        if np.isnan(total_velocity).all():
            continue

        # Calculate the average velocity of the player (NaN samples are ignored)
        avg = np.nanmean(total_velocity)

        # Running average: averaging the stored value with the new one gives recent games more weight.
        if player_id in vel_avg_dict:
            vel_avg_dict[player_id] = (vel_avg_dict[player_id] + avg) / 2
        else:
            vel_avg_dict[player_id] = avg

        # The threshold is twice the player's own average velocity.
        velocity = avg*2

        # Count the amount of frames above the threshold (2*avg_velocity) per minute.
        # The mask is True for every frame in which the velocity is above the threshold.
        above_threshold_mask = total_velocity > velocity

        # Make buckets of frame_size frames (an incomplete last bucket is dropped) and count per bucket.
        num_frames = len(above_threshold_mask) // frame_size
        above_threshold_mask_reshaped = above_threshold_mask[:num_frames * frame_size].reshape(num_frames, frame_size)
        counts_array = np.sum(above_threshold_mask_reshaped, axis=1)

        # Remove the trailing buckets without any frame above the threshold.
        counts_array = np.trim_zeros(counts_array, 'b')

        # No buckets left means there is no per-minute value for this game.
        if counts_array.size == 0:
            continue

        # Calculate the average value and add it to the dictionary. If the player_id isn't yet present, just save it.
        # If the player_id is already present, save the average of the stored value and average_value.
        # This makes sure that recent games have a bigger influence on the stored value than the
        # first couple of games of the season. Only averages of at least 100 frames per minute are stored.
        average_value = np.mean(counts_array)
        if average_value >= 100:
            if player_id in vel_dict:
                vel_dict[player_id] = (vel_dict[player_id] + average_value) / 2
            else:
                vel_dict[player_id] = average_value


In [None]:
# Write vel_dict (per-player average count of frames above the threshold) to a JSON file.
file_path = "average_velocity_frames_05.json"

with open(file_path, mode='w') as json_file:
    json.dump(vel_dict, fp=json_file)

In [None]:
# Write vel_avg_dict (per-player average velocity) to a JSON file.
file_path = "average_velocity_05.json"

with open(file_path, mode='w') as json_file:
    json.dump(vel_avg_dict, fp=json_file)

In [None]:
# Show the collected per-player frames-above-threshold averages as the cell output.
vel_dict

In [None]:
# Show the collected per-player average velocities as the cell output.
vel_avg_dict