## Match 1

In [7]:
import json

# Load the gait records recorded for the first match
with open("../data/match_1.json", "r") as fh:
    match_data_1 = json.load(fh)

# Inspect the container type and how many gait entries it holds
data_type, num_gaits = type(match_data_1), len(match_data_1)

data_type, num_gaits

(list, 574)

In [8]:
from collections import Counter

# Tally how many gaits of match 1 carry each action label
label_distribution = Counter(gait['label'] for gait in match_data_1)

label_distribution

Counter({'walk': 245,
         'rest': 25,
         'run': 209,
         'tackle': 20,
         'dribble': 42,
         'pass': 24,
         'cross': 3,
         'shot': 6})

In [9]:
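# Optional one-time setup (kept disabled; quote the specifier so the shell does not treat ">=" as a redirect): %pip install "nbformat>=4.2.0"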

In [10]:
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots

# Function to plot a few randomly chosen sample gaits of one action in the same plot
def plot_combined_gaits_plotly(action, gaits=None, n_samples=3):
    """Overlay the acceleration norms of randomly chosen gaits for one action.

    Parameters
    ----------
    action : str
        Action label; gaits whose 'label' equals this value are candidates.
    gaits : list of dict, optional
        Gait records exposing 'label' and 'norm' keys. Defaults to
        ``match_data_1`` so existing single-argument calls keep working.
    n_samples : int, optional
        Maximum number of gaits to draw (default 3). When fewer gaits exist
        for the action, all of them are plotted instead of raising.

    Returns
    -------
    plotly.graph_objects.Figure
        One line trace per sampled gait; no traces if the action is absent.
    """
    if gaits is None:
        gaits = match_data_1

    # Filter gaits for the specified action
    action_gaits = [gait for gait in gaits if gait['label'] == action]

    # Sampling without replacement fails when more items are requested than
    # exist, so cap the request at the number of available gaits.
    n_drawn = min(n_samples, len(action_gaits))
    if n_drawn > 0:
        chosen = np.random.choice(len(action_gaits), size=n_drawn, replace=False)
        sample_gaits = [action_gaits[idx] for idx in chosen]
    else:
        sample_gaits = []

    # Create plot
    fig = go.Figure()

    for i, gait in enumerate(sample_gaits):
        fig.add_trace(
            go.Scatter(y=gait['norm'], mode='lines', name=f"Gait {i+1}")
        )

    # Update layout
    fig.update_layout(height=400, width=900, title_text=f"{action} Gaits")
    fig.update_xaxes(title_text="Time (1/50 s intervals)")
    fig.update_yaxes(title_text="Acceleration Norm")

    return fig

# Actions to visualize ('cross' is intentionally left out of the selection)
actions_to_plot = ['rest', 'walk', 'run', 'dribble', 'tackle', 'pass', 'shot']

# Build one figure per selected action, each overlaying randomly drawn sample gaits
combined_plots = list(map(plot_combined_gaits_plotly, actions_to_plot))

Visualizing the acceleration patterns (norm values) for a few sample gaits from different actions to get a sense of how they might differ and see if they have distinctive patterns. The gaits are selected randomly.

In [11]:
# Render every per-action figure in turn
for action_fig in combined_plots:
    action_fig.show()

- walk: the acceleration patterns for walking seem relatively steady with minor fluctuations.
- run: running gaits display more pronounced fluctuations compared to walking, indicating the increased intensity and variability in movement.
- dribble: dribbling involves interactions with the ball and might show varied patterns, as observed.
- pass: the passing action might be associated with a noticeable spike in acceleration, representing the moment of the pass.
- shot: shooting may also show a significant spike due to the force exerted during the shot.

- We can also observe the gait duration variability.

In [12]:
# Gait length in seconds: number of 'norm' samples divided by 50
gait_durations = list(map(lambda gait: len(gait['norm']) / 50, match_data_1))

# Histogram of the gait durations
fig = go.Figure(
    data=[go.Histogram(x=gait_durations, marker_color='blue', opacity=0.75)]
)
fig.update_layout(
    title_text='Distribution of Gait Durations',
    xaxis_title='Duration (seconds)',
    yaxis_title='Count',
    bargap=0.05,
)

fig.show()

Approximately normal distribution with a couple of outliers towards the right.

In [13]:
# IQR fences on the gait durations: values beyond 1.5 * IQR from the quartiles are outliers
Q1, Q3 = (np.percentile(gait_durations, q) for q in (25, 75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Marker color used for each action label
color_map = dict([
    ('walk', 'blue'),
    ('rest', 'green'),
    ('run', 'red'),
    ('tackle', 'purple'),
    ('dribble', 'orange'),
    ('pass', 'brown'),
    ('cross', 'pink'),
    ('shot', 'yellow'),
])

# Collect the gaits whose duration falls outside the fences
outlier_gaits, outlier_durations, outlier_labels = [], [], []
for gait in match_data_1:
    duration = len(gait['norm']) / 50
    if duration < lower_bound or duration > upper_bound:
        outlier_gaits.append(gait)
        outlier_durations.append(duration)
        outlier_labels.append(gait['label'])

# Horizontal boxplot of all durations; the box itself draws no points
fig_colored_outliers = go.Figure()
fig_colored_outliers.add_trace(
    go.Box(
        x=gait_durations,
        name='Gait Durations',
        boxpoints=False,
        marker_color='grey',
        orientation='h',
    )
)

# One marker trace per action label holding that label's outlier durations
for label, color in color_map.items():
    label_outlier_durations = [
        duration
        for duration, outlier_label in zip(outlier_durations, outlier_labels)
        if outlier_label == label
    ]
    fig_colored_outliers.add_trace(
        go.Scatter(
            x=label_outlier_durations,
            mode='markers',
            name=label,
            marker=dict(color=color, size=8),
        )
    )

# Last expression of the cell: update_layout hands back the figure for display
fig_colored_outliers.update_layout(
    title_text='Horizontal Boxplot of Gait Durations with Colored Outliers',
    xaxis_title='Duration (seconds)',
    yaxis_title='Action Type',
    showlegend=True,
)

The longest gaits belong to the walk and rest actions, which makes sense since these actions don't require the football and are independent of the course of the game.

## Match 2

In [14]:
import json

# Load the gait records recorded for the second match
with open("../data/match_2.json", "r") as fh:
    match_data_2 = json.load(fh)

# Inspect the container type and how many gait entries it holds
data_type, num_gaits = type(match_data_2), len(match_data_2)

data_type, num_gaits

(list, 613)

In [15]:
from collections import Counter

# Tally how many gaits of match 2 carry each action label
label_distribution = Counter(gait['label'] for gait in match_data_2)

label_distribution

Counter({'no action': 2,
         'run': 343,
         'pass': 8,
         'rest': 10,
         'walk': 190,
         'dribble': 36,
         'shot': 12,
         'tackle': 11,
         'cross': 1})

In [16]:
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots

# Function to plot a few randomly chosen sample gaits of one action in the same plot
def plot_combined_gaits_plotly(action, gaits=None, n_samples=3):
    """Overlay the acceleration norms of randomly chosen gaits for one action.

    Parameters
    ----------
    action : str
        Action label; gaits whose 'label' equals this value are candidates.
    gaits : list of dict, optional
        Gait records exposing 'label' and 'norm' keys. Defaults to
        ``match_data_2`` so existing single-argument calls keep working.
    n_samples : int, optional
        Maximum number of gaits to draw (default 3). When fewer gaits exist
        for the action, all of them are plotted instead of raising.

    Returns
    -------
    plotly.graph_objects.Figure
        One line trace per sampled gait; no traces if the action is absent.
    """
    if gaits is None:
        gaits = match_data_2

    # Filter gaits for the specified action
    action_gaits = [gait for gait in gaits if gait['label'] == action]

    # Sampling without replacement fails when more items are requested than
    # exist, so cap the request at the number of available gaits.
    n_drawn = min(n_samples, len(action_gaits))
    if n_drawn > 0:
        chosen = np.random.choice(len(action_gaits), size=n_drawn, replace=False)
        sample_gaits = [action_gaits[idx] for idx in chosen]
    else:
        sample_gaits = []

    # Create plot
    fig = go.Figure()

    for i, gait in enumerate(sample_gaits):
        fig.add_trace(
            go.Scatter(y=gait['norm'], mode='lines', name=f"Gait {i+1}")
        )

    # Update layout
    fig.update_layout(height=400, width=900, title_text=f"{action} Gaits")
    fig.update_xaxes(title_text="Time (1/50 s intervals)")
    fig.update_yaxes(title_text="Acceleration Norm")

    return fig

# Actions to visualize ('cross' is intentionally left out of the selection)
actions_to_plot = ['rest', 'walk', 'run', 'dribble', 'tackle', 'pass', 'shot']

# Build one figure per selected action, each overlaying randomly drawn sample gaits
combined_plots = list(map(plot_combined_gaits_plotly, actions_to_plot))

In [17]:
# Render every per-action figure in turn
for action_fig in combined_plots:
    action_fig.show()

In [18]:
# Gait length in seconds: number of 'norm' samples divided by 50
gait_durations = list(map(lambda gait: len(gait['norm']) / 50, match_data_2))

# Histogram of the gait durations
fig = go.Figure(
    data=[go.Histogram(x=gait_durations, marker_color='blue', opacity=0.75)]
)
fig.update_layout(
    title_text='Distribution of Gait Durations',
    xaxis_title='Duration (seconds)',
    yaxis_title='Count',
    bargap=0.05,
)

fig.show()

In [19]:
# Identify outliers for the gait durations using the interquartile range (IQR) method
Q1 = np.percentile(gait_durations, 25)
Q3 = np.percentile(gait_durations, 75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Marker color per action label. Match 2 also contains the 'no action' label,
# so it needs its own entry; otherwise its outliers would never be drawn.
color_map = {
    'walk': 'blue',
    'rest': 'green',
    'run': 'red',
    'tackle': 'purple',
    'dribble': 'orange',
    'pass': 'brown',
    'cross': 'pink',
    'shot': 'yellow',
    'no action': 'black'
}
# Fallback color for any label that is still missing from color_map
default_outlier_color = 'black'

# Filter outliers (each duration is computed once and reused for the test and the plot)
outlier_gaits = []
outlier_durations = []
outlier_labels = []
for gait in match_data_2:
    duration = len(gait['norm']) / 50
    if duration < lower_bound or duration > upper_bound:
        outlier_gaits.append(gait)
        outlier_durations.append(duration)
        outlier_labels.append(gait['label'])

# Create a horizontal boxplot for gait durations with colored outliers based on action labels
fig_colored_outliers = go.Figure()

# Add main boxplot trace (box only; the outlier points are added separately below)
fig_colored_outliers.add_trace(go.Box(x=gait_durations,
                                      name='Gait Durations',
                                      boxpoints=False,
                                      marker_color='grey',
                                      orientation='h'))

# Add outliers with colored points: every mapped label gets a trace, followed by
# any outlier label that has no color assigned so no outlier is silently dropped.
unmapped_labels = sorted(set(outlier_labels) - set(color_map))
for label in list(color_map) + unmapped_labels:
    color = color_map.get(label, default_outlier_color)
    label_outlier_durations = [
        duration
        for duration, outlier_label in zip(outlier_durations, outlier_labels)
        if outlier_label == label
    ]
    fig_colored_outliers.add_trace(go.Scatter(x=label_outlier_durations,
                                              mode='markers',
                                              name=label,
                                              marker=dict(color=color, size=8)))

# Last expression of the cell: update_layout hands back the figure for display
fig_colored_outliers.update_layout(title_text='Horizontal Boxplot of Gait Durations with Colored Outliers',
                                   xaxis_title='Duration (seconds)',
                                   yaxis_title='Action Type',
                                   showlegend=True)

## Fully merged dataset

In [20]:
import pandas as pd

# Tag every gait record with the match it came from (match_data_1 -> 1, match_data_2 -> 2).
# The records are tagged in place, so the original lists gain the 'match_id' key too.
for match_id, match_gaits in enumerate((match_data_1, match_data_2), start=1):
    for gait in match_gaits:
        gait['match_id'] = match_id

# Concatenate both matches, first match first
combined_data = [*match_data_1, *match_data_2]

# One DataFrame row per gait
df = pd.DataFrame(combined_data)

In [21]:
# Preview the first ten rows of the merged frame
df.head(10)

Unnamed: 0,label,norm,match_id
0,walk,"[23.177037336396975, 23.36152528524938, 21.535...",1
1,walk,"[20.998214383911275, 19.182798059840767, 18.27...",1
2,walk,"[21.95259682019565, 20.62720484424047, 22.5554...",1
3,walk,"[19.39209748358647, 19.30460665293087, 18.9787...",1
4,rest,"[22.069263037713093, 19.672270483203395, 19.88...",1
5,walk,"[19.92032356700904, 19.672270483203395, 18.688...",1
6,walk,"[72.43782600710844, 29.1545534760874, 51.00451...",1
7,walk,"[56.86645792611747, 24.06617147011633, 39.4051...",1
8,run,"[34.587473483674074, 33.423278993730264, 21.71...",1
9,run,"[10.642204698384765, 27.15560347765402, 40.427...",1


In [22]:

# Grouping the data by match and label to get the count of each label for both matches
label_counts = df.groupby(['match_id', 'label']).size().reset_index(name='count')

# Calculating the percentage of each label for both matches (rounded to whole percent).
# transform('sum') returns each match's total aligned to label_counts' own index, so the
# new column stays row-aligned. groupby(...).apply(...) can return a result indexed by
# (match_id, row) in recent pandas versions, which cannot be assigned back as a column.
match_totals = label_counts.groupby('match_id')['count'].transform('sum')
label_counts['percentage'] = (label_counts['count'] / match_totals * 100).round()

# Plotting the percentage of labels per game for the 2 games in the same plot
fig_percentage = go.Figure()

for match in label_counts['match_id'].unique():
    match_counts = label_counts[label_counts['match_id'] == match]
    fig_percentage.add_trace(go.Bar(x=match_counts['label'], y=match_counts['percentage'], name=f'Match {match}'))

# Last expression of the cell: update_layout hands back the figure for display
fig_percentage.update_layout(title='Percentage of Labels per Game for the Two Matches',
                             xaxis_title='Action Label',
                             yaxis_title='Percentage (%)',
                             barmode='group',
                             template="plotly_dark")

The distribution of the labels across both games seems to be consistent, which might indicate that the games had the same intensity and level.

I have decided to extract extra features from the dataset because these features often provide deeper insights into the underlying patterns and relationships present in the data.

In [23]:
# Gait duration in seconds: number of samples in 'norm' divided by 50
df['gait_duration'] = df['norm'].map(len) / 50

In [24]:
# Total sampled time per match, converted from seconds to minutes
df.groupby('match_id')['gait_duration'].sum().div(60).to_frame('game_time')

Unnamed: 0_level_0,game_time
match_id,Unnamed: 1_level_1
1,9.282333
2,9.335333


Both games are approximately 9 minutes long (or are 9-minute snippets from a longer game).

In [25]:
# Summary statistics (count, mean, std, quartiles, extremes) of the gait durations
desc_stats = df.loc[:, 'gait_duration'].describe()
desc_stats

count    1187.000000
mean        0.941078
std         0.608432
min         0.020000
25%         0.640000
50%         0.880000
75%         1.120000
max        14.460000
Name: gait_duration, dtype: float64

In [26]:
# Histogram of gait durations over both matches
histogram_fig = go.Figure(
    data=[go.Histogram(x=df['gait_duration'], name='Gait Duration', marker_color='cyan')]
)
histogram_fig.update_layout(
    title='Histogram of Gait Durations',
    xaxis_title='Gait Duration (seconds)',
    yaxis_title='Frequency',
    template="plotly_dark",
)

histogram_fig

In [27]:
# One box per action label (in order of first appearance) showing its gait durations
boxplot_fig = go.Figure()
for label in df['label'].unique():
    label_durations = df.loc[df['label'] == label, 'gait_duration']
    boxplot_fig.add_trace(
        go.Box(
            y=label_durations,
            name=label,
            boxmean=True,
            jitter=0.3,
            pointpos=-1.8,
            marker_color='lightseagreen',
        )
    )
boxplot_fig.update_layout(
    title='Boxplots of Gait Durations by Action Label',
    yaxis_title='Gait Duration (seconds)',
    template="plotly_dark",
)
boxplot_fig

In [28]:
# Per-gait summary statistics of the acceleration norm, one new column per reducer
for column, reducer in (
    ('mean_acceleration', np.mean),
    ('max_acceleration', np.max),
    ('min_acceleration', np.min),
    ('std_deviation', np.std),
):
    df[column] = df['norm'].apply(reducer)

In [29]:
# Display df, which now carries the per-gait acceleration summary columns
df

Unnamed: 0,label,norm,match_id,gait_duration,mean_acceleration,max_acceleration,min_acceleration,std_deviation
0,walk,"[23.177037336396975, 23.36152528524938, 21.535...",1,1.44,24.166958,42.738486,15.392703,5.965416
1,walk,"[20.998214383911275, 19.182798059840767, 18.27...",1,1.08,26.026276,53.767061,15.520896,8.479134
2,walk,"[21.95259682019565, 20.62720484424047, 22.5554...",1,1.36,23.998177,42.105998,14.484489,6.328823
3,walk,"[19.39209748358647, 19.30460665293087, 18.9787...",1,1.50,22.596573,47.934160,14.867721,5.545197
4,rest,"[22.069263037713093, 19.672270483203395, 19.88...",1,3.62,20.378778,22.069263,17.789805,0.739506
...,...,...,...,...,...,...,...,...
1182,walk,"[23.337305769963503, 20.210987911153104, 25.06...",2,1.04,29.107628,65.462316,11.190245,10.920141
1183,walk,"[22.470322813933603, 22.552427730975246, 23.84...",2,1.28,26.673605,51.724442,11.657530,8.728731
1184,walk,"[43.833612705797144, 46.18045998580312, 37.492...",2,1.20,28.842287,89.365797,15.539368,12.171922
1185,walk,"[30.927599255773355, 31.26358258808756, 28.286...",2,1.32,25.121496,71.505826,7.309930,9.942930


In [30]:
# Sum of all acceleration-norm samples of each gait
df['cumulative_acceleration'] = [np.sum(samples) for samples in df['norm']]

In [31]:
def compute_derivative(norm):
    """Return the numerical gradient of an acceleration-norm sequence as a list.

    Uses ``np.gradient``. If NumPy rejects the input with a ``ValueError``
    (for example a sequence too short to differentiate), a list of zeros of
    the same length as ``norm`` is returned instead.
    """
    try:
        slope = np.gradient(norm)
    except ValueError:
        # Gradient is undefined for this input: fall back to an all-zero derivative
        return [0 for _ in range(len(norm))]
    return slope.tolist()

# Element-wise derivative of each gait's acceleration norm, stored as a list per row
df['acceleration_derivative'] = df['norm'].map(compute_derivative)

In [32]:
# Display df, which now includes the cumulative acceleration and derivative columns
df

Unnamed: 0,label,norm,match_id,gait_duration,mean_acceleration,max_acceleration,min_acceleration,std_deviation,cumulative_acceleration,acceleration_derivative
0,walk,"[23.177037336396975, 23.36152528524938, 21.535...",1,1.44,24.166958,42.738486,15.392703,5.965416,1740.020960,"[0.18448794885240716, -0.8208919667018133, 0.0..."
1,walk,"[20.998214383911275, 19.182798059840767, 18.27...",1,1.08,26.026276,53.767061,15.520896,8.479134,1405.418925,"[-1.8154163240705081, -1.362861460435882, 0.25..."
2,walk,"[21.95259682019565, 20.62720484424047, 22.5554...",1,1.36,23.998177,42.105998,14.484489,6.328823,1631.876005,"[-1.3253919759551813, 0.30140545360005433, -0...."
3,walk,"[19.39209748358647, 19.30460665293087, 18.9787...",1,1.50,22.596573,47.934160,14.867721,5.545197,1694.742968,"[-0.0874908306555966, -0.2066633027342828, 1.5..."
4,rest,"[22.069263037713093, 19.672270483203395, 19.88...",1,3.62,20.378778,22.069263,17.789805,0.739506,3688.558854,"[-2.3969925545096977, -1.0935564222211838, 0.7..."
...,...,...,...,...,...,...,...,...,...,...
1182,walk,"[23.337305769963503, 20.210987911153104, 25.06...",2,1.04,29.107628,65.462316,11.190245,10.920141,1513.596659,"[-3.126317858810399, 0.8634120080623671, 1.769..."
1183,walk,"[22.470322813933603, 22.552427730975246, 23.84...",2,1.28,26.673605,51.724442,11.657530,8.728731,1707.110690,"[0.08210491704164369, 0.6861501941098851, 0.75..."
1184,walk,"[43.833612705797144, 46.18045998580312, 37.492...",2,1.20,28.842287,89.365797,15.539368,12.171922,1730.537237,"[2.3468472800059743, -3.1706228003748116, -4.4..."
1185,walk,"[30.927599255773355, 31.26358258808756, 28.286...",2,1.32,25.121496,71.505826,7.309930,9.942930,1658.018739,"[0.3359833323142034, -1.3203278232681264, -2.8..."


In [33]:
from scipy.signal import find_peaks

# Helper function to compute the number of peaks in a gait's acceleration
def count_peaks(norm, threshold_ratio=0.5):
    """Count local maxima of ``norm`` that reach a fraction of its maximum.

    Parameters
    ----------
    norm : sequence of float
        Acceleration-norm samples of one gait.
    threshold_ratio : float, optional
        A peak is counted only if its value is at least
        ``threshold_ratio * max(norm)``. Defaults to 0.5 (50% of the maximum).

    Returns
    -------
    int
        Number of qualifying peaks; 0 for an empty sequence.
    """
    # np.max raises on an empty sequence, and an empty signal has no peaks anyway
    if len(norm) == 0:
        return 0
    peaks, _ = find_peaks(norm, height=np.max(norm) * threshold_ratio)
    return len(peaks)

# Helper function to compute the number of valleys in a gait's acceleration
def count_valleys(norm, threshold_ratio=0.5):
    """Count local minima of ``norm`` that dip to a fraction of its maximum.

    The signal is negated so that valleys become peaks for ``find_peaks``.
    Because the height threshold is negated as well, a valley is counted
    only if its value is at most ``threshold_ratio * max(norm)``.

    Parameters
    ----------
    norm : sequence of float
        Acceleration-norm samples of one gait.
    threshold_ratio : float, optional
        Fraction of the maximum a valley must not exceed. Defaults to 0.5.

    Returns
    -------
    int
        Number of qualifying valleys; 0 for an empty sequence.
    """
    # np.max raises on an empty sequence, and an empty signal has no valleys anyway
    if len(norm) == 0:
        return 0
    # Inverting the signal to find valleys
    inverted = [-n for n in norm]
    valleys, _ = find_peaks(inverted, height=-np.max(norm) * threshold_ratio)
    return len(valleys)

In [34]:
# Number of detected peaks and valleys per gait
for column, counter in (('num_peaks', count_peaks), ('num_valleys', count_valleys)):
    df[column] = df['norm'].apply(counter)

In [35]:
# Display df, which now includes the num_peaks and num_valleys columns
df

Unnamed: 0,label,norm,match_id,gait_duration,mean_acceleration,max_acceleration,min_acceleration,std_deviation,cumulative_acceleration,acceleration_derivative,num_peaks,num_valleys
0,walk,"[23.177037336396975, 23.36152528524938, 21.535...",1,1.44,24.166958,42.738486,15.392703,5.965416,1740.020960,"[0.18448794885240716, -0.8208919667018133, 0.0...",14,10
1,walk,"[20.998214383911275, 19.182798059840767, 18.27...",1,1.08,26.026276,53.767061,15.520896,8.479134,1405.418925,"[-1.8154163240705081, -1.362861460435882, 0.25...",9,10
2,walk,"[21.95259682019565, 20.62720484424047, 22.5554...",1,1.36,23.998177,42.105998,14.484489,6.328823,1631.876005,"[-1.3253919759551813, 0.30140545360005433, -0....",12,14
3,walk,"[19.39209748358647, 19.30460665293087, 18.9787...",1,1.50,22.596573,47.934160,14.867721,5.545197,1694.742968,"[-0.0874908306555966, -0.2066633027342828, 1.5...",6,14
4,rest,"[22.069263037713093, 19.672270483203395, 19.88...",1,3.62,20.378778,22.069263,17.789805,0.739506,3688.558854,"[-2.3969925545096977, -1.0935564222211838, 0.7...",55,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1182,walk,"[23.337305769963503, 20.210987911153104, 25.06...",2,1.04,29.107628,65.462316,11.190245,10.920141,1513.596659,"[-3.126317858810399, 0.8634120080623671, 1.769...",8,13
1183,walk,"[22.470322813933603, 22.552427730975246, 23.84...",2,1.28,26.673605,51.724442,11.657530,8.728731,1707.110690,"[0.08210491704164369, 0.6861501941098851, 0.75...",5,11
1184,walk,"[43.833612705797144, 46.18045998580312, 37.492...",2,1.20,28.842287,89.365797,15.539368,12.171922,1730.537237,"[2.3468472800059743, -3.1706228003748116, -4.4...",4,12
1185,walk,"[30.927599255773355, 31.26358258808756, 28.286...",2,1.32,25.121496,71.505826,7.309930,9.942930,1658.018739,"[0.3359833323142034, -1.3203278232681264, -2.8...",1,14


In [36]:
import plotly.graph_objects as go

# Acceleration norm of the first gait in the merged frame
acceleration_data = df['norm'].iloc[0]

# Shared threshold: half of the gait's maximum acceleration
half_max = np.max(acceleration_data) * 0.5

# Peaks: local maxima reaching at least the threshold
peaks, _ = find_peaks(acceleration_data, height=half_max)

# Valleys: local maxima of the negated signal, i.e. local minima at or below the threshold
negated_signal = [-val for val in acceleration_data]
valleys, _ = find_peaks(negated_signal, height=-half_max)

# Line plot of the signal with the detected peaks (red) and valleys (blue) marked
fig = go.Figure()
fig.add_trace(go.Scatter(y=acceleration_data, mode='lines', name='Acceleration'))
for positions, marker_color, trace_name in ((peaks, 'red', 'Peaks'), (valleys, 'blue', 'Valleys')):
    fig.add_trace(
        go.Scatter(
            x=positions,
            y=[acceleration_data[pos] for pos in positions],
            mode='markers',
            marker=dict(size=8, color=marker_color),
            name=trace_name,
        )
    )

fig.update_layout(
    title='Acceleration Data with Detected Peaks',
    xaxis_title='Time (20ms intervals)',
    yaxis_title='Acceleration Norm',
    template="plotly_dark",
)
fig.show()


We can see the 14 peaks and the 10 valleys calculated earlier for the first row of the dataset.

In [37]:
# Per-label summary statistics of the peak and valley counts.
# Several columns must be selected with a list; the bare-tuple form is deprecated.
df.groupby('label')[['num_peaks', 'num_valleys']].describe()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,num_peaks,num_peaks,num_peaks,num_peaks,num_peaks,num_peaks,num_peaks,num_peaks,num_valleys,num_valleys,num_valleys,num_valleys,num_valleys,num_valleys,num_valleys,num_valleys
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
cross,4.0,1.25,0.5,1.0,1.0,1.0,1.25,2.0,4.0,11.25,4.573474,6.0,8.25,11.5,14.5,16.0
dribble,78.0,3.192308,2.156583,0.0,1.0,3.0,5.0,11.0,78.0,9.24359,4.607126,0.0,6.0,9.0,12.0,23.0
no action,2.0,3.5,2.12132,2.0,2.75,3.5,4.25,5.0,2.0,36.5,20.506097,22.0,29.25,36.5,43.75,51.0
pass,32.0,3.34375,2.390969,1.0,1.75,3.0,4.25,10.0,32.0,10.03125,6.664709,0.0,5.75,8.0,15.25,26.0
rest,35.0,17.285714,20.23621,2.0,5.0,10.0,20.0,91.0,35.0,18.657143,36.745502,0.0,0.0,7.0,22.5,210.0
run,552.0,3.483696,2.422615,0.0,1.0,3.0,5.0,13.0,552.0,8.400362,3.717453,0.0,6.0,8.0,10.0,24.0
shot,18.0,2.944444,2.154946,0.0,1.25,2.5,4.0,8.0,18.0,8.333333,5.12204,1.0,4.0,8.0,12.25,18.0
tackle,31.0,2.548387,1.670104,1.0,1.0,2.0,3.5,7.0,31.0,11.741935,6.74768,3.0,7.5,9.0,16.0,30.0
walk,435.0,5.186207,3.109343,1.0,3.0,5.0,7.0,20.0,435.0,11.558621,4.936289,0.0,9.0,11.0,14.0,45.0


"Rest" has a high average number of peaks and valleys, indicating frequent but small movements, whereas actions like "Dribble" and "Pass" have fewer peaks and valleys, reflecting more distinct and pronounced movements.