In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
import tensorflow as tf
from keras.layers import Dense, LSTM, Dropout
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model

In [None]:
data = pd.read_csv('~/Desktop/SportAnalytics/data_with_zones.csv')

# Raw columns the model is trained on; each one gets a label-encoded 'le_<name>' twin below
raw_features = ['opposingteamid', 'teamid', 'compiledgametime', 'eventname', 'manpowersituation', 'outcome', 'type', 'zone', 'sequence_id']
le_features = [f'le_{column}' for column in raw_features]

# Target output
target = 'xg'

# Transform (encode) every feature column to integer labels.
# One fitted encoder per column is kept in `encoders`, so any column can be decoded
# again with encoders[<column>].inverse_transform(...).
encoders = {}
for column in raw_features:
    encoders[column] = LabelEncoder()
    data[f'le_{column}'] = encoders[column].fit_transform(data[column])

# Backward compatibility: `le` used to be a single encoder that ended up fitted on
# 'sequence_id' (the last column encoded).
le = encoders['sequence_id']

# From here on the model features are the encoded columns
features = le_features

# Fill "na"-values with zeros in xg column
data[target] = data[target].fillna(0)

# Ordered split (shuffle=False): the leading 80% of rows become train, the rest test.
# The split is taken BEFORE scaling so that the scaler statistics below are computed
# from the training rows only (fitting on all rows leaks test information).
train, test = train_test_split(data[features + [target]], test_size=0.2, random_state=42, shuffle=False)

# Normalize features: fit on the training rows, then apply the same transform to every row
scaler = StandardScaler()
scaler.fit(train[features])
data[features] = scaler.transform(data[features])

# Re-slice so train/test carry the scaled feature values (same rows as the split above)
train = data.loc[train.index, features + [target]]
test = data.loc[test.index, features + [target]]

In [22]:
def sequence_data(data, sequence_length, feature_cols=None, target_col=None,
                  sequence_col='le_sequence_id'):
    """Build one feature sequence per shot from the events that precede it.

    A row counts as a shot when its target value is > 0. For each shot, the rows
    before it that share its sequence id are located, and the feature rows from the
    chosen start position up to (but not including) the shot form the sequence.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns, the target column and `sequence_col`.
    sequence_length : int
        Maximum number of preceding same-sequence events to go back.
    feature_cols : list of str, optional
        Feature columns to extract. Defaults to the module-level `features`.
    target_col : str, optional
        Target column. Defaults to the module-level `target`.
    sequence_col : str, optional
        Column identifying which sequence a row belongs to.

    Returns
    -------
    sequences : numpy.ndarray (dtype=object)
        One 2-D array of feature rows per kept shot.
    targets : numpy.ndarray
        Target value of each kept shot.
    start_indices : numpy.ndarray
        Position of the first row of each sequence. These are positions within
        `data` as passed in (use `data.iloc[...]` on that same frame).

    Notes
    -----
    Shots with no preceding event in their sequence are skipped: there is nothing
    to feed the model, and indexing the empty match list used to raise IndexError.
    """
    if feature_cols is None:
        feature_cols = features
    if target_col is None:
        target_col = target

    # Pull the columns out once; per-row DataFrame.iloc access inside the loop is slow
    target_values = data[target_col].to_numpy()
    sequence_ids = data[sequence_col].to_numpy()
    feature_values = data[list(feature_cols)].to_numpy()

    sequences = []
    targets = []
    start_indices = []
    for i in np.flatnonzero(target_values > 0):
        # Positions of earlier rows with the same sequence id as the shot
        prior = np.flatnonzero(sequence_ids[:i] == sequence_ids[i])
        if len(prior) == 0:
            continue

        # Go back at most `sequence_length` same-sequence events
        seq_len = min(sequence_length, len(prior))
        start_index = prior[-seq_len]

        start_indices.append(start_index)
        # Everything from the start position up to, but excluding, the shot itself
        sequences.append(feature_values[start_index:i])
        targets.append(target_values[i])
    return np.array(sequences, dtype=object), np.array(targets), np.array(start_indices)

In [45]:
# Length of sequence before shot
sequence_length = 100

# Create train- and test sequences
x_train_seq, y_train_seq, train_start_indices = sequence_data(train, sequence_length)
x_test_seq, y_test_seq, test_start_indices = sequence_data(test, sequence_length)

# Encoded event name of the first event of every sequence.
# The start indices returned by sequence_data are positions within the frame that was
# passed in, so they are looked up in `train` / `test` respectively. Indexing the full
# `data` frame with the test positions would read rows from the beginning of the data.
train_entries = train.iloc[train_start_indices]['le_eventname'].values
test_entries = test.iloc[test_start_indices]['le_eventname'].values

# How often each encoded event name opens a sequence
train_entry_count = Counter(train_entries)
test_entry_count = Counter(test_entries)

# Pad sequences to have the same length
x_train_seq = pad_sequences(x_train_seq, maxlen=sequence_length, dtype='float32')
x_test_seq = pad_sequences(x_test_seq, maxlen=sequence_length, dtype='float32')

# Convert the NumPy arrays to tensors
x_train_seq = tf.convert_to_tensor(x_train_seq)
y_train_seq = tf.convert_to_tensor(y_train_seq)
x_test_seq = tf.convert_to_tensor(x_test_seq)
y_test_seq = tf.convert_to_tensor(y_test_seq)

# Define the input shape: (timesteps, features per timestep)
input_shape = (sequence_length, len(features))

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so that every
# "Restart & Run All" starts training from the same random state.
tf.keras.utils.set_random_seed(42)

# Create the RNN model: two stacked LSTM layers with dropout, then a small dense head.
# The sigmoid output keeps the predicted xg inside (0, 1).
model = Sequential()
model.add(LSTM(64, input_shape=input_shape, return_sequences=True))  # passes the full sequence on to the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='mean_squared_error', optimizer='adam')

# NOTE: the test split is also passed as validation_data, so val_loss during training
# and the final `mse` below are measured on the same rows.
fit_model = model.fit(x_train_seq, y_train_seq, epochs=100, batch_size=32, validation_data=(x_test_seq, y_test_seq), verbose=1)
mse = model.evaluate(x_test_seq, y_test_seq)

In [None]:
# Loss per epoch for the training data and for the validation (test) data
fig, ax = plt.subplots()
for history_key in ('loss', 'val_loss'):
    ax.plot(fit_model.history[history_key])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
# Fixed axis ranges: 100 epochs on x, losses up to 0.01 on y
ax.set_xlim(0, 100)
ax.set_ylim(0, 0.01)
ax.legend(['Train', 'Test'], loc='upper right')
plt.show()

In [None]:
# Predicted xg for every test sequence
y_pred = model.predict(x_test_seq)

# Flatten predictions and (tensor -> NumPy) targets to 1-D arrays
y_pred_flat = y_pred.flatten()
y_test_seq_flat = y_test_seq.numpy().flatten()

# Remove some "outliers": keep only pairs where both the true and the predicted
# value are at most 0.6
mask = np.logical_and(y_test_seq_flat <= 0.6, y_pred_flat <= 0.6)
y_test_out, y_pred_out = y_test_seq_flat[mask], y_pred_flat[mask]

In [50]:
# eventnames
unique_event_names = data['eventname'].unique()

# One "<event name>: <encoded value>" string per (name, encoded value) pair in the data
encoded_events = [
    f"{name}: {code}"
    for name in unique_event_names
    for code in data.loc[data['eventname'] == name, 'le_eventname'].unique()
]

# Split the pairs back into parallel lists: text before the first ':' is the name,
# the piece right after it (leading space included) is the encoded value
decoded_names = [pair.split(":")[0] for pair in encoded_events]
encoded_names = [pair.split(":")[1] for pair in encoded_events]


# zone
unique_zone_names = data['zone'].unique()

# Same construction for the zone column
encoded_zone_names = [
    f"{name}: {code}"
    for name in unique_zone_names
    for code in data.loc[data['zone'] == name, 'le_zone'].unique()
]

decoded_zone = [pair.split(":")[0] for pair in encoded_zone_names]
encoded_zone = [pair.split(":")[1] for pair in encoded_zone_names]

# High xG sequences

In [None]:
n = 100
total_eventname_count = Counter()
total_zone_count = Counter()

# Pair every test sequence with its predicted xg value
top_sequences = [(y_pred[i], x_test_seq[i]) for i in range(len(y_pred))]

# Sort by predicted xg, highest first. The sort key is the scalar prediction only:
# sorting the bare tuples falls through to comparing the sequence tensors element-wise
# whenever two predictions tie, and that comparison has no single truth value.
top_sequences = sorted(top_sequences, key=lambda pair: float(np.ravel(pair[0])[0]), reverse=True)

# Get the top n sequences (fewer if there are not n predictions)
top_n_sequences = top_sequences[:n]
n_selected = len(top_n_sequences)

# Feature columns holding the encoded event name and zone inside a sequence
event_col = features.index('le_eventname')
zone_col = features.index('le_zone')

# Numeric encoded values, parallel to decoded_names / decoded_zone
event_codes = np.array([float(code) for code in encoded_names], dtype=float)
zone_codes = np.array([float(code) for code in encoded_zone], dtype=float)

zone_counter = 0
name_counter = 0

# Decode the event names and zones of each selected sequence and count them
for _, sequence in top_n_sequences:
    steps = np.asarray(sequence)
    final_eventname_list = []  # decoded event names of this sequence
    final_zone_list = []       # decoded zones of this sequence

    for column, codes, names, decoded in (
        (event_col, event_codes, decoded_names, final_eventname_list),
        (zone_col, zone_codes, decoded_zone, final_zone_list),
    ):
        values = steps[:, column].astype(float)
        # Timesteps whose value is exactly 0 are treated as padding and skipped
        values = values[values != 0.0]
        if values.size == 0 or codes.size == 0:
            continue
        # Match each value to the numerically closest encoded value. The sequence holds
        # float32 copies, hence a small tolerance; values with no close match are dropped.
        nearest = np.abs(values[:, None] - codes[None, :]).argmin(axis=1)
        matched = np.isclose(codes[nearest], values, rtol=0.0, atol=1e-4)
        decoded.extend(names[k] for k in nearest[matched])

    name_counter += len(final_eventname_list)
    zone_counter += len(final_zone_list)
    total_eventname_count.update(Counter(final_eventname_list))
    total_zone_count.update(Counter(final_zone_list))

# Average over the sequences actually selected (guarding against an empty selection)
divisor = max(n_selected, 1)

# Average occurrences per sequence of each event name, most frequent first
avg_eventname_count = {event: count / divisor for event, count in total_eventname_count.items()}
sorted_avg_count = dict(sorted(avg_eventname_count.items(), key=lambda item: item[1], reverse=True))

# Average occurrences per sequence of each zone, most frequent first
avg_zone_count = {zone: count / divisor for zone, count in total_zone_count.items()}
sorted_zone_count = dict(sorted(avg_zone_count.items(), key=lambda item: item[1], reverse=True))

# Average number of decoded event names / zones per sequence
avg_name_len = name_counter / divisor
print(avg_name_len)

avg_zone_len = zone_counter / divisor
print(avg_zone_len)

# Low xG sequences

In [None]:
n = 100
total_count = Counter()
total_zone_count = Counter()

# Pair every test sequence with its predicted xg value
bottom_sequences = [(y_pred[i], x_test_seq[i]) for i in range(len(y_pred))]

# Sort by predicted xg, lowest first. The sort key is the scalar prediction only:
# sorting the bare tuples falls through to comparing the sequence tensors element-wise
# whenever two predictions tie, and that comparison has no single truth value.
bottom_sequences = sorted(bottom_sequences, key=lambda pair: float(np.ravel(pair[0])[0]), reverse=False)

# Get the bottom n sequences (fewer if there are not n predictions)
bottom_n_sequences = bottom_sequences[:n]
n_selected = len(bottom_n_sequences)

# Feature columns holding the encoded event name and zone inside a sequence
event_col = features.index('le_eventname')
zone_col = features.index('le_zone')

# Numeric encoded values, parallel to decoded_names / decoded_zone
event_codes = np.array([float(code) for code in encoded_names], dtype=float)
zone_codes = np.array([float(code) for code in encoded_zone], dtype=float)

zone_counter = 0
name_counter = 0

# Decode the event names and zones of each selected sequence and count them
for _, sequence in bottom_n_sequences:
    steps = np.asarray(sequence)
    final_list = []       # decoded event names of this sequence
    final_zone_list = []  # decoded zones of this sequence

    for column, codes, names, decoded in (
        (event_col, event_codes, decoded_names, final_list),
        (zone_col, zone_codes, decoded_zone, final_zone_list),
    ):
        values = steps[:, column].astype(float)
        # Timesteps whose value is exactly 0 are treated as padding and skipped
        values = values[values != 0.0]
        if values.size == 0 or codes.size == 0:
            continue
        # Match each value to the numerically closest encoded value. The sequence holds
        # float32 copies, hence a small tolerance; values with no close match are dropped.
        nearest = np.abs(values[:, None] - codes[None, :]).argmin(axis=1)
        matched = np.isclose(codes[nearest], values, rtol=0.0, atol=1e-4)
        decoded.extend(names[k] for k in nearest[matched])

    name_counter += len(final_list)
    zone_counter += len(final_zone_list)
    total_count.update(Counter(final_list))
    total_zone_count.update(Counter(final_zone_list))

# Average over the sequences actually selected (guarding against an empty selection)
divisor = max(n_selected, 1)

# Average occurrences per sequence of each event name, most frequent first
avg_count = {event: count / divisor for event, count in total_count.items()}
sorted_avg_count = dict(sorted(avg_count.items(), key=lambda item: item[1], reverse=True))
print(sorted_avg_count)

# Average occurrences per sequence of each zone, most frequent first
avg_zone_count = {zone: count / divisor for zone, count in total_zone_count.items()}
sorted_zone_count = dict(sorted(avg_zone_count.items(), key=lambda item: item[1], reverse=True))
print(sorted_zone_count)

# Average number of decoded zones / event names per sequence
avg_zone_len = zone_counter / divisor
print(avg_zone_len)

avg_name_len = name_counter / divisor
print(avg_name_len)

In [None]:
# True vs. predicted values for the filtered test samples
fig, ax = plt.subplots()
ax.scatter(y_test_out, y_pred_out, color='green')
ax.set_xlabel('True values')
ax.set_ylabel('Predicted values')
ax.set_xlim(0, 0.6)
ax.set_ylim(0, 0.3)

# Red: least-squares straight line through the points (degree-1 polyfit)
m, b = np.polyfit(y_test_out, y_pred_out, 1)
ax.plot(y_test_out, m*y_test_out+b, color='red')
# Blue: the identity line y = x, where a perfect prediction would fall
ax.plot(y_test_out, y_test_out, color='blue')

plt.show()

In [None]:
# Extremes of the filtered predictions and of the filtered true values
pred_max, pred_min = np.max(y_pred_out), np.min(y_pred_out)
true_max, true_min = np.max(y_test_out), np.min(y_test_out)

print(f"Max predicted goal rate: {pred_max:.10f}")
print(f"Min predicted goal rate: {pred_min:.10f}")

print(f"Max test goal rate: {true_max:.10f}")
print(f"Min test goal rate: {true_min:.10f}")

# Loss returned by model.evaluate on the test sequences
print("Mean Squared Error (MSE):", mse)