In [1]:
import os
import boto3
import bz2
import json
import io
import math
from tqdm import tqdm
import pandas as pd
import numpy as np

# Credentials are deliberately NOT assigned in the notebook. boto3 (and s3fs, which
# pandas uses for "s3://" paths) resolve them through the standard chain:
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY already present in the environment,
# the shared ~/.aws/credentials file, or an attached IAM role. Writing keys into a
# cell leaks them to anyone who can read the file or its version history, and
# overwrites whatever valid credentials the environment already provides.

# boto3 reads its default region from AWS_DEFAULT_REGION, not AWS_REGION.
# setdefault() keeps a region that is already configured in the environment.
os.environ.setdefault("AWS_DEFAULT_REGION", "us-east-1")  # Default aws region
# AWS_REGION was exported by the original cell; keep it for tools that read it.
os.environ.setdefault("AWS_REGION", os.environ["AWS_DEFAULT_REGION"])

S3_BUCKET = "shot-probability"
s3_client = boto3.client("s3")

In [2]:
# Explicit dtypes for columns that should not be left to pandas' inference.
# 'is_home' uses the nullable 'boolean' dtype, so rows where it is empty in the
# CSV are read as <NA> instead of forcing the column to object.
type_dict = {
    'is_home': 'boolean', 'competition': 'string', 'season': 'string',
    'player_name': 'string', 'player_id': 'int64'
}

# Each season is stored as CSV parts numbered 0 .. N_PARTS_PER_SEASON - 1.
N_PARTS_PER_SEASON = 19


def load_season(season, n_parts=N_PARTS_PER_SEASON):
    """Read all CSV parts of one season from S3 and stack them into one frame.

    Parameters
    ----------
    season : str
        Season label as used in the object keys, e.g. "2022-2023".
    n_parts : int
        Number of part files to read (parts ``0 .. n_parts - 1``).

    Returns
    -------
    pandas.DataFrame
        All parts concatenated in part order with a fresh 0..n-1 index.
    """
    parts = [
        pd.read_csv(f"s3://{S3_BUCKET}/merged_data/train_pl_{season}_{i}.csv", dtype=type_dict)
        for i in range(n_parts)
    ]
    return pd.concat(parts, ignore_index=True)


# Per-season frames keep their original names so other cells can still use them.
atk_22 = load_season("2022-2023")
atk_23 = load_season("2023-2024")
atk_24 = load_season("2024-2025")

# All seasons stacked in chronological order with one continuous index.
atk = pd.concat([atk_22, atk_23, atk_24], ignore_index=True)
atk

Unnamed: 0,r,theta,z,speed,GK_r,GK_theta,openGoal,DefDist0,DefAngle0,OffDist0,...,season,game,attack_merged,period,is_home,frameNum,periodGameClockTime,attack_team_id,player_name,player_id
0,36.632069,0.942201,1.34,0.000000,7.535576,0.752080,0.691115,2.443856,1.300309,5.508077,...,2022-2023,4436,1,0,True,2966,45.778632,7,Wilfried Zaha,480
1,36.655604,0.941318,1.52,1.198789,7.529676,0.750269,0.684063,2.423591,1.313809,5.544241,...,2022-2023,4436,1,0,True,2967,45.811999,7,Wilfried Zaha,480
2,36.693140,0.940377,1.52,1.528162,7.523800,0.748455,0.677959,2.413045,1.332753,5.594073,...,2022-2023,4436,1,0,True,2968,45.845366,7,Wilfried Zaha,480
3,36.716733,0.939497,1.64,1.198825,7.517949,0.746637,0.670869,2.397010,1.346562,5.630581,...,2022-2023,4436,1,0,True,2969,45.878732,7,Wilfried Zaha,480
4,36.740355,0.938618,1.64,1.198789,7.512801,0.744915,0.663881,2.383186,1.361139,5.666987,...,2022-2023,4436,1,0,True,2970,45.912099,7,Wilfried Zaha,480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15969111,4.577619,-0.887526,2.52,12.890469,1.685922,-1.739455,0.518198,1.342670,2.558754,3.138281,...,2024-2025,32374,259,1,,181721,5659.626097,119,Kristoffer Ajer,5051
15969112,4.188090,-0.851292,2.31,12.605491,1.765986,-1.732314,0.561067,1.659188,2.761870,3.028083,...,2024-2025,32374,259,1,,181722,5659.659463,119,Kristoffer Ajer,5051
15969113,3.805181,-0.807699,2.31,12.605113,1.844989,-1.725341,0.000000,1.297000,0.294928,2.976981,...,2024-2025,32374,259,1,,181723,5659.692830,119,Kristoffer Ajer,5051
15969114,3.416972,-0.754352,2.03,12.984193,1.926602,-1.717171,0.000000,0.857735,0.289670,2.996273,...,2024-2025,32374,259,1,,181724,5659.726197,119,Kristoffer Ajer,5051


In [3]:
# Games where no shots are recorded (likely due to error) are dropped entirely.
no_shot_games = [4554, 13525, 32127]
in_valid_game = ~atk['game'].isin(no_shot_games)

# Frames where no team has clear possession carry a missing 'is_home'.
has_possession = atk['is_home'].notna()

# Apply both filters in a single pass and renumber the surviving rows from 0.
atk = atk.loc[in_valid_game & has_possession].reset_index(drop=True)

In [4]:
# Feature groups, ordered so that later cells can evaluate cumulative prefixes
# (group 0, groups 0-1, groups 0-2, ...).
baseline_features = [
    ['r'],
    ['theta', 'z', 'speed'],
    ['openGoal', 'GK_r', 'GK_theta'],
    ['DefDist0', 'DefAngle0', 'OffDist0', 'OffAngle0'],
]

# Distance/angle columns for the players indexed 1..4, in the order
# Def distance, Def angle, Off distance, Off angle for each index.
players = [
    f'{prefix}{rank}'
    for rank in range(1, 5)
    for prefix in ('DefDist', 'DefAngle', 'OffDist', 'OffAngle')
]
baseline_features.append(players)

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import log_loss

# Baseline goal model: logistic regression on shot frames only, scored by
# cross-validated log loss for cumulative prefixes of `baseline_features`.
target = 'is_goal'
shots = atk[atk.is_shot].reset_index(drop=True)
groups = shots['game']
y = shots[target]

# The split depends only on y, the groups and the fixed seed -- never on the
# feature columns -- so it is computed once and reused for every feature set.
# Grouping by game keeps all frames of one game in the same fold; a zero array
# stands in for X because only its length is used to generate indices.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(sgkf.split(np.zeros(len(y), dtype=np.int8), y, groups=groups))

for i in range(len(baseline_features)):
    # Cumulative feature set: every column from groups 0..i, in group order.
    feat_i = [name for group in baseline_features[:i + 1] for name in group]
    X = shots[feat_i]

    fold_losses = []
    for fold_idx, (train_idx, test_idx) in enumerate(folds):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit the scaler on the training fold only so no test statistics leak in.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        lr_model = LogisticRegression(max_iter=1000, solver='lbfgs')
        lr_model.fit(X_train_scaled, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        y_prob = lr_model.predict_proba(X_test_scaled)[:, 1]
        loss = log_loss(y_test, y_prob)
        fold_losses.append(loss)

    # np.std defaults to the population standard deviation (ddof=0) across folds.
    mean_loss = np.mean(fold_losses)
    std_loss = np.std(fold_losses)
    print(f"Features up to index {i}: Mean CV log loss = {mean_loss:.5f} ± {std_loss:.5f}")

Features up to index 0: Mean CV log loss = 0.35601 ± 0.01058
Features up to index 1: Mean CV log loss = 0.35012 ± 0.00869
Features up to index 2: Mean CV log loss = 0.34035 ± 0.00864
Features up to index 3: Mean CV log loss = 0.33972 ± 0.00894
Features up to index 4: Mean CV log loss = 0.33713 ± 0.00855


In [12]:
# Same cross-validated logistic-regression baseline, fitted on every remaining
# frame of `atk` with 'hasShotsIn1s' as the label.
# NOTE(review): the column name suggests "a shot occurs within 1 s" -- confirm
# against the code that builds the merged data.
target = 'hasShotsIn1s'
groups = atk['game']
y = atk[target]

# The split depends only on y, the groups and the fixed seed -- never on the
# feature columns -- so it is computed once instead of once per feature set,
# which matters on a frame table of this size. A zero array stands in for X
# because only its length is used to generate indices.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(sgkf.split(np.zeros(len(y), dtype=np.int8), y, groups=groups))

for i in range(len(baseline_features)):
    # Cumulative feature set: every column from groups 0..i, in group order.
    feat_i = [name for group in baseline_features[:i + 1] for name in group]
    X = atk[feat_i]

    fold_losses = []
    for fold_idx, (train_idx, test_idx) in enumerate(folds):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit the scaler on the training fold only so no test statistics leak in.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        lr_model = LogisticRegression(max_iter=1000, solver='lbfgs')
        lr_model.fit(X_train_scaled, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        y_prob = lr_model.predict_proba(X_test_scaled)[:, 1]
        loss = log_loss(y_test, y_prob)
        fold_losses.append(loss)

    # np.std defaults to the population standard deviation (ddof=0) across folds.
    mean_loss = np.mean(fold_losses)
    std_loss = np.std(fold_losses)
    print(f"Features up to index {i}: Mean CV log loss = {mean_loss:.5f} ± {std_loss:.5f}")

Features up to index 0: Mean CV log loss = 0.02624 ± 0.00053
Features up to index 1: Mean CV log loss = 0.02594 ± 0.00053
Features up to index 2: Mean CV log loss = 0.02575 ± 0.00054
Features up to index 3: Mean CV log loss = 0.02525 ± 0.00055
Features up to index 4: Mean CV log loss = 0.02508 ± 0.00060
