In [None]:
import sys
sys.path.append('..//..')

from models.linear import LogisticRegressionClassifier
from joblib import load
from utils.data import split_match_into_samples, data_generator
import numpy as np
import pandas as pd
import os
from os.path import exists

def Save_to_csv(df, filename):
    """Write a DataFrame to a CSV file, keeping its row index.

    Args:
        df: The pandas DataFrame to persist.
        filename: Destination path (or buffer) handed to ``DataFrame.to_csv``.
    """
    # index=True stores the row index as the first CSV column.
    df.to_csv(path_or_buf=filename, index=True)

In [None]:
def _edge_padded_rolling_mean(values, window):
    """Smooth a prediction series with a moving average padded at both ends.

    The result is the concatenation of three parts:
      * head: means of the growing prefixes ``values[:1] .. values[:window-1]``
      * body: means of every full window ``values[i:i+window]``
      * tail: means of the shrinking suffixes ``values[-(window-1):] .. values[-1:]``

    For a series at least as long as the window this yields
    ``len(values) + window - 1`` entries.

    NOTE(review): ``np.mean`` reduces over *every* element of a slice, so this
    assumes ``values`` holds one probability per sample (1-D). If the classifier
    returned an sklearn-style ``(n_samples, n_classes)`` array, both class
    columns would be averaged together -- TODO confirm the classifier's output.
    """
    head = np.array([np.mean(values[:i]) for i in range(1, window)])
    body = np.array([np.mean(values[i:i + window]) for i in range(len(values) - window + 1)])
    tail = np.array([np.mean(values[-i:]) for i in range(window - 1, 0, -1)])
    return np.concatenate([head, body, tail])


def _predict_with_warmup(windows, zero_window, window_length_lookback, clf):
    """Predict for one team's windows, preceded by synthetic warm-up windows.

    Warm-up window ``i`` (1 .. window_length_lookback) consists of the scaled
    "all zero" rows followed by the first ``i`` rows of the first real window,
    so predictions exist before a full lookback window is available.

    NOTE(review): when the windows have exactly ``window_length_lookback`` rows,
    the last warm-up window is identical to the first real window, so that
    prediction appears twice in the returned series -- confirm this is intended.
    """
    first_window = windows[0]
    warmup = np.stack([
        np.concatenate([zero_window[:-i], first_window[:i]], axis=0)
        for i in range(1, window_length_lookback + 1)
    ], axis=0)

    # The classifier is fed flattened windows: one row per sample.
    warmup_pred = clf.predict_proba(warmup.reshape(len(warmup), -1))
    actual_pred = clf.predict_proba(windows.reshape(len(windows), -1))
    return np.concatenate([warmup_pred, actual_pred], axis=0)


def process_single_match(match_path, filename, window_length_lookback, window_length_outlook, hidden, PI_rank, pi_list_selected, scaler, clf, full_pi_list=None):
    """Predict smoothed goal probabilities for one match and save them as CSV.

    Steps: load the match, split it into samples, scale the selected
    performance indicators (PIs), predict for the first and second half of the
    samples (treated as Home and Away), smooth both prediction series, and
    write them to ``<filename>_predictions_<PI_rank>.csv``.

    Args:
        match_path: Path of the match file passed to ``data_generator``.
        filename: Match identifier used as prefix of the output CSV name.
        window_length_lookback: Number of intervals in each input window.
        window_length_outlook: Number of intervals in the prediction window;
            also used as the smoothing window length.
        hidden: Number of hidden intervals, forwarded to
            ``split_match_into_samples``.
        PI_rank: Ranking label (e.g. ``"rank1"``), used in the output file name.
        pi_list_selected: Names of the PIs the classifier was trained on, in
            the order of the last axis of the samples.
        scaler: Fitted scaler exposing ``transform``; it is applied to rows
            containing one column per entry of the full PI list.
        clf: Trained classifier exposing ``predict_proba``.
        full_pi_list: Complete, ordered PI list the scaler was fitted on.
            Defaults to the module-level ``pi_list``.

    Returns:
        None. The predictions are written to disk via ``Save_to_csv``.

    Raises:
        ValueError: If a selected PI is missing from the full PI list, or if
            the match yields too few samples to form a Home and an Away half.
    """
    if full_pi_list is None:
        # Fall back to the notebook-level list defined in the configuration cell.
        full_pi_list = pi_list

    unknown = [name for name in pi_list_selected if name not in full_pi_list]
    if unknown:
        raise ValueError(f"Performance indicators not found in the full PI list: {unknown}")
    # Column positions of the selected PIs inside the full PI list. Deriving
    # them here keeps them in sync with pi_list_selected by construction.
    indices = [full_pi_list.index(name) for name in pi_list_selected]

    # Load the match data using the given data generator
    match = next(iter(data_generator([match_path])))

    # Split the match into input samples (x), targets (unused here) and metadata (labels)
    x, _, labels = split_match_into_samples(
        match,
        window_length_lookback=window_length_lookback,
        window_length_outlook=window_length_outlook,
        hidden=hidden,
        pi_list=pi_list_selected,             # Select specific performance indicators
        training_goal="isGoal",               # Target label: whether a goal occurs
        folding=['any_above', 0],             # Custom sample filtering logic
        sample_rate=1                         # No downsampling
    )

    # The first half of the samples is treated as Home, the second as Away.
    half = len(labels) // 2
    if len(x) == 0 or half == 0:
        raise ValueError(f"Not enough samples could be extracted from {match_path!r}")

    # The scaler expects one column per PI of the full list, so embed the
    # selected PIs at their original positions, scale, and pick them out again.
    n_features = len(full_pi_list)
    x_extended = np.zeros((x.shape[0], x.shape[1], n_features))
    x_extended[:, :, indices] = x
    x_scaled = scaler.transform(x_extended.reshape(-1, n_features)).reshape(x_extended.shape)[:, :, indices]

    # Scaled representation of an "all zero" window, used to pad the warm-up
    # windows. Only a single window is needed, so only one is transformed.
    zero_window = scaler.transform(np.zeros((x.shape[1], n_features)))[:, indices]

    # --- Run predictions (warm-up windows followed by the real windows) ---
    pred_home = _predict_with_warmup(x_scaled[:half], zero_window, window_length_lookback, clf)
    pred_away = _predict_with_warmup(x_scaled[half:], zero_window, window_length_lookback, clf)

    # --- Smooth predictions with an edge-padded moving average ---
    # Both halves must have the same number of samples, otherwise building the
    # DataFrame below fails because the two columns differ in length.
    predictions = pd.DataFrame({
        "Home": _edge_padded_rolling_mean(pred_home, window_length_outlook),
        "Away": _edge_padded_rolling_mean(pred_away, window_length_outlook),
    })

    # Save predictions to CSV
    Save_to_csv(predictions, filename + "_predictions_" + PI_rank + ".csv")


In [None]:
# Logistic regression settings (per the original note, those of the trained models).
# Not referenced again in the cells shown here; kept for reference.
model_config = {
    "class_weight": "balanced",     # Adjusts weights inversely proportional to class frequencies (for imbalanced datasets)
    "n_jobs": -1,                   # Use all available CPU cores for parallel processing
    "max_iter": 5000                # Maximum number of iterations for solver convergence
}

# List of performance indicators (PIs) used in the study
pi_list = ['Shot', 'BP', 'BP3rd', 'BPBox', 'Goal', 'Cross','PassBox', 'Pass3rd', 'Corner', 'TackWon', 'OutpOpp', 'EntrBox', 'Entr3rd', 'Danger',
           'Shot_diff', 'BP_diff', 'BP3rd_diff', 'BPBox_diff', 'Goal_diff', 'Cross_diff','PassBox_diff', 'Pass3rd_diff', 'Corner_diff', 'TackWon_diff', 'OutpOpp_diff', 'EntrBox_diff', 'Entr3rd_diff', 'Danger_diff']

PI_rank = "rank1" ## Select the PI ranking level to apply; options: 'rank1', 'rank2', or 'rank3'

# One entry per supported ranking level: the pre-trained model to load, the PIs
# it uses (in model input order) and the number of input window intervals.
RANK_SETTINGS = {
    "rank3": {
        "model_path": "../models/ApplicationScenario_rank3_LR-Danger-Entr3rd_diff",
        "pi_list_selected": ["Entr3rd_diff", "Danger"],
        "window_length_lookback": 180,
    },
    "rank2": {
        "model_path": "../models/ApplicationScenario_rank2_LR-Danger_diff-TackWon",
        "pi_list_selected": ["TackWon", "Danger_diff"],
        "window_length_lookback": 60,
    },
    "rank1": {
        "model_path": "../models/ApplicationScenario_rank1_LR-OutpOpp_diff-TackWon",
        "pi_list_selected": ["OutpOpp_diff", "TackWon"],
        "window_length_lookback": 60,
    },
}

# Fail early on an unsupported ranking instead of hitting a NameError later on.
if PI_rank not in RANK_SETTINGS:
    raise ValueError(f"Unknown PI_rank {PI_rank!r}; expected one of {sorted(RANK_SETTINGS)}")

rank_settings = RANK_SETTINGS[PI_rank]

# Load the pre-trained model and PI subset for the selected ranking
clf = LogisticRegressionClassifier.load_from_path(rank_settings["model_path"])
pi_list_selected = rank_settings["pi_list_selected"]
# Positions of the selected PIs within pi_list. They are derived from the names
# so the two can never disagree (a hand-written name here used to be wrong).
indices = [pi_list.index(name) for name in pi_list_selected]
window_length_lookback = rank_settings["window_length_lookback"]   # Number of input window intervals

window_length_outlook = 36  # Number of prediction window intervals
hidden = 12                 # Number of hidden window intervals

# Predict event for an unseen match
# NOTE: joblib.load unpickles arbitrary objects; only load scaler files from a trusted source.
scaler = load("MinMaxScaler.pkl")   # Load feature scaling object
filename = "unseen_match"           # Define input match file name
match_path = "../data/"+filename    # Build full path to the match file

# Process the match with the selected model and parameters
process_single_match(
    match_path,                 # Path to the match data
    filename,                   # Match identifier
    window_length_lookback, 
    window_length_outlook, 
    hidden, 
    PI_rank, 
    pi_list_selected, 
    scaler,                     # Feature scaler for input normalization
    clf)                        # Trained classifier
