In [2]:
import import_ipynb
import numpy as np
import pandas as pd

#import stat_functions

In [3]:
# Directory of the prepped CSV data and the file analysed in this notebook.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs as-is on the author's machine.
filepath = "C:/Users/trfit/OneDrive/Documents/Hobbies/CRG/stats 2025/prepped_data/"
file = "VL_Akron_data.csv"
# filepath already ends with "/", so plain concatenation gives the full CSV path
df = pd.read_csv(f"{filepath}{file}")


# Feature preparation

Jam Embeddings: | Jam_id | point_diff | lead | Trips | jammer_penalty_counter | blocker_penalty_counter | skater1_as_jammer |...| skatern_as_jammer | skater1_as_blocker |...| skatern_as_blocker |

curr_col = ['OP','Half','Jam',
            'LEAD', #'OP_LEAD',
            'Jammer', 'Pivot', 'Blocker_1', 'Blocker_2', 'Blocker_3',
            'Trips','OP_Trips', 
            'Jam Total', 'Game Total','OP_Jam Total', 'OP_Game Total',
            'Jammer_Box_1', 'Jammer_Box_2','Jammer_Box_3',
            'Pivot_Box_1','Pivot_Box_2','Pivot_Box_3',
            'Blocker_1_Box_1','Blocker_1_Box_2','Blocker_1_Box_3',
            'Blocker_2_Box_1','Blocker_2_Box_2','Blocker_2_Box_3',
            'Blocker_3_Box_1','Blocker_3_Box_2','Blocker_3_Box_3']

Ready: 
 - game_id: OP
 - points_for: Jam Total, Game Total
 - lead_for_us (0/1): boolean
 - our_jammer (string)
 - jam_duration: Trips
 - points_against: OP_Jam Total, OP_Game Total

Need work:
- jam_id: need to separate by game & by half
- our_blockers (list/JSON/string of 4 skater IDs): separate columns, int type

Don't have:
 - penalties_our_team (int)
 - penalties_their_team (int)


Box characters:
- `-` = Entered the box mid-jam.
- `S` = Entered between jams or continued from previous jam.
- `|` = Finished service.
- `+` = Combines `-` and `|`.
- `$` (dollar sign) = Combines `+` and `S`. Almost always paired with a `-` in the previous jam.
- `3` = Skater injured or needed sub, sits out 3 jams.

In [4]:
# Functions

def build_encoded_matrix(df, cols, skaters):
    """One-hot encode position columns and collapse them to one flag per skater.

    ``pd.get_dummies`` expands every column named in ``cols`` into
    ``<col>_<value>`` indicator columns. For each entry of ``skaters`` the
    indicators belonging to that skater are OR-ed into a single boolean column
    named ``str(skater)`` (True = the skater appeared in at least one of
    ``cols`` on that row), and the per-position indicators are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to encode; must contain the columns listed in ``cols``.
        NOTE: when a Series is passed instead, pandas ignores ``columns`` and
        names the indicator columns by value only, so no ``<col>_<value>``
        columns are found: the value-named indicators are kept and every
        ``str(skater)`` column comes out all False.
    cols : list of str
        Names of the position columns to encode (e.g. 'Pivot', 'Blocker_1').
    skaters : iterable
        Skater identifiers to create a combined column for.

    Returns
    -------
    pandas.DataFrame
        Columns that were not collapsed (in their original order) followed by
        one boolean column per skater, in the order given by ``skaters``.
    """
    df_encoded = pd.get_dummies(df, columns=cols)

    for skater in skaters:
        label = str(skater)
        # Indicator columns produced for this skater, one per position actually played
        position_flags = [
            f"{col}_{label}" for col in cols if f"{col}_{label}" in df_encoded.columns
        ]
        # An empty selection yields all False, i.e. the skater never appears in `cols`
        df_encoded[label] = df_encoded[position_flags].any(axis='columns')
        # Drop the per-position indicators in a single call rather than one by one
        df_encoded = df_encoded.drop(columns=position_flags)

    return df_encoded

In [24]:
# Create feature df
# Jam embeddings: jam_id | point_diff | lead | trips | jammer_penalty_counter |
# blocker_penalty_counter | <skater> (on track as pivot/blocker) ... | <skater>_jammer ...
# The raw `df` is only read here, never modified, so this cell can be re-run safely.
X_col = []
df_features = pd.DataFrame()

# Make jam_id: opponent + half + jam number identify a jam
jam_key_parts = [df[col].astype("string") for col in ['OP', 'Half', 'Jam']]
df_features['jam_id'] = jam_key_parts[0] + "_" + jam_key_parts[1] + "_" + jam_key_parts[2]

# Target and simple jam-level features
df_features['point_diff'] = df['Jam Total'] - df['OP_Jam Total']
# LEAD holds "X" when we had lead and is empty otherwise; comparing directly
# gives a bool column without the replace() downcasting FutureWarning
df_features['lead'] = df['LEAD'].eq("X").fillna(False).astype(bool)
df_features['trips'] = df['Trips']
X_col += ['lead', 'trips']

# Penalty counters: a box entry counts as 1 when it is '+', '-' or '$'.
# Empty cells and any other box symbol (e.g. 'S', '|', '3') count as 0.
penalty_symbols = ['+', '-', '$']
jammer_box_cols = ['Jammer_Box_1', 'Jammer_Box_2', 'Jammer_Box_3']
blocker_box_cols = ['Pivot_Box_1', 'Pivot_Box_2', 'Pivot_Box_3',
                    'Blocker_1_Box_1', 'Blocker_1_Box_2', 'Blocker_1_Box_3',
                    'Blocker_2_Box_1', 'Blocker_2_Box_2', 'Blocker_2_Box_3',
                    'Blocker_3_Box_1', 'Blocker_3_Box_2', 'Blocker_3_Box_3']
df_features['jammer_penalty_counter'] = df[jammer_box_cols].isin(penalty_symbols).sum(axis=1)
df_features['blocker_penalty_counter'] = df[blocker_box_cols].isin(penalty_symbols).sum(axis=1)
X_col += ['jammer_penalty_counter', 'blocker_penalty_counter']

# Blocker one-hot encoding: one boolean column per skater who appears as pivot/blocker
blocker_cols = ['Pivot', 'Blocker_1', 'Blocker_2', 'Blocker_3']
# dropna() keeps an empty slot from turning into an all-False "nan" feature
blocker_ids = pd.concat([df[col] for col in blocker_cols]).dropna().unique()
df_encoded = build_encoded_matrix(df[blocker_cols], blocker_cols, blocker_ids)
X_col += list(df_encoded.columns)
df_features = pd.concat([df_features, df_encoded], axis=1)

# Jammer one-hot encoding: "<skater>_jammer" is True on the jams that skater jammed.
# Compared directly against the Jammer column instead of going through
# build_encoded_matrix, which does not prefix columns when handed a Series.
for jammer in df['Jammer'].dropna().unique():
    jammer_feature = f"{jammer}_jammer"
    df_features[jammer_feature] = df['Jammer'].eq(jammer)
    X_col.append(jammer_feature)


  df_features['lead'] = df['LEAD'].replace({np.nan: False, "X": True}).infer_objects(copy=False)


In [6]:
from sklearn.linear_model import Ridge 
# Regularization strength passed to Ridge(alpha=...) when the model is created
alpha = 20.0 # you can tune this 

In [7]:
# Unfitted Ridge regressor; it is cross-validated and then fit on all rows further down
ridge_model = Ridge(alpha=alpha)

In [25]:
# Feature matrix: only the columns registered in X_col (jam_id and point_diff are left out)
X = df_features[X_col]
# Target: per-jam point differential (Jam Total - OP_Jam Total)
y = df_features['point_diff'] 

In [12]:
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.linear_model import Ridge
#from sklearn.pipeline import Pipeline
#from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, KFold
#from sklearn.metrics import mean_squared_error, r2_score

In [26]:

# Cross-validation to gauge performance
from sklearn.model_selection import cross_validate

# 5 shuffled folds with a fixed seed so the scores are reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Score both metrics from the same fold fits instead of cross-validating twice
cv_results = cross_validate(
    ridge_model, X, y, cv=cv,
    scoring=("neg_mean_squared_error", "r2"),
)

neg_mse_scores = cv_results["test_neg_mean_squared_error"]
# sklearn reports MSE negated (higher = better); flip the sign before the square root
rmse_scores = np.sqrt(-neg_mse_scores)
r2_scores = cv_results["test_r2"]

print("CV RMSE:", rmse_scores.mean(), "+/-", rmse_scores.std())
print("CV R^2:", r2_scores.mean(), "+/-", r2_scores.std())

#Fit the final model on all data:
ridge_model.fit(X, y)


CV RMSE: 4.689981150635498 +/- 0.830718314778299
CV R^2: 0.053019248134171894 +/- 0.43534882423356996


In [27]:
# Pair every fitted Ridge coefficient with its feature name, largest value first
coef_series = pd.Series(ridge_model.coef_, index=X_col)
coef_series = coef_series.sort_values(ascending=False)
coef_series

trips                      1.972550
lead                       1.268568
711                        0.545353
27_jammer                  0.487598
100_jammer                 0.466506
603                        0.382082
517_jammer                 0.356210
36                         0.253502
222                        0.239666
13                         0.112763
27                         0.081897
555_jammer                -0.006516
48                        -0.034897
721_jammer                -0.042636
6                         -0.084088
100                       -0.113796
517                       -0.122891
83                        -0.151202
555                       -0.197479
35                        -0.244260
800                       -0.277977
721                       -0.388674
6_jammer                  -0.431254
blocker_penalty_counter   -0.490507
711_jammer                -0.829908
jammer_penalty_counter    -1.040142
dtype: float64

In [22]:
# Number of fitted coefficients (one per entry in X_col)
len(coef_series)

27

In [28]:
# Keep only the coefficients of the "<skater>_jammer" features
is_jammer_feature = coef_series.index.str.endswith("_jammer")
jammer_effects = coef_series[is_jammer_feature]

In [30]:
# Display the "_jammer" coefficients, still sorted from largest to smallest
jammer_effects

27_jammer     0.487598
100_jammer    0.466506
517_jammer    0.356210
555_jammer   -0.006516
721_jammer   -0.042636
6_jammer     -0.431254
711_jammer   -0.829908
dtype: float64

In [44]:
# NOTE(review): df_test is only built in a cell further down; on a fresh kernel
# (Restart & Run All) this display raises NameError — move it below that cell.
df_test

Unnamed: 0,lead,trips,jammer_penalty_counter,blocker_penalty_counter,721,6,13,711,222,100,...,36,83,48,517_jammer,27_jammer,721_jammer,100_jammer,555_jammer,6_jammer,711_jammer
0,0,0,0,0,True,0,0,0,0,True,...,True,0,0,True,0,0,0,0,0,0


In [33]:
# Number of feature columns fed to the model
len(X.columns)

26

np.float64(-4.6801483238309896)

In [48]:
# What-if prediction: one synthetic jam with every feature at 0 except the chosen
# jammer and blockers.
# NOTE(review): lead, trips and the penalty counters stay at 0 here — confirm that
# a zero-trip jam without lead is the intended baseline for this comparison.
active_features = ['517_jammer', '721', '603', '36', '100']
baseline_row = pd.DataFrame([[0] * len(X_col)], columns=X_col)
df_test = baseline_row.assign(**dict.fromkeys(active_features, True))
ridge_model.predict(df_test)[0]

np.float64(-4.6801483238309896)

In [None]:
# Repeat of the what-if prediction from the previous cell: a single all-zero
# feature row with the selected jammer and blockers switched on.
lineup_features = ['517_jammer', '721', '603', '36', '100']
df_test = pd.DataFrame([[0] * len(X_col)], columns=X_col)
for feature in lineup_features:
    df_test[feature] = True
ridge_model.predict(df_test)[0]