# Basic Data Preprocessing

In [None]:
import pandas as pd

# Read the two CSV inputs: the season/game table and the skater table.
# NOTE(review): `nba_data` holds the NHL season file despite its name, and the
# first path really ends in ".csv.csv" — confirm both are intentional.
nba_data, player_mvp_stats = (
    pd.read_csv(csv_path)
    for csv_path in ('nhl__season_data.csv.csv', 'skaters_nhl_data.csv')
)


In [None]:
# Identity lookup of the team abbreviations that are kept; a code that is not
# listed here has no entry and therefore maps to NaN under Series.map.
team_mapping = {
    code: code
    for code in (
        'ANA', 'BOS', 'CGY', 'DET', 'EDM', 'FLA', 'LAK', 'MTL',
        'NYR', 'OTT', 'PHI', 'PIT', 'SJS', 'TBL', 'TOR', 'VAN',
    )
}


In [None]:
# Pass the skater team codes through team_mapping; codes without an entry in
# the mapping become NaN.
player_mvp_stats = player_mvp_stats.assign(Tm=player_mvp_stats['Tm'].map(team_mapping))


In [None]:
# Inner-join the game rows with the skater rows on the team code ('team' on
# the game side, 'Tm' on the skater side). Each game row is paired with every
# skater row that carries the same code.
combined_data = nba_data.merge(
    player_mvp_stats,
    how='inner',
    left_on='team',
    right_on='Tm',
)


In [None]:
# Preview the first five merged rows
combined_data.head(n=5)


Unnamed: 0,date,team,team_opp,goals_scored,goals_scored_opp,shots_on_goal,shots_on_goal_opp,power_play_goals,penalty_kill_goals,faceoff_win_percentage,...,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts
0,2021-01-13,EDM,FLA,3,5,33,21,0,0,42.8,...,3.78,3.56,22,36,0.0,0.0,0,0,0,0
1,2021-01-13,EDM,FLA,3,5,33,21,0,0,42.8,...,108.26,104.2,1819,2098,0.0,0.0,0,0,0,0
2,2021-01-13,EDM,FLA,3,5,33,21,0,0,42.8,...,78.87,74.04,1596,1633,0.69,0.94,13,28,10,19
3,2021-01-13,EDM,FLA,3,5,33,21,0,0,42.8,...,0.0,0.0,0,0,0.0,0.0,0,0,0,0
4,2021-01-13,EDM,FLA,3,5,33,21,0,0,42.8,...,12.7,1.08,121,18,0.0,0.26,0,2,0,2


In [None]:
# Keep just the skater name and skater-side team code of every merged row
player_and_team_data = combined_data.loc[:, ['Player', 'Tm']]
print(player_and_team_data)


               Player   Tm
0       Alex Chiasson  EDM
1       Alex Chiasson  EDM
2       Alex Chiasson  EDM
3       Alex Chiasson  EDM
4       Alex Chiasson  EDM
...               ...  ...
29935  Joel Edmundson  MTL
29936  Joel Edmundson  MTL
29937  Joel Edmundson  MTL
29938  Joel Edmundson  MTL
29939  Joel Edmundson  MTL

[29940 rows x 2 columns]


In [None]:
# Keep just the game-side team code and its opponent for every merged row
team_and_team_opp_data = combined_data.loc[:, ['team', 'team_opp']]
print(team_and_team_opp_data)


      team team_opp
0      EDM      FLA
1      EDM      FLA
2      EDM      FLA
3      EDM      FLA
4      EDM      FLA
...    ...      ...
29935  MTL      VAN
29936  MTL      VAN
29937  MTL      VAN
29938  MTL      VAN
29939  MTL      VAN

[29940 rows x 2 columns]


# Adding Weights

In [None]:
# Get all player names
unique_players = combined_data['Player'].unique()

# `unique()` hands back a NumPy array, and pandas' 'display.max_rows' option
# has no influence on how such an array is printed. Render the names through
# Series.to_string instead: it prints every entry and leaves the global
# display options untouched.
print(pd.Series(unique_players, name='Player').to_string(index=False))


['Alex Chiasson' 'Ryan McLeod' 'Connor McDavid' 'Tyler Ennis'
 'Ryan Nugent-Hopkins' 'Ethan Bear' 'Patrick Russell' 'William Lagesson'
 'Caleb Jones' 'Dominik Kahun' 'Dmitry Kulikov' 'Evan Bouchard'
 'Zack Kassian' 'Leon Draisaitl' 'Kris Russell' 'Jujhar Khaira'
 'Joakim Nygard' 'James Neal' 'Slater Koekkoek' 'Devin Shore'
 'Adam Larsson' 'Tyson Barrie' 'Darnell Nurse' 'Josh Archibald'
 'Jesse Puljujarvi' 'Gaetan Haas' 'Kailer Yamamoto' 'Kyle Turris'
 'Nikita Gusev' 'MacKenzie Weegar' 'Alex Wennberg' 'Brandon Montour'
 'Radko Gudas' 'Brady Keeper' 'Kevin Connauton' 'Eetu Luostarinen'
 'Mason Marchment' 'Juho Lammikko' 'Aleksi Heponiemi' 'Noah Juulsen'
 'Patric Hornqvist' 'Owen Tippett' 'Aleksander Barkov' 'Anthony Duclair'
 'Gustav Forsling' 'Noel Acciari' 'Matt Kiersted' 'Lucas Wallmark'
 'Ryan Lomberg' 'Carter Verhaeghe' 'Sam Bennett' 'Frank Vatrano'
 'Anton Stralman' 'Grigori Denisenko' 'Keith Yandle' 'Aaron Ekblad'
 'Markus Nutivaara' 'Jonathan Huberdeau' 'Par Lindholm' 'Brandon Ca

In [None]:
# Sample mapping of player names to their corresponding weights.
# Keys are matched against the 'Player' column by exact string equality when
# the table is used with Series.map below.
# NOTE(review): 'Ryan O Reilly' is written without an apostrophe — confirm the
# skater file spells the name the same way, otherwise this entry never matches.
# NOTE(review): some names here may not appear in the skater file at all
# (e.g. goaltenders) — verify against the data.
player_weights = {
    'Connor McDavid': 0.98,
    'Leon Draisaitl': 0.97,
    'Nathan MacKinnon': 0.96,
    'Auston Matthews': 0.96,
    'David Pastrnak': 0.95,
    'Cale Makar': 0.95,
    'Alex Ovechkin': 0.94,
    'Sidney Crosby': 0.94,
    'Andrei Vasilevskiy': 0.93,
    'Igor Shesterkin': 0.93,
    'Victor Hedman': 0.92,
    'Mikko Rantanen': 0.92,
    'Brad Marchand': 0.91,
    'Mitch Marner': 0.91,
    'Patrick Kane': 0.90,
    'Jonathan Huberdeau': 0.90,
    'Artemi Panarin': 0.89,
    'Adam Fox': 0.89,
    'Jack Eichel': 0.88,
    'Aleksander Barkov': 0.88,
    'Kirill Kaprizov': 0.88,
    'Matthew Tkachuk': 0.87,
    'Steven Stamkos': 0.87,
    'Roman Josi': 0.86,
    'Mark Scheifele': 0.86,
    'Sebastian Aho': 0.85,
    'John Tavares': 0.85,
    'Mika Zibanejad': 0.84,
    'Kyle Connor': 0.84,
    'Seth Jones': 0.83,
    'Jake Guentzel': 0.83,
    'Johnny Gaudreau': 0.82,
    'Elias Pettersson': 0.82,
    'Quinn Hughes': 0.81,
    'Evgeni Malkin': 0.81,
    'Ryan O Reilly': 0.80,
    'Gabriel Landeskog': 0.80,
    'Anze Kopitar': 0.79,
    'Darnell Nurse': 0.79,
    'Shea Theodore': 0.78,
    'Tyler Seguin': 0.78,
    'Brent Burns': 0.77,
    'Patrik Laine': 0.77,
    'Dougie Hamilton': 0.76,
    'Pierre-Luc Dubois': 0.76,
    'J.T. Miller': 0.75,
    'Tomas Hertl': 0.75,
    'William Nylander': 0.74,
    'Alex Pietrangelo': 0.74,
    'Zach Werenski': 0.73,
    'Ryan Nugent-Hopkins': 0.73,
    'Timo Meier': 0.72,
    'Max Pacioretty': 0.72,
    'Filip Forsberg': 0.71,
    'Blake Wheeler': 0.71,
    'John Carlson': 0.70,
    'Kevin Fiala': 0.70,
    'Nick Suzuki': 0.69,
    'Brock Boeser': 0.69,
    'Travis Konecny': 0.68,
    'Anders Lee': 0.68,
    'Dylan Larkin': 0.67,
    'Sam Reinhart': 0.67
}

# Add player weights to combined_data.
# Rows whose 'Player' value has no key in player_weights receive NaN here.
combined_data['player_weight'] = combined_data['Player'].map(player_weights)


In [None]:
# Weight given to every player that has no entry in player_weights
default_weight = 0.5

# Look each name up in the table, falling back to the default when it is absent
combined_data['player_weight'] = combined_data['Player'].map(
    lambda name: player_weights.get(name, default_weight)
)

print(combined_data[['Player', 'player_weight']])


               Player  player_weight
0       Alex Chiasson            0.5
1       Alex Chiasson            0.5
2       Alex Chiasson            0.5
3       Alex Chiasson            0.5
4       Alex Chiasson            0.5
...               ...            ...
29935  Joel Edmundson            0.5
29936  Joel Edmundson            0.5
29937  Joel Edmundson            0.5
29938  Joel Edmundson            0.5
29939  Joel Edmundson            0.5

[29940 rows x 2 columns]


In [None]:

# Spot-check two entries of the weight table; a missing name yields a
# placeholder message instead of a number.
crosby_weight = (
    player_weights['Sidney Crosby']
    if 'Sidney Crosby' in player_weights
    else "Player not found"
)
mackinnon_weight = (
    player_weights['Nathan MacKinnon']
    if 'Nathan MacKinnon' in player_weights
    else "Player not found"
)

# Print the weights, one per line
print(
    f"Weight for Sidney Crosby: {crosby_weight}",
    f"Weight for Nathan MacKinnon: {mackinnon_weight}",
    sep="\n",
)

Weight for Sidney Crosby: 0.94
Weight for Nathan MacKinnon: 0.96


In [None]:
# Multiplier applied to each listed team-statistic column of combined_data
team_weights = {
  'goals_scored': 0.833,
  'shots_on_goal': 1.0,
  'save_percentage': 0.667,
  'faceoff_win_percentage': 0.417,
  'power_play_goals': 1.0,
  'power_play_opportunities': 0.917,
  'shots_on_goal_opp': 0.75,
  'penalty_kill_goals': 0.75
}

# Derive every weighted column from an untouched '<col>_unweighted' copy that
# is captured the first time the cell runs. Re-running the cell (or editing a
# weight and re-running) therefore never compounds the multipliers.
# NOTE(review): the training cell standardises these columns with
# StandardScaler, which cancels a constant positive multiplier — confirm the
# weights are meant to influence the model.
for col, weight in team_weights.items():
  raw_col = f'{col}_unweighted'
  if raw_col not in combined_data.columns:
    combined_data[raw_col] = combined_data[col]
  combined_data[col] = combined_data[raw_col] * weight

print(combined_data.head())


         date team team_opp  goals_scored  goals_scored_opp  shots_on_goal  \
0  2021-01-13  EDM      FLA         2.499                 5           33.0   
1  2021-01-13  EDM      FLA         2.499                 5           33.0   
2  2021-01-13  EDM      FLA         2.499                 5           33.0   
3  2021-01-13  EDM      FLA         2.499                 5           33.0   
4  2021-01-13  EDM      FLA         2.499                 5           33.0   

   shots_on_goal_opp  power_play_goals  penalty_kill_goals  \
0              15.75               0.0                 0.0   
1              15.75               0.0                 0.0   
2              15.75               0.0                 0.0   
3              15.75               0.0                 0.0   
4              15.75               0.0                 0.0   

   faceoff_win_percentage  ...  OffIce_A_xGoals  OffIce_F_shotAttempts  \
0                 17.8476  ...             3.56                     22   
1         

In [None]:
# Inspect the target column before converting it
print(combined_data.loc[:, 'won'])


0        False
1        False
2        False
3        False
4        False
         ...  
29935    False
29936    False
29937    False
29938    False
29939    False
Name: won, Length: 29940, dtype: bool


In [None]:
# Cast the 'won' column to integers; all other columns keep their dtype
combined_data = combined_data.astype({'won': int})


In [None]:
# Confirm the target column after the integer cast
print(combined_data.loc[:, 'won'])


0        0
1        0
2        0
3        0
4        0
        ..
29935    0
29936    0
29937    0
29938    0
29939    0
Name: won, Length: 29940, dtype: int64


In [None]:
# The preceding cell already prints this column; show how many rows carry each
# label instead, so the class balance is visible before training.
print(combined_data['won'].value_counts())


0        0
1        0
2        0
3        0
4        0
        ..
29935    0
29936    0
29937    0
29938    0
29939    0
Name: won, Length: 29940, dtype: int64


# Feedforward Neural Network (Keras Functional API)

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import regularizers

# Extract features and target variable.
# The column order below is the order the scaler is fitted on; anything that
# later calls scaler.transform must supply the columns in this same order.
# NOTE(review): the merge repeats each game row once per skater on the team,
# so identical game-level feature rows can land in both the train and the test
# split, and 'goals_scored' is closely tied to the 'won' label — the reported
# validation accuracy is likely optimistic.
X = combined_data[['goals_scored', 'shots_on_goal', 'save_percentage', 'faceoff_win_percentage', 'power_play_goals', 'penalty_kill_goals', 'power_play_opportunities', 'shots_on_goal_opp', 'player_weight']]
y = combined_data['won']  # Use the 'won' column as the target variable

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation, dropout and batch shuffling repeat from run to run
# (the split below is already pinned through random_state).
tf.keras.utils.set_random_seed(42)

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data preprocessing: fit the scaler on the training rows only, then reuse it
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Scale the features
X_test_scaled = scaler.transform(X_test)

# Second model input: the unscaled player weight as an explicit
# (n_samples, 1) array, matching Input(shape=(1,)) below.
player_weight_train = X_train[['player_weight']].to_numpy()
player_weight_test = X_test[['player_weight']].to_numpy()

# Define the model architecture using the Functional API
num_features = X_train_scaled.shape[1]

input_layer = tf.keras.layers.Input(shape=(num_features,))
hidden1 = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.5))(input_layer)
dropout1 = tf.keras.layers.Dropout(0.3)(hidden1)
hidden2 = tf.keras.layers.Dense(64, activation='relu')(dropout1)
dropout2 = tf.keras.layers.Dropout(0.3)(hidden2)
hidden3 = tf.keras.layers.Dense(32, activation='relu')(dropout2)
# The raw player weight bypasses the hidden stack and joins just before the output
player_weight_input = tf.keras.layers.Input(shape=(1,))
merged = tf.keras.layers.Concatenate()([hidden3, player_weight_input])
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

model = tf.keras.models.Model(inputs=[input_layer, player_weight_input], outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
model.fit([X_train_scaled, player_weight_train], y_train,
          epochs=1, batch_size=4,
          validation_data=([X_test_scaled, player_weight_test], y_test))

# Prediction
# For new predictions, you would preprocess the new data (let's call it new_data) in the same way as X_train
# Example:
# new_data_scaled = scaler.transform(new_data_features)
# predictions = model.predict([new_data_scaled, new_data_player_weight])


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 9)]                  0         []                            
                                                                                                  
 dense (Dense)               (None, 128)                  1280      ['input_1[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 128)                  0         ['dense[0][0]']               
                                                                                                  
 dense_1 (Dense)             (None, 64)                   8256      ['dropout[0][0]']             
                                                                                              

<keras.src.callbacks.History at 0x7f1232776dd0>

In [None]:
import numpy as np


In [None]:
# Function to retrieve team data
def get_team_data(team_name, combined_data):
    """Average a team's model features over its own rows.

    Parameters
    ----------
    team_name : str
        Team code to look up in the 'team' column.
    combined_data : pandas.DataFrame
        Merged frame holding 'team', 'Tm' and the feature columns listed below.

    Returns
    -------
    tuple
        ``(team_data_mean, average_player_weight)``: a Series of per-feature
        means, and the mean 'player_weight' over rows whose 'Tm' equals
        ``team_name``.

    Raises
    ------
    ValueError
        If no row has ``team_name`` in the 'team' column.
    """
    # Same order as the feature frame the scaler is fitted on in the training
    # cell. Callers convert this Series to a bare array before scaling, so a
    # different order would apply the wrong mean/std to some columns.
    numeric_columns = [
        'goals_scored', 'shots_on_goal', 'save_percentage', 'faceoff_win_percentage',
        'power_play_goals', 'penalty_kill_goals', 'power_play_opportunities',
        'shots_on_goal_opp', 'player_weight',
    ]

    # Only rows where this team is 'team' describe its own performance; in
    # rows where it is 'team_opp' the stat columns belong to the other side.
    team_data_filtered = combined_data[combined_data['team'] == team_name]

    if team_data_filtered.empty:
        raise ValueError(f"No data found for team: {team_name}")

    team_data_mean = team_data_filtered[numeric_columns].mean()
    average_player_weight = combined_data.loc[combined_data['Tm'] == team_name, 'player_weight'].mean()
    return team_data_mean, average_player_weight

def _team_features_for_scaler(team_means, scaler):
    """Lay out a team's mean features the way the fitted scaler expects them."""
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is None:
        # The scaler carries no column names: keep the incoming order.
        return np.array([team_means])
    # Select by name so the column order always matches the fit; a missing
    # feature raises KeyError instead of being scaled as something else.
    return team_means.loc[list(fitted_names)].to_frame().T


# Function to predict game outcome
def predict_game_outcome(team1_name, team2_name, combined_data, scaler, model):
    """Predict which of two teams wins, from each team's average features.

    Parameters
    ----------
    team1_name, team2_name : str
        Team codes passed to ``get_team_data``.
    combined_data : pandas.DataFrame
        Merged frame passed through to ``get_team_data``.
    scaler : fitted scaler
        Used via ``scaler.transform`` on one row of features per team.
    model : two-input model
        Called as ``model.predict([scaled_features, player_weight])``.

    Returns
    -------
    str
        ``"<winner> is predicted to win"``. The winner is the team with the
        higher predicted value; on an exact tie team two is reported.

    Raises
    ------
    ValueError
        From ``get_team_data`` when a team has no rows, or here when a team's
        average player weight is NaN.
    """
    team1_data, team1_player_weight = get_team_data(team1_name, combined_data)
    team2_data, team2_player_weight = get_team_data(team2_name, combined_data)

    # A NaN weight would propagate into a NaN prediction and silently pick team two
    for name, weight in ((team1_name, team1_player_weight), (team2_name, team2_player_weight)):
        if np.isnan(weight):
            raise ValueError(f"No player data found for team: {name}")

    # Scale features (one row per team, columns in the scaler's fitted order)
    features_team1_scaled = scaler.transform(_team_features_for_scaler(team1_data, scaler))
    features_team2_scaled = scaler.transform(_team_features_for_scaler(team2_data, scaler))

    # Reshape player weight for input: a single row with a single column
    team1_weight = np.array([[team1_player_weight]])
    team2_weight = np.array([[team2_player_weight]])

    # Predict using the model and reduce each result to a plain float
    prediction_team1 = float(np.ravel(model.predict([features_team1_scaled, team1_weight]))[0])
    prediction_team2 = float(np.ravel(model.predict([features_team2_scaled, team2_weight]))[0])

    # Compare the two teams with each other so the answer depends on both sides
    winner = team1_name if prediction_team1 > prediction_team2 else team2_name

    return f"{winner} is predicted to win"

# input team you want a match prediction for
# Team codes in team_mapping are upper-case abbreviations, so trim stray
# whitespace and upper-case what was typed before looking the teams up.
team1 = input("Enter team one: ").strip().upper()
team2 = input("Enter team two: ").strip().upper()

prediction = predict_game_outcome(team1, team2, combined_data, scaler, model)
print(prediction)

Enter team one: MTL
Enter team two: BOS
MTL is predicted to win


