In [3]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load dataset
df = pd.read_csv('./NHL Datasets/All Skaters 08-25.csv')
df_cup_data = pd.read_csv('./NHL Datasets/Stanley_Cup_Winners.csv')

team_mapping = { 
    "T.B": "TBL",
    "N.J": "NJD",
    "L.A": "LAK",
    "S.J": "SJS" 
}

# Standardize team names in both datasets
df['team'] = df['team'].replace(team_mapping)
df_cup_data['winning_team'] = df_cup_data['winning_team'].replace(team_mapping)

# Columns of interest, including 'season' for splitting purposes
columns_of_interest = [
    'season', 'name', 'team', 'position', 'situation', 'games_played', 'I_F_missedShots', 'I_F_blockedShotAttempts',
    'icetime', 'shifts', 'gameScore', 'onIce_xGoalsPercentage', 'offIce_xGoalsPercentage', 
    'onIce_corsiPercentage', 'I_F_xOnGoal', 'I_F_xGoals', 'I_F_primaryAssists', 
    'I_F_secondaryAssists', 'I_F_shotsOnGoal', 'I_F_shotAttempts', 'I_F_points', 'I_F_goals', 'I_F_savedShotsOnGoal', 
    'penalties', 'I_F_faceOffsWon', 'I_F_hits', 'I_F_takeaways', 'I_F_giveaways', 'I_F_lowDangerShots', 
    'I_F_mediumDangerShots', 'I_F_highDangerShots', 'I_F_lowDangerxGoals', 'I_F_mediumDangerxGoals', 'I_F_highDangerxGoals',
    'I_F_lowDangerGoals', 'I_F_mediumDangerGoals', 'I_F_highDangerGoals', 'I_F_dZoneGiveaways', 
    'I_F_dZoneShiftStarts', 'I_F_neutralZoneShiftStarts', 'faceoffsWon', 'faceoffsLost', 'penaltiesDrawn', 'shotsBlockedByPlayer'
]

# Exclude unwanted columns for evaluation
columns_to_exclude = ['games_played']  
analysis_columns = [col for col in columns_of_interest if col not in columns_to_exclude]

df = df[columns_of_interest]

# Filter for 5v5 situation
df = df[df['situation'] == '5on5']

# Handle missing values
df.fillna(0, inplace=True)

# Feature engineering
df['total_shots'] = df['I_F_shotsOnGoal'] + df['I_F_missedShots'] + df['I_F_blockedShotAttempts']

# List of stats to normalize per game (removing 'I_F_oZoneShiftStarts')
stats_per_game = [
    'I_F_highDangerShots', 'I_F_savedShotsOnGoal', 
    'gameScore', 'onIce_xGoalsPercentage', 'offIce_xGoalsPercentage',
    'onIce_corsiPercentage', 'I_F_xOnGoal', 'I_F_xGoals', 'I_F_primaryAssists', 
    'I_F_secondaryAssists', 'I_F_shotsOnGoal', 'I_F_shotAttempts', 'I_F_points', 
    'I_F_goals', 'I_F_savedShotsOnGoal', 'penalties', 'I_F_faceOffsWon', 'I_F_hits', 
    'I_F_takeaways', 'I_F_giveaways'
]

# Normalize stats per game
for stat in stats_per_game:
    df[stat] = df[stat] / df['games_played']

df['faceoffPercentage'] = df['faceoffsWon'] / df['faceoffsLost']

# Split data into train (<=2023) and test (2024)
df_train = df[df['season'] <= 2023]
df_test = df[df['season'] == 2024]

# Define features and target variable
features = [
    'gameScore', 'onIce_xGoalsPercentage', 'I_F_xGoals', 'I_F_primaryAssists', 'I_F_secondaryAssists',
    'I_F_shotsOnGoal', 'I_F_shotAttempts', 'I_F_goals', 'I_F_points', 'I_F_faceOffsWon', 'I_F_hits',
    'I_F_takeaways', 'I_F_giveaways', 'I_F_highDangerxGoals'
]

df_cup_winners = df_train[df_train['team'].isin(df_cup_data['winning_team'])].copy()  
df_cup_winners['winner'] = 1
df_train = df_train.copy()
df_train['winner'] = 0

# Oversample winners
df_winners_upsampled = df_cup_winners.sample(len(df_train), replace=True)

# Combine with regular teams
df_balanced = pd.concat([df_train, df_winners_upsampled])

X_train = df_balanced[features]
y_train = df_balanced['winner']

X_test = df_test[features]
y_test = df_test.get('winner', pd.Series([0] * len(df_test)))

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "TensorFlow": None  
}

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
models["GradientBoosting"] = grid_search.best_estimator_

tf_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

tf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
tf_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

models["TensorFlow"] = tf_model

for name, model in models.items():
    if name != "TensorFlow":
        model.fit(X_train, y_train)

# Identify weaknesses
def identify_weakest_areas(team_df, league_df, top_n=10):
    numeric_cols = team_df.select_dtypes(include=['number']).columns
    relevant_cols = [col for col in numeric_cols if col in stats_per_game]

    team_avg = team_df[relevant_cols].mean()
    league_avg = league_df[relevant_cols].mean()

    weaknesses = (league_avg - team_avg).nlargest(top_n).index.tolist()
    return weaknesses

# Find trade target
def find_trade_target(df, team_name, weaknesses, models, percentile_threshold=50):
    team_players = df[df['team'] == team_name]
    df = df[(df['team'] != team_name) & (df['season'] == 2024)]

    for weakness in weaknesses:
        threshold_value = df[weakness].quantile(percentile_threshold / 100.0)
        df = df[df[weakness] >= threshold_value]

    if df.empty:
        return None

    df['predicted_fit'] = (models["GradientBoosting"].predict_proba(df[features])[:, 1] + 
                           models["RandomForest"].predict_proba(df[features])[:, 1] + 
                           models["LightGBM"].predict_proba(df[features])[:, 1]) / 3
    return df.nlargest(3, 'predicted_fit')

selected_team = "TOR"
weakest_stats = identify_weakest_areas(df[df['team'] == selected_team], df, top_n=5)
trade_recommendation = find_trade_target(df, selected_team, weakest_stats, models, percentile_threshold=50)

if trade_recommendation is not None:
    print(f"Weakest areas for {selected_team}: {weakest_stats}")
    print("Recommended trade targets:")
    print(trade_recommendation)
else:
    print("No suitable trade candidates found.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6250 - loss: 0.6642 - val_accuracy: 0.0330 - val_loss: 1.0623
Epoch 2/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6298 - loss: 0.6494 - val_accuracy: 0.0283 - val_loss: 0.9732
Epoch 3/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6249 - loss: 0.6508 - val_accuracy: 0.0529 - val_loss: 0.9804
Epoch 4/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6341 - loss: 0.6438 - val_accuracy: 0.0364 - val_loss: 0.9537
Epoch 5/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6286 - loss: 0.6450 - val_accuracy: 0.0961 - val_loss: 0.9517
Epoch 6/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6314 - loss: 0.6440 - val_accuracy: 0.1216 - val_loss: 0.9400
Epoch 7/20
[1m721/721[0m 



In [7]:
# Save the trained TensorFlow model
tf_model.save('saved_model.keras')

# Function to load and make predictions
def load_and_predict(input_data):
    model = tf.keras.models.load_model('saved_model')
    prediction = model.predict(input_data)
    return prediction

# Create a minimal Flask web app
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    min_games_played = data.get('min_games_played', 0)  # Default to 0 if not provided
    selected_team = data.get('team', None)
    
    # Filter data based on input
    filtered_df = df[(df['games_played'] >= min_games_played)]
    if selected_team:
        filtered_df = filtered_df[filtered_df['team'] == selected_team]
    
    # Standardize and prepare for prediction
    X_input = scaler.transform(filtered_df[features])
    predictions = load_and_predict(X_input)

    return jsonify({'predictions': predictions.tolist()})

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [11]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib  # For saving and loading the scaler

# Load dataset
df = pd.read_csv('./NHL Datasets/All Skaters 08-25.csv')
df_cup_data = pd.read_csv('./NHL Datasets/Stanley_Cup_Winners.csv')

team_mapping = { 
    "T.B": "TBL",
    "N.J": "NJD",
    "L.A": "LAK",
    "S.J": "SJS" 
}

# Standardize team names in both datasets
df['team'] = df['team'].replace(team_mapping)
df_cup_data['winning_team'] = df_cup_data['winning_team'].replace(team_mapping)

# Columns of interest, including 'season' for splitting purposes
columns_of_interest = [
    'season', 'name', 'team', 'position', 'situation', 'games_played', 'I_F_missedShots', 'I_F_blockedShotAttempts',
    'icetime', 'shifts', 'gameScore', 'onIce_xGoalsPercentage', 'offIce_xGoalsPercentage', 
    'onIce_corsiPercentage', 'I_F_xOnGoal', 'I_F_xGoals', 'I_F_primaryAssists', 
    'I_F_secondaryAssists', 'I_F_shotsOnGoal', 'I_F_shotAttempts', 'I_F_points', 'I_F_goals', 'I_F_savedShotsOnGoal', 
    'penalties', 'I_F_faceOffsWon', 'I_F_hits', 'I_F_takeaways', 'I_F_giveaways', 'I_F_lowDangerShots', 
    'I_F_mediumDangerShots', 'I_F_highDangerShots', 'I_F_lowDangerxGoals', 'I_F_mediumDangerxGoals', 'I_F_highDangerxGoals',
    'I_F_lowDangerGoals', 'I_F_mediumDangerGoals', 'I_F_highDangerGoals', 'I_F_dZoneGiveaways', 
    'I_F_dZoneShiftStarts', 'I_F_neutralZoneShiftStarts', 'faceoffsWon', 'faceoffsLost', 'penaltiesDrawn', 'shotsBlockedByPlayer'
]

# Exclude unwanted columns for evaluation
columns_to_exclude = ['games_played']  
analysis_columns = [col for col in columns_of_interest if col not in columns_to_exclude]

df = df[columns_of_interest]

# Filter for 5v5 situation
df = df[df['situation'] == '5on5']

# Handle missing values
df.fillna(0, inplace=True)

# Feature engineering
df['total_shots'] = df['I_F_shotsOnGoal'] + df['I_F_missedShots'] + df['I_F_blockedShotAttempts']

# List of stats to normalize per game (removing 'I_F_oZoneShiftStarts')
stats_per_game = [
    'I_F_highDangerShots', 'I_F_savedShotsOnGoal', 
    'gameScore', 'onIce_xGoalsPercentage', 'offIce_xGoalsPercentage',
    'onIce_corsiPercentage', 'I_F_xOnGoal', 'I_F_xGoals', 'I_F_primaryAssists', 
    'I_F_secondaryAssists', 'I_F_shotsOnGoal', 'I_F_shotAttempts', 'I_F_points', 
    'I_F_goals', 'I_F_savedShotsOnGoal', 'penalties', 'I_F_faceOffsWon', 'I_F_hits', 
    'I_F_takeaways', 'I_F_giveaways'
]

# Normalize stats per game
for stat in stats_per_game:
    df[stat] = df[stat] / df['games_played']

df['faceoffPercentage'] = df['faceoffsWon'] / df['faceoffsLost']

# Split data into train (<=2023) and test (2024)
df_train = df[df['season'] <= 2023]
df_test = df[df['season'] == 2024]

# Define features and target variable
features = [
    'gameScore', 'onIce_xGoalsPercentage', 'I_F_xGoals', 'I_F_primaryAssists', 'I_F_secondaryAssists',
    'I_F_shotsOnGoal', 'I_F_shotAttempts', 'I_F_goals', 'I_F_points', 'I_F_faceOffsWon', 'I_F_hits',
    'I_F_takeaways', 'I_F_giveaways', 'I_F_highDangerxGoals'
]

df_cup_winners = df_train[df_train['team'].isin(df_cup_data['winning_team'])].copy()  
df_cup_winners['winner'] = 1
df_train = df_train.copy()
df_train['winner'] = 0

# Oversample winners
df_winners_upsampled = df_cup_winners.sample(len(df_train), replace=True)

# Combine with regular teams
df_balanced = pd.concat([df_train, df_winners_upsampled])

X_train = df_balanced[features]
y_train = df_balanced['winner']

X_test = df_test[features]
y_test = df_test.get('winner', pd.Series([0] * len(df_test)))

# Standardize features and save the scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the scaler for future use
joblib.dump(scaler, 'scaler.pkl')

# Define models
models = {
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "TensorFlow": None  
}

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
models["GradientBoosting"] = grid_search.best_estimator_

tf_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

tf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
tf_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

models["TensorFlow"] = tf_model

for name, model in models.items():
    if name != "TensorFlow":
        model.fit(X_train, y_train)

# Save the TensorFlow model for later use
tf_model.save('saved_model.keras')

# Identify weaknesses
def identify_weakest_areas(team_df, league_df, top_n=10):
    numeric_cols = team_df.select_dtypes(include=['number']).columns
    relevant_cols = [col for col in numeric_cols if col in stats_per_game]

    team_avg = team_df[relevant_cols].mean()
    league_avg = league_df[relevant_cols].mean()

    weaknesses = (league_avg - team_avg).nlargest(top_n).index.tolist()
    return weaknesses

# Find trade target
def find_trade_target(df, team_name, weaknesses, models, percentile_threshold=50):
    team_players = df[df['team'] == team_name]
    df = df[(df['team'] != team_name) & (df['season'] == 2024)]

    for weakness in weaknesses:
        threshold_value = df[weakness].quantile(percentile_threshold / 100.0)
        df = df[df[weakness] >= threshold_value]

    if df.empty:
        return None

    df['predicted_fit'] = (models["GradientBoosting"].predict_proba(df[features])[:, 1] + 
                           models["RandomForest"].predict_proba(df[features])[:, 1] + 
                           models["LightGBM"].predict_proba(df[features])[:, 1]) / 3
    return df.nlargest(3, 'predicted_fit')

selected_team = "TOR"
weakest_stats = identify_weakest_areas(df[df['team'] == selected_team], df, top_n=5)
trade_recommendation = find_trade_target(df, selected_team, weakest_stats, models, percentile_threshold=50)

if trade_recommendation is not None:
    print(f"Weakest areas for {selected_team}: {weakest_stats}")
    print("Recommended trade targets:")
    print(trade_recommendation)
else:
    print("No suitable trade candidates found.")

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6135 - loss: 0.6632 - val_accuracy: 0.0196 - val_loss: 0.9942
Epoch 2/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6273 - loss: 0.6521 - val_accuracy: 0.0559 - val_loss: 0.9786
Epoch 3/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6285 - loss: 0.6472 - val_accuracy: 0.0175 - val_loss: 0.9682
Epoch 4/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6321 - loss: 0.6433 - val_accuracy: 0.0219 - val_loss: 0.9791
Epoch 5/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6304 - loss: 0.6430 - val_accuracy: 0.0810 - val_loss: 0.9541
Epoch 6/20
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6300 - loss: 0.6422 - val_accuracy: 0.0250 - val_loss: 0.9964
Epoch 7/20
[1m721/721[0m [32m━━━━━━━

