In [13]:
#imports 
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [14]:
import pandas as pd

# load datasets
games_data = pd.read_csv("games.csv")
teams_data = pd.read_csv("team.csv")

# Define columns to be selected
selected_columns_games = ['date', 'home_team_score', 'visitor_team_score', 'home_team_id', 'visitor_team_id']

# Merge datasets
merged_data = pd.merge(games_data[selected_columns_games], teams_data, how='inner', left_on='home_team_id', right_on='team_id')

# Ensure 'date' column is in datetime format and sort by date and keep relevant data(last 3-4 seasons)
merged_data['date'] = pd.to_datetime(merged_data['date']).dt.tz_localize(None)

# View the final dataframe
merged_data

Unnamed: 0,date,home_team_score,visitor_team_score,home_team_id,visitor_team_id,id,first_name,height_feet,height_inches,last_name,position,team,weight_pounds,team_id
0,2019-01-30,126,94,2,4,47,Jabari,,,Bird,G,BOS,,2
1,2019-01-30,126,94,2,4,494,Michael,,,Smith,,BOS,,2
2,2019-01-30,126,94,2,4,47,Jabari,,,Bird,G,BOS,,2
3,2019-01-30,126,94,2,4,494,Michael,,,Smith,,BOS,,2
4,2019-01-30,126,94,2,4,669,John,,,Bagley,,BOS,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12555699,2024-03-31,0,0,28,23,56677805,Javon,,,Freeman-Liberty,G,TOR,,28
12555700,2024-03-31,0,0,28,23,38017717,Ron,,,Harper Jr.,F,TOR,,28
12555701,2024-03-31,0,0,28,23,373,Jakob,7.0,0.0,Poeltl,C,TOR,230.0,28
12555702,2024-03-31,0,0,28,23,416,Pascal,6.0,9.0,Siakam,F,TOR,230.0,28


In [None]:
def calculate_date_features(merged_data):
    # Ensure 'date' column is in datetime format and sort by date
    merged_data = merged_data.sort_values(by='date')
    
    # Filter out the games prior to 2020-01-01 (or any other start date you prefer)
    start_date = pd.to_datetime("2020-01-01")
    merged_data = merged_data[merged_data['date'] >= start_date]

    # Add a season column to indicate the season year
    merged_data['season'] = merged_data['date'].dt.year

    # Outcome column where 1 = home win, 0 = visitor win
    merged_data['outcome'] = (merged_data['home_team_score'] > merged_data['visitor_team_score']).astype(int)

    # Initialize columns for running win totals, games played, and win percentages
    merged_data['home_running_wins'] = 0
    merged_data['visitor_running_wins'] = 0
    merged_data['home_running_games'] = 0
    merged_data['visitor_running_games'] = 0
    merged_data['home_seasonal_win_percentage'] = 0
    merged_data['visitor_seasonal_win_percentage'] = 0

    # Calculate running win totals for home and visitor teams
    merged_data['home_running_wins'] = merged_data.groupby('home_team_id')['outcome'].cumsum()
    merged_data['visitor_running_wins'] = merged_data.groupby('visitor_team_id')['outcome'].cumsum()

    # Calculate cumulative count of games played for home and visitor teams
    merged_data['home_running_games'] = merged_data.groupby('home_team_id').cumcount() + 1
    merged_data['visitor_running_games'] = merged_data.groupby('visitor_team_id').cumcount() + 1

    # Calculate seasonal win percentage for home and visitor teams
    merged_data['home_seasonal_win_percentage'] = merged_data['home_running_wins'] / merged_data['home_running_games']
    merged_data['visitor_seasonal_win_percentage'] = merged_data['visitor_running_wins'] / merged_data['visitor_running_games']

    # Add playoff hunt status: 1 if win percentage >= 50%, else 0
    merged_data['home_in_playoff_hunt'] = (merged_data['home_seasonal_win_percentage'] >= 0.500).astype(int)
    merged_data['visitor_in_playoff_hunt'] = (merged_data['visitor_seasonal_win_percentage'] >= 0.500).astype(int)

    # Optionally drop columns you no longer need
    merged_data = merged_data.drop(['home_running_wins', 'visitor_running_wins', 'home_running_games', 'visitor_running_games'], axis=1)
    merged_data['date'] = pd.to_datetime(merged_data['date'])
    season_end_date = pd.to_datetime('2025-04-15')

    # 1. Binary classification for Playoff vs Regular Season (Playoffs start in April)
    merged_data['is_playoff'] = merged_data['date'].dt.month.apply(lambda x: 1 if x >= 4 else 0)

    # 2. Calculate days remaining in the season
    merged_data['days_remaining'] = (season_end_date - merged_data['date']).dt.days

    # 3. Weights based on how close the game is to the end of the regular season
    merged_data['late_season_weight'] = np.maximum(0, 1 - (merged_data['days_remaining'] / 365))  # Normalize by 365 days

    # 4. Example: Assume a team is in the playoff hunt if their win percentage is >= 0.500
    merged_data['team_win_percentage'] = merged_data['team_wins'] / merged_data['team_games_played']
    merged_data['in_playoff_hunt'] = merged_data['team_win_percentage'].apply(lambda x: 1 if x >= 0.500 else 0)

    # 5. Combining the late-season and playoff hunt feature
    merged_data['late_playoff_weight'] = merged_data['late_season_weight'] * merged_data['in_playoff_hunt']

    return merged_data

Predictor for player statistics and team statistics

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
# TODO: test the class .95.

# Define the Dataset class
class PlayerDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.tensor(features.values, dtype=torch.float32)
        self.targets = torch.tensor(targets.values, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

# Define the Model class
class PlayerStatsPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PlayerStatsPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Helper function to prepare the data for model training
def prepare_data(player_data, merged_data, selected_features, target_statistic):
    # Merge player stats with team-based features (e.g., from merged_data)
    merged_features = pd.merge(player_data, merged_data, left_on='team_id', right_on='home_team_id', how='inner')
    
    # Split the features and target
    X = merged_features[selected_features]
    y = merged_features[target_statistic]
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test

Training Baseline model with features

In [None]:


# Create an instance of the model and initialize parameters
# TODO: Implement training model, make sure player data contains neccessary features, make sure merged_df has necessary date features, and then create the model, figure out target stat(win outcome percentage accuracy)
input_size = len(selected_features) + len(merged_data.columns)  # player stats + team/date features
hidden_size = 64
output_size = 1
batch_size = 64
num_epochs = 10
learning_rate = 0.001

# Assuming 'player_data' contains the player stats and 'merged_data' has the features
X_train, X_test, y_train, y_test = prepare_data(player_data, merged_data, selected_features, target_statistic)

# Create datasets and dataloaders
train_dataset = PlayerDataset(X_train, y_train)
test_dataset = PlayerDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model, loss function, and optimizer
model = PlayerStatsPredictor(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    average_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')

KeyError: 'team_wins'

Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    all_predictions = []
    all_targets = []
    for inputs, targets in test_loader:
        predictions = model(inputs)
        all_predictions.extend(predictions.numpy())
        all_targets.extend(targets.numpy())

# Calculate Mean Squared Error on the test set
mse = mean_squared_error(all_targets, all_predictions)
print(f'Mean Squared Error on Test Set: {mse:.4f}')


Test player prediction on single player Ex. Lebron James

In [None]:
# Function to predict player statistics for a given player
def predict_player_stat(player_name, player_data, model, selected_features):
    player_names = player_name.split()
    player_features = player_data[(player_data['player.first_name'] == player_names[0]) & 
                                  (player_data['player.last_name'] == player_names[1])][selected_features].values    
    player_features_tensor = torch.tensor(player_features, dtype=torch.float32)
    
    model.eval()
    with torch.no_grad():
        predicted_stat = model(player_features_tensor)
    
    predicted_stat = predicted_stat.numpy()
    return predicted_stat

# Example: Predicting player statistics for a given player
player_name = 'LeBron James'  # Adjust this with the player's name you want to predict for
predicted_stats = predict_player_stat(player_name, player_data, model, selected_features)
print(f'Predicted Stats for {player_name}: {predicted_stats[0][0]:.2f}')
