In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import random

In [3]:
# Load dataset
data = pd.read_csv('train_data.csv')

# Impute missing float features with the per-team mean of the same column.
# This has to happen BEFORE the blanket zero-fill below; otherwise there is
# nothing left to impute. Columns prefixed "home" are averaged per home team,
# columns prefixed "away" per away team; other columns are left alone here.
for column in data.select_dtypes(include=[np.floating]).columns:
    if column == 'home_team_win':
        # Never fabricate the target label from team averages.
        continue
    if column.startswith('home'):
        team_key = 'home_team_abbr'
    elif column.startswith('away'):
        team_key = 'away_team_abbr'
    else:
        continue
    team_means = data.groupby(team_key)[column].transform('mean')
    # fillna returns a new Series, so the result must be assigned back.
    data[column] = data[column].fillna(team_means)

# Anything still missing (non-team columns, or teams whose values are all
# missing for a column) falls back to 0.
data = data.fillna(0)
data['is_night_game'] = data['is_night_game'].astype(int)
data['home_team_win'] = data['home_team_win'].astype(int)

In [None]:
# Overall win rate per team across every game in `data` (home and away
# appearances combined), attached to each row as home/away features.
#
# NOTE(review): the rates are computed from `home_team_win` over the full
# frame, including the row each rate is later attached to, so these features
# contain information about the label. Confirm this is acceptable before
# using them for model evaluation.

# Games played: one appearance per row for the home team and one for the away team.
games_played = pd.concat(
    [data['home_team_abbr'], data['away_team_abbr']], ignore_index=True
).value_counts()

# Winner of each game: the home team when home_team_win == 1, otherwise the away team.
home_won = data['home_team_win'] == 1
winners = data['home_team_abbr'].where(home_won, data['away_team_abbr'])
wins_by_team = winners.value_counts().to_dict()

# Dictionary to track team performance, built from the teams present in the
# data so the cell does not depend on a `teams` variable defined elsewhere.
team_performance = {
    team: {'games_played': int(played), 'games_won': int(wins_by_team.get(team, 0))}
    for team, played in games_played.items()
}

# Calculate winning rate for each team (every team listed has played >= 1 game,
# the guard only protects against an empty entry).
team_winning_rate = {
    team: (performance['games_won'] / performance['games_played']) if performance['games_played'] > 0 else 0
    for team, performance in team_performance.items()
}

# Print the winning rate for each team
for team, winning_rate in sorted(team_winning_rate.items(), key=lambda item: str(item[0])):
    print(f"Team: {team}, Winning Rate: {winning_rate:.2f}")

# Vectorised lookup instead of a row-by-row .loc assignment.
data['home_win_rate'] = data['home_team_abbr'].map(team_winning_rate).astype(float)
data['away_win_rate'] = data['away_team_abbr'].map(team_winning_rate).astype(float)


In [8]:
# Head-to-head feature: the home team's win rate against this away team
# within the same season.
#
# NOTE(review): each group's rate includes the outcome of the row it is merged
# back onto, so this feature contains information about the label (a matchup
# played once gets exactly 0.0 or 1.0). Confirm before using it for evaluation.
matchup_keys = ['home_team_abbr', 'away_team_abbr', 'season']

# Total games and home wins per (home team, away team, season).
matchup_stats = (
    data.groupby(matchup_keys)
    .agg(total_games=('home_team_win', 'size'), total_home_wins=('home_team_win', 'sum'))
    .reset_index()
)

# Add the win rate column
matchup_stats['home_team_win_rate'] = matchup_stats['total_home_wins'] / matchup_stats['total_games']

# Merge the win rate back into the original DataFrame. Dropping any column
# left over from a previous run keeps the cell idempotent: without it, a
# re-run yields home_team_win_rate_x / _y and the print below raises KeyError.
data = data.drop(columns='home_team_win_rate', errors='ignore').merge(
    matchup_stats[matchup_keys + ['home_team_win_rate']],
    on=matchup_keys,
    how='left',
)
print(data['home_team_win_rate'])

0        1.000000
1        0.333333
2        1.000000
3        0.500000
4        0.500000
           ...   
11062    1.000000
11063    0.333333
11064    0.666667
11065    0.142857
11066    0.666667
Name: home_team_win_rate, Length: 11067, dtype: float64


In [None]:
# Shuffle the rows reproducibly. DataFrame.sample returns a new frame and
# leaves the original untouched, so the result must be assigned back for the
# shuffle to take effect in later cells.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()