In [None]:
import kagglehub

# Fetch the latest version of the dataset and keep the value the call
# returns (printed below as the location of the dataset files)
path = kagglehub.dataset_download("wyattowalsh/basketball")

print(f"Path to dataset files: {path}")

In [None]:
import os
import pandas as pd


csv_folder = os.path.join(path, "csv")

# List all CSV files in the folder.
# os.listdir returns entries in arbitrary order, so sort the names to make the
# dictionary order and the printed summary identical on every run and machine.
csv_files = sorted(f for f in os.listdir(csv_folder) if f.endswith('.csv'))

# Read each CSV file into a dictionary with filenames as keys
csv_data = {file: pd.read_csv(os.path.join(csv_folder, file)) for file in csv_files}

# Print the number of features and column names for each CSV file
for name, df in csv_data.items():
    print(f"{name} has {df.shape[1]} features:")
    print(df.columns.tolist())
    print("-" * 60)


In [None]:
# Extract game.csv and derive the home-win label.
# Work on a copy so the raw frame stored in csv_data is left untouched and
# this cell starts from the same input every time it is re-run.
game_df = csv_data['game.csv'].copy()

# Everything below needs both score columns, so stop here with a clear
# message instead of continuing and failing later on the missing 'result'.
missing_score_columns = [col for col in ('pts_home', 'pts_away') if col not in game_df.columns]
if missing_score_columns:
    raise KeyError(
        "The required columns 'pts_home' and 'pts_away' are not present in the dataset "
        f"(missing: {missing_score_columns})."
    )

# result = 1 when the home team scored more points than the away team, else 0.
# NOTE(review): a comparison involving NaN evaluates to False, so a game with
# a missing score is labelled 0 rather than NaN -- confirm no such rows exist.
game_df['result'] = (game_df['pts_home'] > game_df['pts_away']).astype(int)
print(game_df[['pts_home', 'pts_away', 'result']].head())

# Parse the dates so the .dt accessor can be used for the year filter
game_df['game_date'] = pd.to_datetime(game_df['game_date'])

# Keep games from 1999 onwards; take an explicit copy so the subset is an
# independent frame rather than a slice of game_df.
game_df_1999_onwards = game_df[game_df['game_date'].dt.year >= 1999].copy()
print(game_df_1999_onwards[['game_date', 'pts_home', 'pts_away', 'result']].head())


In [None]:
# Discard games whose 'result' label is missing
game_df_1999_onwards = game_df_1999_onwards.dropna(subset=['result'])

# Target series: the 'result' column of the remaining games
y = game_df_1999_onwards['result']

# Preview the first rows of the target as the cell output
y.head()


In [None]:
# Pull the play_by_play.csv table out of the dictionary of loaded CSV files
play_by_play_df = csv_data['play_by_play.csv']

# Display the first few rows as the cell output
play_by_play_df.head()



In [None]:
# Source columns used for win/loss statistics, mapped to their display names
win_loss_columns = {
    'game_id': 'Game ID',
    'game_date': 'Game Date',
    'team_abbreviation_home': 'Home Team',
    'team_abbreviation_away': 'Away Team',
    'result': 'Home Win',
}

# Select those columns (in the order listed) and rename them in one step
new_table = game_df_1999_onwards[list(win_loss_columns)].rename(columns=win_loss_columns)

# Show the first few rows
print(new_table.head())

In [None]:
from datetime import timedelta

# Define a function to calculate win rate for the last N games
def calculate_recent_win_rate(df, team_col, result_col, date_col, team, current_date, n):
    """Average ``result_col`` over a team's ``n`` most recent earlier games.

    Rows are kept when ``team_col`` equals ``team`` and ``date_col`` is
    strictly before ``current_date``; the ``n`` latest of those rows are
    averaged.

    Returns:
        The mean of ``result_col`` over the selected rows, or 0 when no row
        qualifies.
    """
    is_team = df[team_col] == team
    is_earlier = df[date_col] < current_date
    history = df[is_team & is_earlier]
    # Newest first, so head(n) keeps the n latest games
    latest_games = history.sort_values(by=date_col, ascending=False).head(n)
    return 0 if latest_games.empty else latest_games[result_col].mean()

# Define a function to calculate win rate for the last N months
def calculate_recent_months_win_rate(df, team_col, result_col, date_col, team, current_date, months):
    """Average ``result_col`` over a team's games in the preceding months.

    A month is approximated as 30 days, so the window starts
    ``30 * months`` days before ``current_date`` (start inclusive) and ends
    just before ``current_date`` (end exclusive).

    Returns:
        The mean of ``result_col`` over the team's rows inside the window,
        or 0 when the window contains none.
    """
    window_start = current_date - timedelta(days=30 * months)
    dates = df[date_col]
    in_window = (df[team_col] == team) & (dates < current_date) & (dates >= window_start)
    window_games = df[in_window]
    if not window_games.empty:
        return window_games[result_col].mean()
    return 0


In [None]:

# Create a new table to store win rates

# 'result' is 1 when the HOME team won. Averaging it over a visiting team's
# games therefore gives that team's loss rate, so add the complementary flag
# and use it for every away-team lookup.
games_with_away_result = game_df_1999_onwards.assign(
    away_result=lambda games: 1 - games['result']
)

# NOTE(review): each lookup only sees the team's earlier games at the same
# venue (home games for the home team, away games for the away team) --
# confirm that venue-specific form is what these features should measure.
# The helpers return 0 when a team has no earlier games in the window.
new_table_with_win_rates = []

for _, row in games_with_away_result.iterrows():
    game_id = row['game_id']
    game_date = row['game_date']
    home_team = row['team_abbreviation_home']
    away_team = row['team_abbreviation_away']

    # Calculate recent 10-game win rate for home and away teams
    home_recent_10_win_rate = calculate_recent_win_rate(
        games_with_away_result, 'team_abbreviation_home', 'result', 'game_date', home_team, game_date, 10
    )
    away_recent_10_win_rate = calculate_recent_win_rate(
        games_with_away_result, 'team_abbreviation_away', 'away_result', 'game_date', away_team, game_date, 10
    )

    # Calculate recent 3-month win rate for home and away teams
    home_recent_3_months_win_rate = calculate_recent_months_win_rate(
        games_with_away_result, 'team_abbreviation_home', 'result', 'game_date', home_team, game_date, 3
    )
    away_recent_3_months_win_rate = calculate_recent_months_win_rate(
        games_with_away_result, 'team_abbreviation_away', 'away_result', 'game_date', away_team, game_date, 3
    )

    # Add data to the new table
    new_table_with_win_rates.append({
        'Game ID': game_id,
        'Game Date': game_date,
        'Home Team Recent 10 Win Rate': home_recent_10_win_rate,
        'Away Team Recent 10 Win Rate': away_recent_10_win_rate,
        'Home Team Recent 3 Months Win Rate': home_recent_3_months_win_rate,
        'Away Team Recent 3 Months Win Rate': away_recent_3_months_win_rate
    })

# Convert to DataFrame
new_table_with_win_rates_df = pd.DataFrame(new_table_with_win_rates)

# Display the first 30 rows
new_table_with_win_rates_df.head(30)

In [None]:
def days_since_last_game(df, team_col, date_col, team, current_date):
    """
    Return the whole number of days between ``current_date`` and the latest
    earlier row for ``team``.

    Only rows where ``team_col`` equals ``team`` and ``date_col`` is strictly
    before ``current_date`` are considered. Returns None when there is no
    such row.
    """
    is_earlier_team_game = (df[team_col] == team) & (df[date_col] < current_date)
    # Newest first, so position 0 holds the most recent earlier date
    earlier_dates = df.loc[is_earlier_team_game, date_col].sort_values(ascending=False)
    if earlier_dates.empty:
        return None
    most_recent = earlier_dates.iloc[0]
    elapsed = current_date - most_recent
    return elapsed.days


def number_of_games_in_last_7_days(df, team_col, date_col, team, current_date):
    """
    Count the rows for ``team`` dated within the 7 days before ``current_date``.

    The window includes the date exactly 7 days earlier and excludes
    ``current_date`` itself.
    """
    window_start = current_date - pd.Timedelta(days=7)
    dates = df[date_col]
    in_window = (df[team_col] == team) & (dates < current_date) & (dates >= window_start)
    # Each True in the mask is one matching row
    return int(in_window.sum())


def is_back_to_back(df, team_col, date_col, team, current_date):
    """
    Report whether ``team``'s latest earlier row is dated exactly one day
    before ``current_date``.

    Delegates to ``days_since_last_game``; when that returns None (no
    earlier row) the comparison below is False.
    """
    rest_days = days_since_last_game(df, team_col, date_col, team, current_date)
    return rest_days == 1

In [None]:
# Create a new table with game statistics

# Rest and schedule density depend on every game a team played, wherever it
# was played. Build one row per team per game (home and away appearances
# stacked) so the helpers below search the team's full schedule instead of
# only its earlier games at the same venue.
team_schedule = pd.concat(
    [
        game_df_1999_onwards[['game_date', side_col]].rename(columns={side_col: 'team'})
        for side_col in ('team_abbreviation_home', 'team_abbreviation_away')
    ],
    ignore_index=True,
)

new_table_with_game_stats = []

for _, row in game_df_1999_onwards.iterrows():
    game_id = row['game_id']
    game_date = row['game_date']
    home_team = row['team_abbreviation_home']
    away_team = row['team_abbreviation_away']

    # Calculate home team stats
    home_days_since_last_game = days_since_last_game(team_schedule, 'team', 'game_date', home_team, game_date)
    home_games_last_7_days = number_of_games_in_last_7_days(team_schedule, 'team', 'game_date', home_team, game_date)
    # Same test is_back_to_back applies (days since last game == 1), reusing
    # the value computed above instead of scanning the schedule again.
    # None (no earlier game) compares unequal to 1 and gives False.
    home_is_back_to_back = home_days_since_last_game == 1

    # Calculate away team stats
    away_days_since_last_game = days_since_last_game(team_schedule, 'team', 'game_date', away_team, game_date)
    away_games_last_7_days = number_of_games_in_last_7_days(team_schedule, 'team', 'game_date', away_team, game_date)
    away_is_back_to_back = away_days_since_last_game == 1

    # Add stats to the new table
    new_table_with_game_stats.append({
        'Game ID': game_id,
        'Game Date': game_date,
        'Home Days Since Last Game': home_days_since_last_game,
        'Home Games Last 7 Days': home_games_last_7_days,
        'Home Is Back-to-Back': home_is_back_to_back,
        'Away Days Since Last Game': away_days_since_last_game,
        'Away Games Last 7 Days': away_games_last_7_days,
        'Away Is Back-to-Back': away_is_back_to_back
    })

# Convert to DataFrame
new_table_with_game_stats_df = pd.DataFrame(new_table_with_game_stats)

# Display the first few rows
new_table_with_game_stats_df.head()

In [None]:
# Keep only the rows whose 'Game Date' falls in the year 2000 or later
filtered_data_2000_onwards = new_table_with_game_stats_df.loc[
    lambda stats: stats['Game Date'].dt.year >= 2000
]

# Write the filtered rows to a CSV file, leaving out the index column
filtered_data_2000_onwards.to_csv('game_stats_2000_onwards.csv', index=False)