In [2]:
# Import necessary libraries
import pandas as pd 
import numpy as np 
from datetime import date 

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold

## Data Preprocessing

In [None]:
# Prep the initial dataset before splitting it by game type
# Read csv file of historic game statistics; the raw frame is kept untouched
# in team_statistics_original and all cleaning happens on a copy
team_statistics_original = pd.read_csv('TeamStatistics.csv')
team_statistics = team_statistics_original.copy()

# Convert the gameDate attribute into plain date objects (time of day is discarded)
team_statistics['gameDate'] = pd.to_datetime(team_statistics['gameDate']).dt.date

# Remove the attributes that are not used further on.
# NOTE(review): presumably these were only recorded for recent games and are
# too sparse to train a model on - confirm against the raw data
team_statistics = team_statistics.drop(columns=['q1Points', 'q2Points', 'q3Points', 'q4Points',
                                                'benchPoints', 'biggestLead', 'biggestScoringRun',
                                                'leadChanges', 'pointsFastBreak', 'pointsFromTurnovers',
                                                'pointsInThePaint', 'pointsSecondChance', 'timesTied',
                                                'timeoutsRemaining', 'coachId', 'opponentTeamCity',
                                                'teamCity', 'foulsPersonal'])

# Ensure the score attributes are numeric; values that cannot be parsed become NaN
team_statistics[['teamScore', 'opponentScore']] = team_statistics[['teamScore', 'opponentScore']].apply(pd.to_numeric, errors = 'coerce')

# Recover missing plusMinusPoints from the scoring difference between the team
# and its opponent. This has to run BEFORE the mean imputation below: once the
# mean has been filled in there are no NaN values left to replace, and the real
# margin would never be used.
team_statistics['plusMinusPoints'] = team_statistics['plusMinusPoints'].fillna(
    team_statistics['teamScore'] - team_statistics['opponentScore']
)

# Fill up the remaining NaN values with the column average.
# These attributes dont have many NaN values, so this wont skew the data.
# plusMinusPoints stays in the list as a fallback for rows whose scores are
# missing as well.
columns_to_fill = ['assists', 'blocks', 'steals',
                   'fieldGoalsAttempted', 'fieldGoalsMade', 'fieldGoalsPercentage',
                   'threePointersAttempted', 'threePointersMade', 'threePointersPercentage',
                   'freeThrowsAttempted', 'freeThrowsMade', 'freeThrowsPercentage',
                   'reboundsDefensive', 'reboundsOffensive', 'reboundsTotal',
                   'turnovers', 'plusMinusPoints', 'numMinutes']

for col in columns_to_fill:
    team_statistics[col] = team_statistics[col].fillna(team_statistics[col].mean())

# Make sure gameId is a string so it can be sliced by character position
team_statistics['gameId'] = team_statistics['gameId'].astype(str)

# Extract season code: 2nd and 3rd characters from gameId
team_statistics['season'] = team_statistics['gameId'].str[1:3].astype(int)


# Split the cleaned data into 2 datasets; the game type is read from the
# first character of gameId.
# Dataset 1: regular season and postseason games (gameId starts with '2', '3' or '4'),
# ordered chronologically within each team's season
game_set = (
    team_statistics
    .loc[team_statistics['gameId'].str.startswith(('2', '3', '4'))]
    .sort_values(by = ['season', 'teamId', 'gameDate'])
)

# Dataset 2: every other game, such as preseason or all-star
# (gameId does not start with '2', '3' or '4')
other_set = team_statistics.loc[~team_statistics['gameId'].str.startswith(('2', '3', '4'))]

# Calculate cumulative wins before each game.
# The shift happens inside each (season, teamId) group, so a team's first game
# of a season starts at 0 wins instead of inheriting the running total of
# whichever group sits directly above it in the frame.
game_set['seasonWins'] = (
    game_set.groupby(['season', 'teamId'])['win']
    .transform(lambda wins: wins.cumsum().shift(1))
    .fillna(0)
    .astype(int)
)

# Number of games the team has already played in that season (0 for its first game)
games_before = game_set.groupby(['season', 'teamId']).cumcount()

# Calculate losses as (games played - wins) up to that game
game_set['seasonLosses'] = games_before - game_set['seasonWins']

game_set['seasonGames'] = games_before


# Running averages of each team's earlier games in the same season, i.e. the
# form a team carries into a game. The current game is excluded via shift(),
# so the first game of every (season, teamId) group has no average (NaN).

# Counting stats: averages are rounded to 1 decimal place
stats_to_roll_1 = ['teamScore', 'opponentScore', 'assists', 'blocks', 'steals',
          'freeThrowsAttempted', 'reboundsTotal', 'turnovers', 'plusMinusPoints',
          'fieldGoalsAttempted', 'threePointersAttempted']

# Percentage stats: averages are rounded to 3 decimal places
stats_to_roll_3 = ['fieldGoalsPercentage', 'threePointersPercentage', 'freeThrowsPercentage']

for stat_names, decimals in ((stats_to_roll_1, 1), (stats_to_roll_3, 3)):
    avg_names = ["avg_" + name for name in stat_names]
    prior_means = (
        game_set
        .groupby(['season', 'teamId'])[stat_names]
        .transform(lambda values: values.shift().expanding().mean())
    )
    game_set[avg_names] = prior_means.round(decimals)


# Safety net: seasonLosses is seasonGames - seasonWins, so a negative value
# means the win tally exceeds the games played. Reset both counters to 0 on
# those rows.
game_set.loc[game_set['seasonLosses'].lt(0), ['seasonLosses', 'seasonWins']] = 0

# Discard rows without a prior-game scoring average (avg_teamScore is NaN when
# the team has no earlier game in that season to average over)
game_set = game_set.dropna(axis = 'index', subset = ['avg_teamScore'])


# Display the prepared dataset; no re-sorting is performed here
game_set
