## Fetch Gamelogs

In [33]:
# Import packages
import logging
import random
import time
from io import BytesIO

import numpy as np # type: ignore
import pandas as pd # type: ignore
import requests # type: ignore
from bs4 import BeautifulSoup # type: ignore
from tqdm import tqdm # type: ignore

In [39]:
# Seasons to scrape, labelled by the year in which the season ends
# (2000 through 2023 inclusive). For a quick trial run, narrow the range,
# e.g. range(2020, 2024).
seasons = [*range(2000, 2024)]

# Box-score columns taken from the basketball-reference game logs
stats = 'FG FGA FG% 3P 3PA 3P% FT FTA FT% ORB TRB AST STL BLK TOV PF'.split()

# The 30 current teams, using the abbreviations that appear in
# basketball-reference team URLs
teams = [
    'ATL', 'BOS', 'BRK', 'CHO', 'CHI', 'CLE',
    'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
    'LAL', 'LAC', 'MEM', 'MIA', 'MIL', 'MIN',
    'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO',
    'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]

# Guard against an accidentally dropped or duplicated entry
assert len(teams) == 30

In [35]:
# Column-rename maps for the scraped game-log table.
# The team's own stats get a 'Tm_' prefix. The opponent's copy of each stat
# arrives with a '.1' suffix (presumably pandas de-duplicating the repeated
# header names -- confirm against the raw table) and gets an 'Opp_' prefix.
tm_stats_dict = {name: f'Tm_{name}' for name in stats}
opp_stats_dict = {f'{name}.1': f'Opp_{name}' for name in stats}

# Accumulator for every team/season game log
nba_df = pd.DataFrame()
# Intended to track which team/season frames were parsed; it is not used in
# the cells visible here
years_parsed = pd.DataFrame()

# Number of (team, season) pages the scrape will attempt
print('total combinations:', len(teams) * len(seasons))

In [41]:
# Set logging for the file parsing
logging.basicConfig(filename = 'parsing.log',
                    level = logging.INFO)


def _clean_gamelog(raw_df, team, season):
    """Convert one raw game-log table into tidy per-game rows.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Table parsed from a team's game-log page (the 'tgl_basic' table).
    team : str
        Team abbreviation used in the URL; stored upper-cased in 'Team'.
    season : int
        Season identifier used in the URL; stored in 'Season'.

    Returns
    -------
    pandas.DataFrame
        A new frame with non-game rows removed, helper columns dropped,
        columns renamed via tm_stats_dict / opp_stats_dict, 'Home' encoded
        as 0 (away, marked '@' in the source) or 1, and 'Season' / 'Team'
        inserted as the first two columns.
    """
    # Keep only rows whose rank is a number. Casting to str first makes the
    # test safe for NaN and for columns that were parsed as integers.
    # (The previous `team_df['Rk'].str != ''` compared the accessor object
    # itself to '' and was therefore always True.)
    is_game_row = raw_df['Rk'].astype(str).str.isnumeric()

    # .drop() returns a new frame, so the assignments below never write into
    # a view of raw_df (no SettingWithCopyWarning).
    games = raw_df.loc[is_game_row].drop(columns=['Rk', 'Unnamed: 24'])

    # Rename few columns
    games = games.rename(columns = {'Unnamed: 3': 'Home',
                                    'Tm': 'Tm_Pts',
                                    'Opp.1': 'Opp_Pts'})
    games = games.rename(columns = tm_stats_dict)
    games = games.rename(columns = opp_stats_dict)

    # '@' marks an away game; everything else counts as a home game
    games['Home'] = (games['Home'] != '@').astype(int)

    # Insert 'Season' and 'Team' into the dataset
    games.insert(loc = 0, column = 'Season', value = season)
    games.insert(loc = 1, column = 'Team', value = team.upper())
    return games


# Fetch gamelogs for the teams.
# Frames are collected in a list and concatenated once at the end; this avoids
# the quadratic cost of concat-in-a-loop and makes the cell idempotent
# (re-running it rebuilds nba_df instead of appending duplicates).
season_frames = []
for team in tqdm(teams, desc = 'teams parsed'):
    for season in tqdm(seasons, desc = team):
        url = 'https://www.basketball-reference.com/teams/' + team + '/' + str(season) + '/gamelog/'

        # One GET per page (the old HEAD + read_html(url) pair hit the site
        # twice). Redirects are not followed, matching requests.head's
        # default, so only pages that answer 200 directly are accepted.
        # The timeout stops a stalled connection from hanging the scrape.
        try:
            response = requests.get(url, allow_redirects = False, timeout = 30)
        except requests.RequestException as err:
            # A transient network failure should not discard the data
            # collected so far: log it and treat the page as not added.
            logging.warning('request failed for {} at {} | {} | {}'.format(team, season, url, err))
            response = None

        # If url is valid
        if response is not None and response.status_code == 200:
            logging.info('{} at {} archived | {}'.format(team, season, url))
            # Parse the bytes already downloaded instead of re-fetching the URL
            raw_df = pd.read_html(BytesIO(response.content), header = 1, attrs = {'id': 'tgl_basic'})[0]
            season_frames.append(_clean_gamelog(raw_df, team, season))
        else:
            logging.info('NOT added {} at {} | {}'.format(team, season, url))

        # Sleep time between requests to abide by basketball-ref rules
        time.sleep(random.randint(4, 6))

# pd.concat raises on an empty list, so fall back to an empty frame
nba_df = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()

# nba_df print
print(nba_df.head())

ATL: 100%|██████████| 24/24 [02:19<00:00,  5.81s/it]
BOS: 100%|██████████| 24/24 [02:29<00:00,  6.21s/it]139.34s/it]
BRK: 100%|██████████| 24/24 [02:19<00:00,  5.82s/it]145.09s/it]
CHO: 100%|██████████| 24/24 [02:24<00:00,  6.03s/it]142.60s/it]
CHI: 100%|██████████| 24/24 [02:21<00:00,  5.88s/it]143.43s/it]
CLE: 100%|██████████| 24/24 [02:25<00:00,  6.05s/it]2.63s/it]  
DAL: 100%|██████████| 24/24 [02:28<00:00,  6.20s/it]3.52s/it]
DEN: 100%|██████████| 24/24 [02:19<00:00,  5.80s/it]5.26s/it]
DET: 100%|██████████| 24/24 [02:21<00:00,  5.91s/it]3.30s/it]
GSW: 100%|██████████| 24/24 [02:26<00:00,  6.12s/it]2.88s/it]
HOU: 100%|██████████| 24/24 [02:16<00:00,  5.69s/it]44.08s/it]
IND: 100%|██████████| 24/24 [02:18<00:00,  5.76s/it]41.79s/it]
LAL: 100%|██████████| 24/24 [02:25<00:00,  6.07s/it]40.70s/it]
LAC: 100%|██████████| 24/24 [02:23<00:00,  5.96s/it]42.21s/it]
MEM: 100%|██████████| 24/24 [02:26<00:00,  6.10s/it]42.48s/it]
MIA: 100%|██████████| 24/24 [02:31<00:00,  6.32s/it]43.65s/it]
M

   Season Team  G        Date  Home  Opp W/L Tm_Pts Opp_Pts Tm_FG  ... Opp_FT  \
0    2000  ATL  1  1999-11-02     0  WAS   L     87      94    31  ...     13   
1    2000  ATL  2  1999-11-04     1  MIL   L    109     119    41  ...     33   
2    2000  ATL  3  1999-11-06     1  CHI   W    113      97    44  ...     24   
3    2000  ATL  4  1999-11-08     0  DEN   L    100     115    39  ...     19   
4    2000  ATL  5  1999-11-10     0  VAN   L     97     102    39  ...     13   

  Opp_FTA Opp_FT% Opp_ORB Opp_TRB Opp_AST Opp_STL Opp_BLK Opp_TOV Opp_PF  
0      16    .813      12      42      23       5       5      15     30  
1      36    .917      12      38      24      15       6      11     25  
2      35    .686      17      39      14       6       6      14     26  
3      24    .792      22      49      28       6      15       7     23  
4      16    .813      15      49      27       9      10      18     24  

[5 rows x 41 columns]





In [60]:
# Count missing values once, report the total, and refuse to persist a
# dataset that contains any
null_count = nba_df.isnull().sum().sum()
print('NULLs in the dataset ', null_count)
assert null_count == 0

# Persist the scraped data in two formats: CSV, written without the index
# column, and a pickle of the DataFrame
nba_df.to_csv('pkl/NBAgames_2000_2024.csv', index = False)
nba_df.to_pickle("pkl/nba_df.pkl")

NULLs in the dataset  0


## Data Processing

In [56]:
# Parse the 'Date' column from 'YYYY-MM-DD' strings into datetimes;
# errors='coerce' turns values that do not match the format into NaT
# instead of raising
nba_df['Date'] = pd.to_datetime(nba_df['Date'], format='%Y-%m-%d', errors = 'coerce')