In [1]:
# imports
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Site roots plus the season/month grid that defines which schedule pages get visited.
# A schedule page URL is base_url + "<year>_games-<month>.html", e.g.
# https://www.basketball-reference.com/leagues/NBA_2022_games-november.html
base_url = "https://www.basketball-reference.com/leagues/NBA_"
boxscore_url = "https://www.basketball-reference.com/"
years = range(2000, 2020)  # 2000 through 2019 inclusive
months = [
    "october", "november", "december",
    "january", "february", "march",
    "april", "may", "june",
    "july", "august", "september",
]

In [None]:
def _split_team_header(header_text):
    """Split a heading of the form 'Team Name (W-L)' into (name, wins, losses) strings."""
    parts = header_text.split(' (')
    record = parts[1].split('-')
    return parts[0], record[0], record[1][:-1]  # [:-1] drops the closing ')'


# Positions, within a player row's <td> cells, of the stats kept for each starter, in order:
# minutes, fg, fga, 3p, 3pa, ft, fta, ft%, oreb, dreb, treb, ast, stl, blk, tov, pts
# (together with the player name this gives 17 values per player, which is the
# per-player width past_game_data slices by).
STAT_CELLS = (0, 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18)
REQUEST_TIMEOUT = 30  # seconds; keeps one stalled request from hanging the whole scrape

# Build one row per game:
#   [id, team1, team2, team1_wins, team1_losses, team2_wins, team2_losses,
#    17 values x first 5 listed players x 2 teams, team1 score, team2 score, date]
index = 0  # running id of successfully scraped games; never reset between pages
rows = []
for year in years:
    for month in months:
        try:
            page = requests.get(base_url + str(year) + "_games-" + month + ".html",
                                timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print("month and year combo don't match")
            continue  # nothing was fetched, so there is nothing to parse for this page
        if not page.ok:
            # An HTTP error page is not a schedule; skip it instead of parsing it.
            print("month and year combo don't match")
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        print(str(year) + " " + month)
        try:
            # Box-score link cells: <td class="center"> whose text does not contain 'OT'.
            games = soup.find_all(lambda tag: tag.name == 'td' and tag.get('class') == ['center'] and 'OT' not in tag.text)
        except Exception:
            print("something went wrong with getting the games")
            continue  # do not fall through and re-scrape the previous page's games
        for game in games:
            try:
                boxscore = requests.get(boxscore_url + game.a['href'], timeout=REQUEST_TIMEOUT)
                boxsoup = BeautifulSoup(boxscore.content, 'html.parser')
                # Get scores
                scores = boxsoup.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['score'])
                # Team headings carry the name and the 'W-L' record
                teams = boxsoup.find_all(lambda tag: tag.name == 'h2' and '-' in tag.text)
                # Find Dates
                dates = boxsoup.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['scorebox_meta'])
                team1_name, team1_win, team1_loss = _split_team_header(teams[0].text)
                team2_name, team2_win, team2_loss = _split_team_header(teams[1].text)
                row = [index, team1_name, team2_name, team1_win, team1_loss, team2_win, team2_loss]
                # Box-score tables are the ones whose caption contains '-';
                # tables without a caption are skipped rather than raising.
                boxes = boxsoup.find_all(lambda tag: tag.name == 'table' and tag.caption is not None and '-' in tag.caption.text)
                for box in (boxes[0].tbody, boxes[1].tbody):
                    players = box.find_all('tr')
                    for i in range(5):  # first five rows of each table
                        stats = players[i].find_all('td')
                        row.append(players[i].th.a.text)
                        row.extend(stats[cell].text for cell in STAT_CELLS)
                # Add scores and dates
                row.extend([scores[0].text, scores[1].text, dates[0].div.text.split(',')[1]])
            except Exception as exc:
                # Best effort: report the game that failed and keep going. Catching
                # Exception (not a bare except) lets Ctrl-C still stop the scrape.
                print("something went wrong with getting the box scores for game " + str(index) + ": " + repr(exc))
                continue
            rows.append(row)
            index += 1

2022 november
113
118
103
111
128
113
97
115


In [5]:
# One DataFrame row per scraped game. No column names are passed, so the
# columns get pandas' default integer labels.
df = pd.DataFrame(rows)

In [6]:
# Persist the scraped rows to seasons.csv; index=False leaves the row index out of the file.
df.to_csv('seasons.csv', index = False)

In [None]:
# Reload the scraped rows from seasons.csv into `games`.
# NOTE(review): the file is decoded with 'unicode_escape' while the to_csv call that
# writes seasons.csv passes no encoding — confirm the two agree, otherwise non-ASCII
# text (e.g. accented player names) may not round-trip.
# NOTE(review): thousands=',' is the same character as sep=',' — confirm this is intended.
games =pd.read_csv('seasons.csv', sep=',', thousands=',',encoding='unicode_escape')

In [None]:
# Preview the first rows of the reloaded data.
games.head()

In [None]:
# Minutes Pre-processing
def _clock_to_seconds(value):
    """Convert an 'MM:SS' string to total seconds; return any other value unchanged.

    Passing non-strings through makes the cell safe to re-run (already-converted
    integers are left alone) and tolerant of missing values.
    """
    if isinstance(value, str) and ':' in value:
        parts = value.split(':')
        return int(parts[0]) * 60 + int(parts[1])
    return value


# past_game_data treats each of the 10 player slots as 17 columns wide, the first slot's
# name sitting at column position 7; the minutes column is the one right after the name.
for i in range(10):
    minutes_col = games.columns[i*17 + 8]
    games[minutes_col] = games[minutes_col].map(_clock_to_seconds)
games.head()

In [None]:
# Overwrite column '0' with 0, 1, ..., len(games)-1 so every row has a distinct,
# increasing game number; past_game_data compares these values to decide which
# games came before a given row.
games['0'] = np.arange(len(games))

In [None]:
cols = [1, 2, 3, 4, 5, 6, 7, 9, 10, 'ast', 12, 13, 14, 'pts']

# Output schema: 4 team-record columns, then one 14-stat group per player slot
# (slot 1 unsuffixed, slots 2..10 suffixed with the slot number), then scores and date.
record_cols = ['team1_wins', 'team1_losses', 'team2_wins', 'team2_losses']
stat_cols = ['min', 'fg', 'fga', '3p', '3pa', 'ft', 'fta', 'oreb', 'dreb', 'ast', 'stl', 'blk', 'tov', 'pts']
col_names = record_cols + stat_cols
col_names += [stat + str(slot) for slot in range(2, 11) for stat in stat_cols]
col_names += ['team1pts', 'team2pts', 'dates']

# Column labels for the per-player history frames returned by past_game_data.
names = ['game_number', 'names'] + col_names[4:18]
# Function for getting a player's most recent stat lines before a given game
def past_game_data(player_name, df, index, names):
    """Return a player's most recent (up to 3) stat lines from games before a given row.

    Layout used: column position 0 (label '0') holds the game number, and the ten
    player slots are 17 columns wide each (name followed by 16 stats), the first
    slot's name sitting at column position 7.

    Parameters
    ----------
    player_name : value compared for equality against each slot's name column.
    df : DataFrame with the layout above. Assumes rows are ordered oldest first,
        since the "most recent" rows are taken from the tail — verify against caller.
    index : row position of the reference game; only rows whose game number is
        strictly smaller than that row's game number are considered.
    names : 16 column labels for the result; must include 'game_number', which is
        the column the result is sorted by.

    Returns
    -------
    DataFrame with columns `names`, sorted by 'game_number', holding at most the 3
    latest appearances (fewer, possibly zero, if the player has less history).

    Raises
    ------
    IndexError if `index` is out of range or `df` has too few columns.
    """
    slot_width = 17  # name column + 16 stat columns per player slot
    first_slot = 7   # column position of the first slot's name
    earlier = df.loc[df['0'] < df['0'].iloc[index]]

    chunks = []
    for slot in range(10):
        start = slot * slot_width + first_slot
        keep = [0] + list(range(start, start + slot_width))
        # Filter on the slot's own name column, i.e. the same column that is
        # sliced out below and ends up labelled names[1].
        appearances = earlier.loc[earlier.iloc[:, start] == player_name].tail(3).iloc[:, keep]
        # Of the 18 kept columns, drop positions 9 and 12 (the 8th and 11th stats)
        # so that 16 remain, matching `names`.
        appearances = appearances.drop(appearances.columns[[9, 12]], axis=1)
        chunks.append(appearances.values)

    player_df = pd.DataFrame(np.concatenate(chunks, axis=0), columns=names)
    return player_df.sort_values(by=['game_number']).tail(3)
# Spot check: display the stat lines past_game_data finds for 'Kobe Bryant' before the
# game at row position 2000 (raises IndexError if `games` has 2000 rows or fewer).
past_game_data('Kobe Bryant', games, 2000, names)

In [None]:
# Pre-process entire dataset and get player running 3 game totals
# Each output row: the four team-record values, then for each of the 10 player slots the
# sum over that player's previous 3 games of
#   min, fg, fga, 3p, 3pa, ft, fta, oreb, dreb, ast, stl, blk, tov, pts
# followed by the game's last three columns (team1 score, team2 score, date).
# A game is dropped entirely if any of its 10 players has fewer than 3 earlier games.
rows = []
for i in range(len(games)):
    row = [games['3'][i], games['4'][i], games['5'][i], games['6'][i]] #records
    for j in range(10):
        # Player slots are 17 columns wide starting at column position 7 — the same
        # layout past_game_data slices by.
        player = games.iloc[i, 7 + 17*j]
        temp = past_game_data(player, games, i, names)
        if len(temp) < 3:
            break  # not enough history for this player: skip the whole game
        row.extend(sum(temp[col]) for col in col_names[4:18])
    else:
        # Runs only when no player triggered the break above. The scores and date are
        # the last three values of every scraped row, so read them by position; the
        # reloaded frame has no 'team1pts'/'team2pts'/'dates' labels to look up.
        row.extend(games.iloc[i, -3:])
        rows.append(row)
    if i%1000 == 0:
        print(i)
df = pd.DataFrame(rows, columns=col_names)
df.head()

In [None]:
# Write the processed per-game feature rows to final.csv, without the row index.
df.to_csv('final.csv', index=False)