<a href="https://colab.research.google.com/github/frankwillard/NBA-Web-Scraper-And-ANN/blob/main/Final_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# import needed libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import lxml

In [11]:
def remove_items(test_list, item):
    """Return a new list with every occurrence of ``item`` dropped.

    Parameters
    ----------
    test_list : iterable
        Elements to filter; it is not modified.
    item : object
        Value to exclude. Elements are compared with ``!=``.

    Returns
    -------
    list
        The elements of ``test_list`` that are not equal to ``item``,
        in their original order.
    """
    kept = []
    for element in test_list:
        if element != item:
            kept.append(element)
    return kept

In [12]:
def multiple_replace(string, rep_dict):
    """Replace every occurrence of each key of ``rep_dict`` in ``string``.

    All replacements happen in a single left-to-right pass, so text
    produced by one replacement is never re-scanned for other keys.

    Parameters
    ----------
    string : str
        Text to process.
    rep_dict : dict[str, str]
        Maps literal substrings (not regular expressions) to their
        replacement text.

    Returns
    -------
    str
        ``string`` with the substitutions applied. If ``rep_dict`` is
        empty, ``string`` is returned unchanged.
    """
    if not rep_dict:
        # An empty alternation compiles to a pattern that matches the empty
        # string at every position, and looking up "" would raise KeyError.
        return string

    # Longest keys first: in a regex alternation the first matching branch
    # wins, so a short key must not shadow a longer key it is a prefix of.
    keys_longest_first = sorted(rep_dict, key=len, reverse=True)
    alternation = "|".join(re.escape(key) for key in keys_longest_first)
    pattern = re.compile(alternation, flags=re.DOTALL)
    return pattern.sub(lambda match: rep_dict[match.group(0)], string)

In [29]:
# Helpers and entry point for scraping team performance for multiple years.

def _find_league_champion(soup):
    """Return the champion's team name from a parsed season page, or None.

    The <strong> tag labelled "League Champion" is looked up first; if no
    such tag exists the second <strong> on the page is used as a positional
    fallback. The name is the text of the first <a> sibling following that
    tag. None is returned when no candidate tag or link is found.
    """
    label = soup.find("strong", string=re.compile("League Champion"))
    if label is None:
        strong_tags = soup.find_all("strong")
        if len(strong_tags) < 2:
            return None
        label = strong_tags[1]
    link = label.find_next_sibling("a")
    return link.getText() if link is not None else None


def _scrape_season(year):
    """Download one season page and return its advanced team table.

    Parameters
    ----------
    year : int
        Season identifier substituted into the basketball-reference URL.

    Returns
    -------
    pandas.DataFrame
        One row per data row of the 'advanced-team' table, with a leading
        "Year" column, the table's own columns (minus "Rk" and
        "Attend./G"), and the derived columns "Playoffs", "W/L%",
        "Losing_season" and "Champion".

    Raises
    ------
    ValueError
        If the page has no element with id 'advanced-team'.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}.html"

    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    league_champ = _find_league_champion(soup)

    adv_table = soup.find(id='advanced-team')
    if adv_table is None:
        raise ValueError(f"No 'advanced-team' table found at {url}")

    # The second header row carries the column labels; '\xa0' headers are
    # blank spacer columns.
    header_cells = adv_table.find_all('tr', limit=2)[1].find_all('th')
    adv_cols = remove_items([th.getText() for th in header_cells], '\xa0')
    adv_cols.remove('Attend./G')

    # Some stat labels appear twice in the header. Prefix the first group
    # with "O" and the second with "D" so column names are unique
    # (e.g. 'OeFG%' / 'DeFG%'). Positions are relative to the list that
    # still starts with "Rk".
    for i in (17, 18, 20):
        adv_cols[i] = "O" + adv_cols[i]
    for i in (21, 22, 24):
        adv_cols[i] = "D" + adv_cols[i]

    # "Rk" is a <th> in the header but is not among the <td> cells read
    # below; "Year" is prepended to every data row instead.
    adv_cols.remove("Rk")
    adv_cols.insert(0, "Year")

    # Characters stripped from cell text so numeric cells parse as floats.
    reg_dict = {
        "+": "",
        ",": ""
    }

    team_stats = []
    for row in adv_table.tbody.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            # A row without <td> cells carries no team data.
            continue
        # The last cell is skipped (presumably the 'Attend./G' column whose
        # header was removed above) and empty spacer cells are dropped.
        texts = [cell.getText() for cell in cells[:-1]]
        values = [multiple_replace(text, reg_dict) for text in texts if text != '']
        team_stats.append([year] + values)

    # create a dataframe with all acquired info
    year_standings = pd.DataFrame(team_stats, columns=adv_cols)

    # A '*' in the team name marks a playoff appearance; record it, then
    # strip the marker from the name.
    year_standings["Playoffs"] = ["Y" if "*" in name else "N" for name in year_standings["Team"]]
    year_standings["Team"] = [name.replace('*', '') for name in year_standings["Team"]]

    for col in year_standings.columns:
        if col not in ["Team", "Arena", "Playoffs"]:
            year_standings[col] = year_standings[col].astype(float)

    year_standings["W/L%"] = year_standings["W"] / (year_standings["W"] + year_standings["L"])

    # losing season indicator (win % < .5)
    year_standings["Losing_season"] = ["Y" if float(pct) < .5 else "N" for pct in year_standings["W/L%"]]

    year_standings["Champion"] = ["Y" if name == league_champ else "N" for name in year_standings["Team"]]

    return year_standings


def _add_title_history(standings):
    """Return a copy of ``standings`` with 'won_last' and 'won_last_3' added.

    'won_last' is "Y" when the same team is marked champion for Year - 1;
    'won_last_3' is "Y" when it is marked champion for any of Year - 1,
    Year - 2 or Year - 3. Seasons that are absent from ``standings`` count
    as "not champion", so a team's first rows are never filled from a
    different team's history.
    """
    champion_seasons = {
        (team, season)
        for team, season, flag in zip(standings["Team"], standings["Year"], standings["Champion"])
        if flag == "Y"
    }

    def won_within(team, season, span):
        return any((team, season - back) in champion_seasons for back in range(1, span + 1))

    team_seasons = list(zip(standings["Team"], standings["Year"]))
    result = standings.copy()
    result["won_last"] = ["Y" if won_within(t, s, 1) else "N" for t, s in team_seasons]
    result["won_last_3"] = ["Y" if won_within(t, s, 3) else "N" for t, s in team_seasons]
    return result


def scrape_NBA_team_data(years = [2017, 2018]):
    """Scrape advanced team stats for several seasons and write them to CSV.

    For each entry of ``years`` the season page on basketball-reference.com
    is downloaded and its 'advanced-team' table is parsed. The seasons are
    combined, the title-history flags 'won_last' and 'won_last_3' are added
    per team, rows with Year <= 1989 are dropped, and the result is written
    to "nba_team_advanced_data.csv" in the working directory, sorted by
    Year then Team.

    Parameters
    ----------
    years : iterable of int
        Seasons to scrape. The default list is never mutated.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``years`` is empty, or a season page lacks the expected table.

    Notes
    -----
    Calling this function also sets the global pandas option
    'display.max_rows' to 20.
    """
    # Build one frame per season and concatenate once, instead of growing a
    # frame inside the loop.
    season_frames = [_scrape_season(year) for year in years]
    if not season_frames:
        raise ValueError("years must contain at least one season to scrape")

    # ignore_index gives unique row labels; each season frame starts at 0.
    final_df = pd.concat(season_frames, ignore_index=True)

    final_df = _add_title_history(final_df)

    # NOTE(review): global display side effect kept for backward
    # compatibility; it would be better placed in a notebook config cell.
    pd.set_option('display.max_rows', 20)

    # Seasons up to 1989 are excluded from the output; presumably they are
    # scraped only to provide look-back history for the title flags.
    final_df = final_df[final_df.Year > 1989.0]

    final_df = final_df.sort_values(by=['Year', 'Team'])

    # export to csv
    final_df.to_csv("nba_team_advanced_data.csv", index=False)

In [30]:
# Scrape every season from 1987 through 2022 (the range end is exclusive).
scrape_NBA_team_data(years = list(range(1987, 2023)))