In [15]:
#import all necessary libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [22]:
#URL creator
def URL_creator(Year, Advanced = False):
    if Advanced == False:
        url="https://www.basketball-reference.com/leagues/NBA_"+str(Year)+"_per_game.html"
    else:
        url="https://www.basketball-reference.com/leagues/NBA_"+str(Year)+"_advanced.html"
    return url

In [17]:
#Web scraping basketball-reference.com for individual stats

def find_stats(url, season, Advanced = False):
    # collect HTML data
    html = urlopen(url)
    # create beautiful soup object from HTML
    soup = BeautifulSoup(html, features="lxml")
    
    #Extract information #1
    info = soup.findAll('th')
    #Find the column names
    lista=[]
    for head in info:
        lista.append(head.text)
    #Per game and advanced stats have different number of columns
    if Advanced == False:
        columns=lista[:30]
    else:
        columns=lista[:29]
    #Find the ranks
    ranks=[]
    for element in lista:
        if element not in columns:
            ranks.append(element)
            
    #Extract information #2
    players = soup.findAll('td')
    #Find players stats
    listb = []
    for stat in players:
        listb.append(stat.text)
    #Divide stats per player
    nparray = np.array(listb)
    #Handle the two cases differently again
    if Advanced == False:
        table = nparray.reshape((int(np.shape(nparray)[0]/29),29))
    else:
        table = nparray.reshape((int(np.shape(nparray)[0]/28),28))
    
    #Transform to dataframe
    df = pd.DataFrame(table)
    df.insert(0, "-1", ranks)
    df.columns = columns
    #Handle the two cases differently again
    if Advanced == False:
        df.insert(30, "Year", season)
    else:
        df.insert(29, "Year", season)
    return df

In [32]:
#Web scraping basketball-reference.com for All-NBA selections per year

# URL to scrape
url = "https://www.basketball-reference.com/awards/all_league.html"
# collect HTML data
html = urlopen(url)
# create beautiful soup object from HTML
soup = BeautifulSoup(html, features="lxml")

#Extract information #1
info = soup.findAll('th')
#Find the column names
lista=[]
for head in info:
    lista.append(head.text)
columns=lista[:9]
#Fill in the empty column names
columns[4:9]=['C', 'F1', 'F2', 'G1', 'G2']
#Find the years
years=lista[9:]

#Extract information #2
selections = soup.findAll('td')
#Find players stats
listb = []
for teams in selections:
    listb.append(teams.text)
#Divide selections per teams
nparray = np.array(listb)
table = nparray.reshape((int(np.shape(nparray)[0]/8),8))
#Transform to dataframe
df = pd.DataFrame(table)
df.insert(0, "-1", years)
df.columns = columns
df

Unnamed: 0,Season,Lg,Tm,Voting,C,F1,F2,G1,G2
0,2021-22,NBA,1st,(V),Nikola Jokić C,Giannis Antetokounmpo F,Jayson Tatum F,Luka Dončić G,Devin Booker G
1,2021-22,NBA,2nd,(V),Joel Embiid C,Kevin Durant F,DeMar DeRozan F,Ja Morant G,Stephen Curry G
2,2021-22,NBA,3rd,(V),Karl-Anthony Towns C,LeBron James F,Pascal Siakam F,Chris Paul G,Trae Young G
3,,,,,,,,,
4,2020-21,NBA,1st,(V),Nikola Jokić C,Giannis Antetokounmpo F,Kawhi Leonard F,Stephen Curry G,Luka Dončić G
...,...,...,...,...,...,...,...,...,...
283,1947-48,BAA,1st,,Howie Dallmar,Bob Feerick,Joe Fulks,Ed Sadowski,Max Zaslofsky
284,1947-48,BAA,2nd,,Carl Braun,Buddy Jeannette,John Logan,Stan Miasek,Fred Scolari
285,,,,,,,,,
286,1946-47,BAA,1st,,Bob Feerick,Joe Fulks,Bones McKinney,Stan Miasek,Max Zaslofsky


In [33]:
#Edit df

#Drop unwanted rows
Last_line_we_want = df.index[df["Season"]=="1988-89"][2]
df=df.drop(df.index[Last_line_we_want+1:])
df=df[df['Tm'].astype(bool)]
#Clean cells
df['C']=df['C'].str[:-2]
df['F1']=df['F1'].str[:-2]
df['F2']=df['F2'].str[:-2]
df['G1']=df['G1'].str[:-2]
df['G2']=df['G2'].str[:-2]
#Add columns to match with the other df
df.insert(9, "Year", df['Season'].str[:4].astype(int)+1)
df.insert(10, "Selected", 1)
df

Unnamed: 0,Season,Lg,Tm,Voting,C,F1,F2,G1,G2,Year,Selected
0,2021-22,NBA,1st,(V),Nikola Jokić,Giannis Antetokounmpo,Jayson Tatum,Luka Dončić,Devin Booker,2022,1
1,2021-22,NBA,2nd,(V),Joel Embiid,Kevin Durant,DeMar DeRozan,Ja Morant,Stephen Curry,2022,1
2,2021-22,NBA,3rd,(V),Karl-Anthony Towns,LeBron James,Pascal Siakam,Chris Paul,Trae Young,2022,1
4,2020-21,NBA,1st,(V),Nikola Jokić,Giannis Antetokounmpo,Kawhi Leonard,Stephen Curry,Luka Dončić,2021,1
5,2020-21,NBA,2nd,(V),Joel Embiid,Julius Randle,LeBron James,Chris Paul,Damian Lillard,2021,1
...,...,...,...,...,...,...,...,...,...,...,...
129,1989-90,NBA,2nd,(V),Hakeem Olajuwon,Larry Bird,Tom Chambers,Kevin Johnson,John Stockton,1990,1
130,1989-90,NBA,3rd,(V),David Robinson,Chris Mullin,James Worthy,Clyde Drexler,Joe Dumars,1990,1
132,1988-89,NBA,1st,(V),Hakeem Olajuwon,Charles Barkley,Karl Malone,Magic Johnson,Michael Jordan,1989,1
133,1988-89,NBA,2nd,(V),Patrick Ewing,Tom Chambers,Chris Mullin,Kevin Johnson,John Stockton,1989,1


In [35]:
seasons=[*range(1989,2022+1)]
print(seasons)

[1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]


In [38]:
per_game_df=pd.DataFrame()
advanced_df=pd.DataFrame()
for season in seasons:
    pd.concat([per_game_df, find_stats(URL_creator(season),season)])

In [37]:
per_game_df

In [28]:
for season in seasons:
    pd.concat([advanced_df, find_stats(URL_creator(season, Advanced = True), season, Advanced = True)])

HTTPError: HTTP Error 429: Too Many Requests

In [30]:
#Combine dataframes
dif_cols = advanced_df.columns.difference(per_game_df.columns)
dfNew = merge(per_game_df, advanced_df[dif_cols], left_index=True, right_index=True, how='outer')
print(dfNew)

NameError: name 'merge' is not defined