## Extract Datasets

In [5]:
pip install nba-api

Collecting nba-api
  Using cached nba_api-1.1.9-py3-none-any.whl (242 kB)
Installing collected packages: nba-api
Successfully installed nba-api-1.1.9
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install basketball-reference-scraper==v1.0.1

Note: you may need to restart the kernel to use updated packages.


In [7]:
#dependencies
import requests
import numpy as np
import pandas as pd 
import scipy.stats as st
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog

from bs4 import BeautifulSoup as bs
from basketball_reference_scraper.teams import get_roster
from basketball_reference_scraper.players import get_stats

### Extract player stats using the BeautifulSoup

In [8]:
#season lookup
year = 2020

#URL for web scraping: the 1st url looks for players stas per game and the 2nd url looks for players stats total
url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'.format(year)
url2 = 'https://www.basketball-reference.com/leagues/NBA_{}_totals.html'.format(year)

#Retrieve page with the requests module
response = requests.get(url)

#Create BeautifulSoup object; parse with 'html.parser'
soup = bs(response.text, 'html.parser')

#use findALL() to get the column headers
soup.findAll('tr', limit=2)

#use getText()to extract the text into a list
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

#exclude the first column
headers = headers[1:]

In [9]:
#check url
print(url)

https://www.basketball-reference.com/leagues/NBA_2020_per_game.html


In [10]:
#exclude the first header row
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [11]:
#include heading(column names)
stats_df = pd.DataFrame(player_stats, columns = headers)
stats_df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Steven Adams,C,26,OKC,63,63,26.7,4.5,7.6,.592,...,.582,3.3,6.0,9.3,2.3,0.8,1.1,1.5,1.9,10.9
1,Bam Adebayo,PF,22,MIA,72,72,33.6,6.1,11.0,.557,...,.691,2.4,7.8,10.2,5.1,1.1,1.3,2.8,2.5,15.9
2,LaMarcus Aldridge,C,34,SAS,53,53,33.1,7.4,15.0,.493,...,.827,1.9,5.5,7.4,2.4,0.7,1.6,1.4,2.4,18.9
3,Kyle Alexander,C,23,MIA,2,0,6.5,0.5,1.0,.500,...,,1.0,0.5,1.5,0.0,0.0,0.0,0.5,0.5,1.0
4,Nickeil Alexander-Walker,SG,21,NOP,47,1,12.6,2.1,5.7,.368,...,.676,0.2,1.6,1.8,1.9,0.4,0.2,1.1,1.2,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,Trae Young,PG,21,ATL,60,60,35.3,9.1,20.8,.437,...,.860,0.5,3.7,4.3,9.3,1.1,0.1,4.8,1.7,29.6
673,Cody Zeller,C,27,CHO,58,39,23.1,4.3,8.3,.524,...,.682,2.8,4.3,7.1,1.5,0.7,0.4,1.3,2.4,11.1
674,Tyler Zeller,C,30,SAS,2,0,2.0,0.5,2.0,.250,...,,1.5,0.5,2.0,0.0,0.0,0.0,0.0,0.0,1.0
675,Ante Žižić,C,23,CLE,22,0,10.0,1.9,3.3,.569,...,.737,0.8,2.2,3.0,0.3,0.3,0.2,0.5,1.2,4.4


In [12]:
#converting dataframe to json
json = stats_df.to_json()

#print(json).prettify()

In [13]:
#saving as json file
stats_df.to_json(r'Resources/player_per_game_stats.json')

### Extract Player Information using basketball_reference_scraper API

In [14]:
#get the names of all teams within the NBA
f=open("Resources/Team_Name.txt","r")
lines=f.readlines()

result=[]

for x in lines:
    result.append(x.split(':')[1])
f.close()

In [15]:
#Cleanup the format of all team names
clean_teamlist = list(map(lambda each:each.strip("\n"), result))
clean_teamlist = [x.strip(' ') for x in clean_teamlist]

In [16]:
#loop through all teams to get all player information active in 2020
frames = []
current_teams = []

for counter, t in enumerate(clean_teamlist):
    try:
        globals()["df"+str(counter)] = get_roster(t,2020)
        frames.append(globals()["df"+str(counter)])
        current_teams.append(str(t))
    except:
        pass

results = pd.concat(frames)

results

Unnamed: 0,NUMBER,PLAYER,POS,HEIGHT,WEIGHT,BIRTH_DATE,NATIONALITY,EXPERIENCE,COLLEGE
0,95,DeAndre' Bembry,SG,6-5,210,1994-07-04,US,3,Saint Joseph's
1,4,Charlie Brown,SG,6-6,199,1997-02-02,US,R,Saint Joseph's
2,15,Vince Carter,PF,6-6,220,1977-01-26,US,21,UNC
3,20,John Collins,PF,6-9,235,1997-09-23,US,2,Wake Forest
4,33,Allen Crabbe,SG,6-5,212,1992-04-09,US,6,California
...,...,...,...,...,...,...,...,...,...
18,14,Ish Smith,PG,6-0,175,1988-07-05,US,9,Wake Forest
19,4,Isaiah Thomas,PG,5-9,185,1989-02-07,US,8,Washington
20,7,Jarrod Uthoff,PF,6-9,221,1993-05-19,US,1,Iowa
21,21,Moritz Wagner,C,6-11,245,1997-04-26,DE,1,Michigan


In [19]:
#saving table as a json file
results.to_csv(r'Resources/active_players_info.csv')


### NBA API EXTRACTION

In [82]:
#api players call
player_dict = players.get_players()

#playoff stats of the greatest 13 players

bill_russell = playergamelog.PlayerGameLog(player_id='78049', season = 'ALL', season_type_all_star= 'Playoffs')
bill_russell = bill_russell.get_data_frames()

wilt_chamberlain = playergamelog.PlayerGameLog(player_id='76375', season = 'ALL', season_type_all_star= 'Playoffs')
wilt_chamberlain = wilt_chamberlain.get_data_frames()

kareem_abdul_jabbar = playergamelog.PlayerGameLog(player_id='76003', season = 'ALL', season_type_all_star= 'Playoffs')
kareem_abdul_jabbar = kareem_abdul_jabbar.get_data_frames()

magic_johnson = playergamelog.PlayerGameLog(player_id='77142', season = 'ALL', season_type_all_star= 'Playoffs')
magic_johnson = magic_johnson.get_data_frames()

larry_bird = playergamelog.PlayerGameLog(player_id='1449', season = 'ALL', season_type_all_star= 'Playoffs')
larry_bird = larry_bird.get_data_frames()

michael_jordan = playergamelog.PlayerGameLog(player_id='893', season = 'ALL', season_type_all_star= 'Playoffs')
michael_jordan = michael_jordan.get_data_frames()

scottie_pippen = playergamelog.PlayerGameLog(player_id='937', season = 'ALL', season_type_all_star= 'Playoffs')
scottie_pippen = scottie_pippen.get_data_frames()

shaquille_oneal = playergamelog.PlayerGameLog(player_id='406', season = 'ALL', season_type_all_star= 'Playoffs')
shaquille_oneal = shaquille_oneal.get_data_frames()

kobe_brian = playergamelog.PlayerGameLog(player_id='977', season = 'ALL', season_type_all_star= 'Playoffs')
kobe_brian = kobe_brian.get_data_frames()

lebron_james = playergamelog.PlayerGameLog(player_id='2544', season = 'ALL', season_type_all_star= 'Playoffs')
lebron_james = lebron_james.get_data_frames()

stephen_curry = playergamelog.PlayerGameLog(player_id='201939', season = 'ALL', season_type_all_star= 'Playoffs')
stephen_curry = stephen_curry.get_data_frames()

kevin_durant = playergamelog.PlayerGameLog(player_id='201142', season = 'ALL', season_type_all_star= 'Playoffs')
kevin_durant = kevin_durant.get_data_frames()

kawhi_leonard = playergamelog.PlayerGameLog(player_id='202695', season = 'ALL', season_type_all_star= 'Playoffs')
kawhi_leonard = kawhi_leonard.get_data_frames()

#convert player info into dataframe
player_df = pd.DataFrame(player_dict)

#select columns of interest from each players database and store seperately
bill_russell_ss = bill_russell[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
wilt_chamberlain_ss = wilt_chamberlain[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
kareem_abdul_jabbar_ss = kareem_abdul_jabbar[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
magic_johnson_ss = magic_johnson[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
larry_bird_ss = larry_bird[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
michael_jordan_ss = michael_jordan[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
scottie_pippen_ss = scottie_pippen[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
shaquille_oneal_ss = shaquille_oneal[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
kobe_brian_ss = kobe_brian[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
lebron_james_ss = lebron_james[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
stephen_curry_ss = stephen_curry[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
kevin_durant_ss = kevin_durant[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]
kawhi_leonard_ss = kawhi_leonard[0][['Player_ID', 'MATCHUP', 'WL', 'FG3_PCT', 'FGM', 'FTM', 'MIN', 'REB', 'AST', 'PTS', 'BLK', 'STL']]



In [83]:
#insert integers to replace Nan values
bill_russell_ss.insert(0, 'name', 'Bill Russel')
wilt_chamberlain_ss.insert(0, 'name', 'Wilt Chamberlain') 
kareem_abdul_jabbar_ss.insert(0, 'name', 'Kareem Abdul-Jabbar')
larry_bird_ss.insert(0, 'name', "Larry Bird")
michael_jordan_ss.insert(0, 'name', "Michael Jordan")
scottie_pippen_ss.insert(0, 'name', "Scottie Pippen")
shaquille_oneal_ss.insert(0, 'name', "Shaquille O'Neal")
kobe_brian_ss.insert(0, 'name', "Kobe Bryant") 
lebron_james_ss.insert(0, 'name', 'LeBron James')
stephen_curry_ss.insert(0, 'name', "Stephen Curry")
kevin_durant_ss.insert(0, 'name', 'Kevin Durant')
kawhi_leonard_ss.insert(0, 'name', 'Kawhi Leonard')

In [90]:
#data frame of selected columns with all 13 great players
frames_goat_games = [bill_russell_ss, wilt_chamberlain_ss, kareem_abdul_jabbar_ss, 
                     magic_johnson_ss, larry_bird_ss, michael_jordan_ss, 
                     scottie_pippen_ss, shaquille_oneal_ss, kobe_brian_ss, 
                     lebron_james_ss, stephen_curry_ss, kevin_durant_ss, kawhi_leonard_ss]

#combine both dataframes
GOAT_playoffs_games = pd.concat(frames_goat_games, sort = False)

#concatenated table
GOAT_playoffs_games

Unnamed: 0,name,Player_ID,MATCHUP,WL,FG3_PCT,FGM,FTM,MIN,REB,AST,PTS,BLK,STL
0,Bill Russel,78049,BOS @ LAL,W,,2,2,48,21,6.0,6,,
1,Bill Russel,78049,BOS vs. LAL,W,,3,3,48,19,2.0,9,,
2,Bill Russel,78049,BOS @ LAL,L,,3,1,48,13,5.0,7,,
3,Bill Russel,78049,BOS vs. LAL,W,,2,2,48,29,2.0,6,,
4,Bill Russel,78049,BOS vs. LAL,W,,5,1,48,18,3.0,11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Kawhi Leonard,202695,SAS vs. LAC,W,1,5,3,28,6,1.0,16,1,3
120,Kawhi Leonard,202695,SAS @ UTA,W,0.5,1,0,20,5,0.0,3,0,0
121,Kawhi Leonard,202695,SAS @ UTA,W,0,1,0,15,3,0.0,2,0,0
122,Kawhi Leonard,202695,SAS vs. UTA,W,0.75,6,2,25,3,1.0,17,0,2


## Basektball-reference GOAT extraction

In [85]:

df_jordan = get_stats('Michael Jordan', stat_type='PER_GAME', playoffs=True, career=False)
df_james = get_stats('LeBron James', stat_type='PER_GAME', playoffs=True, career=False)
df_kareem = get_stats('Kareem Abdul-Jabbar', stat_type='PER_GAME', playoffs=True, career=False)
df_russel = get_stats('Bill Russell', stat_type='PER_GAME', playoffs=True, career=False)
df_magic = get_stats('Magic Johnson', stat_type='PER_GAME', playoffs=True, career=False)
df_bird = get_stats('Larry Bird', stat_type='PER_GAME', playoffs=True, career=False)
df_pippen = get_stats('Scottie Pippen', stat_type='PER_GAME', playoffs=True, career=False)
df_oneal = get_stats("Shaquille O'Neal", stat_type='PER_GAME', playoffs=True, career=False)
df_kobe = get_stats('Kobe Bryant', stat_type='PER_GAME', playoffs=True, career=False)
df_durant = get_stats('Kevin Durant', stat_type='PER_GAME', playoffs=True, career=False)
df_kawhi = get_stats('Kawhi Leonard', stat_type='PER_GAME', playoffs=True, career=False)
df_steph = get_stats('Stephen Curry', stat_type='PER_GAME', playoffs=True, career=False)

In [86]:
data_jordan = pd.DataFrame(df_jordan)
data_jordan.insert(0, 'name', 'Michael Jordan')
data_james = pd.DataFrame(df_james)
data_james.insert(0, 'name', 'LeBron James')
data_kareem = pd.DataFrame(df_kareem)
data_kareem.insert(0, 'name', 'Kareem Abdul-Jabbar')
data_russel = pd.DataFrame(df_russel)
data_russel.insert(0, 'name', 'Bill Russell')
data_magic = pd.DataFrame(df_magic)
data_magic.insert(0, 'name', 'Magic Johnson')
data_bird = pd.DataFrame(df_bird)
data_bird.insert(0, 'name', 'Larry Bird')
data_pippen = pd.DataFrame(df_pippen)
data_pippen.insert(0, 'name', "Scottie Pippen")
data_oneal = pd.DataFrame(df_oneal)
data_oneal.insert(0, 'name', "Shaquille O'Neal")
data_kobe = pd.DataFrame(df_kobe)
data_kobe.insert(0, 'name', 'Kobe Bryant')
data_durant = pd.DataFrame(df_durant)
data_durant.insert(0, 'name', 'Kevin Durant')
data_kawhi = pd.DataFrame(df_kawhi)
data_kawhi.insert(0, 'name', 'Kawhi Leonard')
data_steph = pd.DataFrame(df_steph)
data_steph.insert(0, 'name', 'Steph Curry')



In [87]:
frames_data = [data_jordan ,data_james ,data_kareem ,data_russel ,data_magic ,data_bird ,data_pippen ,data_oneal ,data_kobe ,data_durant ,data_kawhi ,data_steph]
GOAT_season_avg = pd.concat(frames_data, sort = False)
GOAT_season_avg

Unnamed: 0,name,SEASON,AGE,TEAM,LEAGUE,POS,G,GS,MP,FG,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Michael Jordan,1984-85,21.0,CHI,NBA,SG,4.0,4.0,42.8,8.5,...,0.83,1.8,4.0,5.8,8.5,2.8,1.0,3.8,3.8,29.3
1,Michael Jordan,1985-86,22.0,CHI,NBA,SG,3.0,3.0,45.0,16.0,...,0.87,1.7,4.7,6.3,5.7,2.3,1.3,4.7,4.3,43.7
2,Michael Jordan,1986-87,23.0,CHI,NBA,SG,3.0,3.0,42.7,11.7,...,0.90,2.3,4.7,7.0,6.0,2.0,2.3,2.7,3.7,35.7
3,Michael Jordan,1987-88,24.0,CHI,NBA,SG,10.0,10.0,42.7,13.8,...,0.87,2.3,4.8,7.1,4.7,2.4,1.1,3.9,3.8,36.3
4,Michael Jordan,1988-89,25.0,CHI,NBA,SG,17.0,17.0,42.2,11.7,...,0.80,1.5,5.5,7.0,7.6,2.5,0.8,4.0,3.8,34.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,Steph Curry,2014-15,26.0,GSW,NBA,PG,21.0,21.0,39.3,9.5,...,0.83,1.0,4.0,5.0,6.4,1.9,0.1,3.9,2.2,28.3
3,Steph Curry,2015-16,27.0,GSW,NBA,PG,18.0,17.0,34.1,8.2,...,0.92,0.8,4.7,5.5,5.2,1.4,0.3,4.2,2.2,25.1
4,Steph Curry,2016-17,28.0,GSW,NBA,PG,17.0,17.0,35.4,8.9,...,0.90,1.3,4.9,6.2,6.7,2.0,0.2,3.4,2.2,28.1
5,Steph Curry,2017-18,29.0,GSW,NBA,PG,15.0,14.0,37.0,9.1,...,0.96,0.6,5.5,6.1,5.4,1.7,0.7,2.9,2.5,25.5


### GOAT PLAYOFF TABLES


In [88]:
GOAT_playoffs_games

Unnamed: 0,name,Player_ID,MATCHUP,WL,FG3_PCT,FGM,FTM,MIN,REB,AST,PTS,BLK,STL
0,Bill Russel,78049,BOS @ LAL,W,,2,2,48,21,6.0,6,,
1,Bill Russel,78049,BOS vs. LAL,W,,3,3,48,19,2.0,9,,
2,Bill Russel,78049,BOS @ LAL,L,,3,1,48,13,5.0,7,,
3,Bill Russel,78049,BOS vs. LAL,W,,2,2,48,29,2.0,6,,
4,Bill Russel,78049,BOS vs. LAL,W,,5,1,48,18,3.0,11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Kawhi Leonard,202695,SAS vs. LAC,W,1,5,3,28,6,1.0,16,1,3
120,Kawhi Leonard,202695,SAS @ UTA,W,0.5,1,0,20,5,0.0,3,0,0
121,Kawhi Leonard,202695,SAS @ UTA,W,0,1,0,15,3,0.0,2,0,0
122,Kawhi Leonard,202695,SAS vs. UTA,W,0.75,6,2,25,3,1.0,17,0,2


In [89]:
GOAT_playoffs_games.to_csv(r'Resources/GOAT_playoffs_games.csv')