# Web scraping Tutorial

* This notebook is a quick reference on how to use beautifulsoup4 to download data from websites- specifically basketball-reference.com

In [48]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen
import os

In [49]:
url = 'https://www.basketball-reference.com/players/e/embiijo01/gamelog/2021'
page = requests.get(url)
page

<Response [200]>

In [50]:
html = urlopen(url)
html

<http.client.HTTPResponse at 0x7fb4a48ff390>

In [51]:
soup = BeautifulSoup(html)
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" data-root="/home/bbr/deploy/www" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
 <head>
  <!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
  <script async="true" type="text/javascript">
   (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://quantcast.mgr.consensu.org'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, '/choice.js')
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function makeStub() {
	    var TCF_LOCATOR_NAME = '__tcfapiLocator';
	    var queue = [];
	    var win = window;
	    var cmpFrame;
	    
	    function addFrame() {
		var doc = win.document;
		var otherCMP = !!(win.frames[TCF_LOCATOR_NAME]);
		
		if (!oth

In [52]:
# using findAll toget the column headers

soup.findAll('thead')

headers = [th.getText() for th in soup.findAll('thead')[0].findAll('th')]
headers 

['Rk',
 'G',
 'Date',
 'Age',
 'Tm',
 '\xa0',
 'Opp',
 '\xa0',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'GmSc',
 '+/-']

In [53]:
headers.remove('Rk')
headers

['G',
 'Date',
 'Age',
 'Tm',
 '\xa0',
 'Opp',
 '\xa0',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'GmSc',
 '+/-']

In [54]:
#stats = [[td.gettext() for td in soup.findAll('td', {'data-stat': stat})] for stat in headers]
# shrink it down to just the data table stats

# this table_body returns as a list
#table_body = soup.findAll('tbody')
#table_body

#def table_body(soup):
   # return soup.has_attr('data-stat')
#rows = table_body.find_all('tr')

table_body = soup.tbody
#table_body

rows = table_body.findAll('tr')
rows


[<tr id="pgl_basic.210"><th class="right " csk="1" data-stat="ranker" scope="row">1</th><td class="right endpoint tooltip" data-endpoint="/players/pgl_cum_stats.cgi?player=embiijo01&amp;year=2021&amp;date_game=2020-12-23&amp;is_playoff_game=N" data-stat="game_season"><strong>1</strong></td><td class="left " data-stat="date_game"><a href="/boxscores/202012230PHI.html">2020-12-23</a></td><td class="right " data-stat="age">26-282</td><td class="left " data-stat="team_id"><a href="/teams/PHI/2021.html">PHI</a></td><td class="center iz" data-stat="game_location"></td><td class="left " data-stat="opp_id"><a href="/teams/WAS/2021.html">WAS</a></td><td class="center " csk="6" data-stat="game_result">W (+6)</td><td class="right " data-stat="gs">1</td><td class="right " csk="2090" data-stat="mp">34:50</td><td class="right " data-stat="fg">10</td><td class="right " data-stat="fga">17</td><td class="right " data-stat="fg_pct">.588</td><td class="right " data-stat="fg3">1</td><td class="right " dat

In [55]:
#rk = soup.find_all(attrs={'data-stat':'ranker'})
#rk


In [56]:
player_stats = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]
player_stats

[['1',
  '2020-12-23',
  '26-282',
  'PHI',
  '',
  'WAS',
  'W (+6)',
  '1',
  '34:50',
  '10',
  '17',
  '.588',
  '1',
  '1',
  '1.000',
  '8',
  '9',
  '.889',
  '2',
  '12',
  '14',
  '2',
  '0',
  '1',
  '3',
  '3',
  '29',
  '23.6',
  '-2'],
 ['2',
  '2020-12-26',
  '26-285',
  'PHI',
  '@',
  'NYK',
  'W (+20)',
  '1',
  '30:39',
  '10',
  '20',
  '.500',
  '2',
  '4',
  '.500',
  '5',
  '7',
  '.714',
  '4',
  '6',
  '10',
  '2',
  '1',
  '1',
  '1',
  '3',
  '27',
  '21.7',
  '+18'],
 ['', '2020-12-27', '26-286', 'PHI', '@', 'CLE', 'L (-24)', 'Did Not Dress'],
 ['3',
  '2020-12-29',
  '26-288',
  'PHI',
  '',
  'TOR',
  'W (+7)',
  '1',
  '37:53',
  '7',
  '17',
  '.412',
  '1',
  '6',
  '.167',
  '14',
  '16',
  '.875',
  '4',
  '12',
  '16',
  '4',
  '2',
  '2',
  '5',
  '1',
  '29',
  '26.3',
  '+15'],
 ['4',
  '2020-12-31',
  '26-290',
  'PHI',
  '@',
  'ORL',
  'W (+24)',
  '1',
  '26:41',
  '7',
  '11',
  '.636',
  '1',
  '2',
  '.500',
  '6',
  '8',
  '.750',
  '0',
  

In [57]:
gamelog = pd.DataFrame(player_stats, columns = headers)
gamelog

Unnamed: 0,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1.0,2020-12-23,26-282,PHI,,WAS,W (+6),1,34:50,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0
1,2.0,2020-12-26,26-285,PHI,@,NYK,W (+20),1,30:39,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0
2,,2020-12-27,26-286,PHI,@,CLE,L (-24),Did Not Dress,,,,,,,,,,,,,,,,,,,,,
3,3.0,2020-12-29,26-288,PHI,,TOR,W (+7),1,37:53,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0
4,4.0,2020-12-31,26-290,PHI,@,ORL,W (+24),1,26:41,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0
5,5.0,2021-01-02,26-292,PHI,,CHO,W (+15),1,36:44,7.0,11.0,0.636,1.0,2.0,0.5,4.0,6.0,0.667,3.0,11.0,14.0,4.0,1.0,1.0,5.0,2.0,19.0,17.4,19.0
6,6.0,2021-01-04,26-294,PHI,,CHO,W (+17),1,24:54,4.0,10.0,0.4,0.0,3.0,0.0,6.0,8.0,0.75,1.0,10.0,11.0,4.0,2.0,3.0,1.0,1.0,14.0,17.0,18.0
7,7.0,2021-01-06,26-296,PHI,,WAS,W (+5),1,36:36,11.0,20.0,0.55,3.0,4.0,0.75,13.0,13.0,1.0,0.0,8.0,8.0,5.0,3.0,3.0,5.0,1.0,38.0,34.0,14.0
8,8.0,2021-01-07,26-297,PHI,@,BRK,L (-13),1,29:52,7.0,14.0,0.5,2.0,2.0,1.0,4.0,5.0,0.8,0.0,12.0,12.0,3.0,0.0,1.0,5.0,5.0,20.0,12.0,-16.0
9,,2021-01-09,26-299,PHI,,DEN,L (-12),Inactive,,,,,,,,,,,,,,,,,,,,,


In [58]:
gamelog.to_csv("joelembiid_gamelog.csv")

In [59]:
df_embiid = pd.read_csv('joelembiid_gamelog.csv')

In [60]:
df_embiid["FPTS"] = (df_embiid.PTS) + (df_embiid.TRB*1.2) + (df_embiid.AST*1.5) + (df_embiid.STL*3) + (df_embiid.BLK*3) + (df_embiid.TOV*-1)

In [61]:
df_embiid

Unnamed: 0.1,Unnamed: 0,G,Date,Age,Tm,Unnamed: 6,Opp,.1,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,FPTS
0,0,1.0,2020-12-23,26-282,PHI,,WAS,W (+6),1,34:50,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0,48.8
1,1,2.0,2020-12-26,26-285,PHI,@,NYK,W (+20),1,30:39,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0,47.0
2,2,,2020-12-27,26-286,PHI,@,CLE,L (-24),Did Not Dress,,,,,,,,,,,,,,,,,,,,,,
3,3,3.0,2020-12-29,26-288,PHI,,TOR,W (+7),1,37:53,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0,61.2
4,4,4.0,2020-12-31,26-290,PHI,@,ORL,W (+24),1,26:41,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0,40.8
5,5,5.0,2021-01-02,26-292,PHI,,CHO,W (+15),1,36:44,7.0,11.0,0.636,1.0,2.0,0.5,4.0,6.0,0.667,3.0,11.0,14.0,4.0,1.0,1.0,5.0,2.0,19.0,17.4,19.0,42.8
6,6,6.0,2021-01-04,26-294,PHI,,CHO,W (+17),1,24:54,4.0,10.0,0.4,0.0,3.0,0.0,6.0,8.0,0.75,1.0,10.0,11.0,4.0,2.0,3.0,1.0,1.0,14.0,17.0,18.0,47.2
7,7,7.0,2021-01-06,26-296,PHI,,WAS,W (+5),1,36:36,11.0,20.0,0.55,3.0,4.0,0.75,13.0,13.0,1.0,0.0,8.0,8.0,5.0,3.0,3.0,5.0,1.0,38.0,34.0,14.0,68.1
8,8,8.0,2021-01-07,26-297,PHI,@,BRK,L (-13),1,29:52,7.0,14.0,0.5,2.0,2.0,1.0,4.0,5.0,0.8,0.0,12.0,12.0,3.0,0.0,1.0,5.0,5.0,20.0,12.0,-16.0,36.9
9,9,,2021-01-09,26-299,PHI,,DEN,L (-12),Inactive,,,,,,,,,,,,,,,,,,,,,,


In [62]:
# fucntion to turn minutes played into a float. 

def min_to_float(x):
    if pd.isna(x)==True:
        return 0
    else:
        mins = int(x.split(':')[0])
        secs = int(x.split(':')[1])/60.0
        val = mins + secs
        return round(val, 1)

# apply to df

df_embiid['MP'] = df_embiid['MP'].apply(min_to_float)

In [63]:
df_embiid

Unnamed: 0.1,Unnamed: 0,G,Date,Age,Tm,Unnamed: 6,Opp,.1,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,FPTS
0,0,1.0,2020-12-23,26-282,PHI,,WAS,W (+6),1,34.8,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0,48.8
1,1,2.0,2020-12-26,26-285,PHI,@,NYK,W (+20),1,30.6,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0,47.0
2,2,,2020-12-27,26-286,PHI,@,CLE,L (-24),Did Not Dress,0.0,,,,,,,,,,,,,,,,,,,,,
3,3,3.0,2020-12-29,26-288,PHI,,TOR,W (+7),1,37.9,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0,61.2
4,4,4.0,2020-12-31,26-290,PHI,@,ORL,W (+24),1,26.7,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0,40.8
5,5,5.0,2021-01-02,26-292,PHI,,CHO,W (+15),1,36.7,7.0,11.0,0.636,1.0,2.0,0.5,4.0,6.0,0.667,3.0,11.0,14.0,4.0,1.0,1.0,5.0,2.0,19.0,17.4,19.0,42.8
6,6,6.0,2021-01-04,26-294,PHI,,CHO,W (+17),1,24.9,4.0,10.0,0.4,0.0,3.0,0.0,6.0,8.0,0.75,1.0,10.0,11.0,4.0,2.0,3.0,1.0,1.0,14.0,17.0,18.0,47.2
7,7,7.0,2021-01-06,26-296,PHI,,WAS,W (+5),1,36.6,11.0,20.0,0.55,3.0,4.0,0.75,13.0,13.0,1.0,0.0,8.0,8.0,5.0,3.0,3.0,5.0,1.0,38.0,34.0,14.0,68.1
8,8,8.0,2021-01-07,26-297,PHI,@,BRK,L (-13),1,29.9,7.0,14.0,0.5,2.0,2.0,1.0,4.0,5.0,0.8,0.0,12.0,12.0,3.0,0.0,1.0,5.0,5.0,20.0,12.0,-16.0,36.9
9,9,,2021-01-09,26-299,PHI,,DEN,L (-12),Inactive,0.0,,,,,,,,,,,,,,,,,,,,,


In [66]:
# binary for game played vs not played
df_embiid['g'] = [1 if pd.isna(df_embiid.G[gm])==False else 0 for gm in range(len(df_embiid))]
df_embiid.head()

Unnamed: 0.1,Unnamed: 0,G,Date,Age,Tm,game_loc,Opp,outcome,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,FPTS,g
0,0,1.0,2020-12-23,26-282,PHI,,WAS,W (+6),1,34.8,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0,48.8,1
1,1,2.0,2020-12-26,26-285,PHI,@,NYK,W (+20),1,30.6,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0,47.0,1
2,2,,2020-12-27,26-286,PHI,@,CLE,L (-24),Did Not Dress,0.0,,,,,,,,,,,,,,,,,,,,,,0
3,3,3.0,2020-12-29,26-288,PHI,,TOR,W (+7),1,37.9,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0,61.2,1
4,4,4.0,2020-12-31,26-290,PHI,@,ORL,W (+24),1,26.7,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0,40.8,1


In [67]:
# change column name
column_names = {'': 'game_loc', '.1': 'outcome'}
df_embiid.columns = df_embiid.columns.str.strip()
df_embiid = df_embiid.rename(columns = column_names)
df_embiid.columns

Index(['Unnamed: 0', 'G', 'Date', 'Age', 'Tm', 'game_loc', 'Opp', 'outcome',
       'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc',
       '+/-', 'FPTS', 'g'],
      dtype='object')

In [68]:
df_embiid['HOME_GAME'] = [1 if pd.isna(df_embiid.game_loc[gm])==True else 0 for gm in range(len(df_embiid))]
df_embiid.drop(['game_loc'], inplace=True, axis=1)
df_embiid.head()

Unnamed: 0.1,Unnamed: 0,G,Date,Age,Tm,Opp,outcome,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,FPTS,g,HOME_GAME
0,0,1.0,2020-12-23,26-282,PHI,WAS,W (+6),1,34.8,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0,48.8,1,1
1,1,2.0,2020-12-26,26-285,PHI,NYK,W (+20),1,30.6,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0,47.0,1,0
2,2,,2020-12-27,26-286,PHI,CLE,L (-24),Did Not Dress,0.0,,,,,,,,,,,,,,,,,,,,,,0,0
3,3,3.0,2020-12-29,26-288,PHI,TOR,W (+7),1,37.9,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0,61.2,1,1
4,4,4.0,2020-12-31,26-290,PHI,ORL,W (+24),1,26.7,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0,40.8,1,0


In [69]:
# add back-to-back indicator
# df_embiid['back_to_back'] = [1 if ]
df_embiid['Date'] = pd.to_datetime(df_embiid['Date'])

# making column out of the difference in days. 
df_embiid['back_to_back'] = pd.to_numeric(df_embiid.Date.dt.day.diff(), downcast='integer', errors='coerce')


# need to make a column where 1 for back to back and zero for not
df_embiid['back_to_back'] = [1 if (df_embiid.back_to_back[x] == 1.0) == True else 0 for x in range(len(df_embiid))]




df_embiid.head()

Unnamed: 0.1,Unnamed: 0,G,Date,Age,Tm,Opp,outcome,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,FPTS,g,HOME_GAME,back_to_back
0,0,1.0,2020-12-23,26-282,PHI,WAS,W (+6),1,34.8,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0,48.8,1,1,0
1,1,2.0,2020-12-26,26-285,PHI,NYK,W (+20),1,30.6,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0,47.0,1,0,0
2,2,,2020-12-27,26-286,PHI,CLE,L (-24),Did Not Dress,0.0,,,,,,,,,,,,,,,,,,,,,,0,0,1
3,3,3.0,2020-12-29,26-288,PHI,TOR,W (+7),1,37.9,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0,61.2,1,1,0
4,4,4.0,2020-12-31,26-290,PHI,ORL,W (+24),1,26.7,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0,40.8,1,0,0


In [70]:
# getting wins into binary
df_embiid['win'] = [1 if df_embiid['outcome'][gm][:1] == 'W' else 0 for gm in range(len(df_embiid))]
sum(df_embiid.win)

12

In [None]:
# change 'outcome' to just a positive or negative integer
