<a href="https://colab.research.google.com/github/Sethicus-Millicus/basketball_data/blob/web-scraping-edits/pull_college_gamelogs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web scraping Tutorial

* This notebook is a quick reference on how to use beautifulsoup4 to download data from websites- specifically basketball-reference.com

In [61]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen
import os

In [62]:
url = 'https://www.sports-reference.com/cbb/players/matthew-hurt-1/gamelog/2021'
page = requests.get(url)
page

<Response [200]>

In [63]:
html = urlopen(url)
html

<http.client.HTTPResponse at 0x7fa795cfc1d0>

In [64]:
soup = BeautifulSoup(html)
#print(soup.prettify())

In [65]:
# using findAll toget the column headers

soup.findAll('thead')

headers = [th.getText() for th in soup.findAll('thead')[0].findAll('th')]
headers 

['Rk',
 'Date',
 'School',
 '\xa0',
 'Opponent',
 'Type',
 '\xa0',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '2P',
 '2PA',
 '2P%',
 '3P',
 '3PA',
 '3P%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [66]:
headers.remove('Rk')
#remove extra header
headers

['Date',
 'School',
 '\xa0',
 'Opponent',
 'Type',
 '\xa0',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '2P',
 '2PA',
 '2P%',
 '3P',
 '3PA',
 '3P%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [67]:
#stats = [[td.gettext() for td in soup.findAll('td', {'data-stat': stat})] for stat in headers]
# shrink it down to just the data table stats

# this table_body returns as a list
#table_body = soup.findAll('tbody')
#table_body

#def table_body(soup):
   # return soup.has_attr('data-stat')
#rows = table_body.find_all('tr')

table_body = soup.tbody
#table_body

rows = table_body.findAll('tr')
#rows


In [68]:
#rk = soup.find_all(attrs={'data-stat':'ranker'})
#rk


In [69]:
player_stats = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]
#player_stats

In [70]:
gamelog = pd.DataFrame(player_stats, columns = headers)
#gamelog

In [71]:
gamelog.head()

Unnamed: 0,Date,School,Unnamed: 3,Opponent,Type,Unnamed: 6,GS,MP,FG,FGA,FG%,2P,2PA,2P%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2020-11-28,Duke,,Coppin State,REG,W,1,29,5,11,0.455,3,6,0.5,2,5,0.4,0,2,0.0,2,5,7,1,1,0,1,4,12
1,2020-12-01,Duke,,Michigan State,REG,L,1,36,6,14,0.429,5,11,0.455,1,3,0.333,8,8,1.0,2,11,13,0,0,0,1,2,21
2,2020-12-04,Duke,,Bellarmine,REG,W,1,32,9,12,0.75,3,4,0.75,6,8,0.75,0,0,,0,6,6,2,1,1,1,3,24
3,2020-12-08,Duke,,Illinois,REG,L,1,30,8,15,0.533,8,9,0.889,0,6,0.0,3,5,0.6,3,4,7,0,1,1,0,4,19
4,2020-12-16,Duke,@,Notre Dame,REG,W,1,38,8,17,0.471,8,15,0.533,0,2,0.0,2,2,1.0,1,4,5,3,1,2,1,2,18


In [72]:
gamelog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 29 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      12 non-null     object
 1   School    12 non-null     object
 2             12 non-null     object
 3   Opponent  12 non-null     object
 4   Type      12 non-null     object
 5             12 non-null     object
 6   GS        12 non-null     object
 7   MP        12 non-null     object
 8   FG        12 non-null     object
 9   FGA       12 non-null     object
 10  FG%       12 non-null     object
 11  2P        12 non-null     object
 12  2PA       12 non-null     object
 13  2P%       12 non-null     object
 14  3P        12 non-null     object
 15  3PA       12 non-null     object
 16  3P%       12 non-null     object
 17  FT        12 non-null     object
 18  FTA       12 non-null     object
 19  FT%       12 non-null     object
 20  ORB       12 non-null     object
 21  DRB       12 non-n

In [73]:
# Need to convert data type to int



gamelog = gamelog.apply(pd.to_numeric, errors='ignore')
gamelog['Date'] = pd.to_datetime(gamelog['Date'])
gamelog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 29 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      12 non-null     datetime64[ns]
 1   School    12 non-null     object        
 2             12 non-null     object        
 3   Opponent  12 non-null     object        
 4   Type      12 non-null     object        
 5             12 non-null     object        
 6   GS        12 non-null     int64         
 7   MP        12 non-null     int64         
 8   FG        12 non-null     int64         
 9   FGA       12 non-null     int64         
 10  FG%       12 non-null     float64       
 11  2P        12 non-null     int64         
 12  2PA       12 non-null     int64         
 13  2P%       12 non-null     float64       
 14  3P        12 non-null     int64         
 15  3PA       12 non-null     int64         
 16  3P%       12 non-null     float64       
 17  FT        12 non-n

In [74]:
gamelog["FPTS"] = (gamelog.PTS) + (gamelog.TRB*1.2) + (gamelog.AST*1.5) + (gamelog.STL*3) + (gamelog.BLK*3) + (gamelog.TOV*-1)


In [75]:
gamelog.columns

Index(['Date', 'School', ' ', 'Opponent', 'Type', ' ', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '2P', '2PA', '2P%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FPTS'],
      dtype='object')

In [78]:
# change column name
#column_names = {'': 'Set', '.1': 'W_L'}
#gamelog.columns = gamelog.columns.str.strip()
#gamelog = gamelog.rename(columns = column_names)
#gamelog.columns


gamelog.columns.values[2] = "game_loc"
gamelog.columns.values[5] = "W_L"
gamelog.columns

Index(['Date', 'School', 'game_loc', 'Opponent', 'Type', 'W_L', 'GS', 'MP',
       'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA', '3P%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'FPTS'],
      dtype='object')

In [79]:
gamelog['Home_Game'] = [1 if pd.isna(gamelog.game_loc[gm])==True else 0 for gm in range(len(gamelog))]
gamelog.drop(['game_loc'], inplace=True, axis=1)
gamelog.head()

AttributeError: ignored

In [None]:
# binary for game played vs not played
df_embiid['g'] = [1 if pd.isna(df_embiid.G[gm])==False else 0 for gm in range(len(df_embiid))]
df_embiid.head()

Unnamed: 0.1,Unnamed: 0,G,Date,Age,Tm,Opp,outcome,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,FPTS,g,HOME_GAME,back_to_back,win
0,0,1.0,2020-12-23,26-282,PHI,WAS,W (+6),1,34.8,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0,48.8,1,1,0,1
1,1,2.0,2020-12-26,26-285,PHI,NYK,W (+20),1,30.6,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0,47.0,1,0,0,1
2,2,,2020-12-27,26-286,PHI,CLE,L (-24),Did Not Dress,0.0,,,,,,,,,,,,,,,,,,,,,,0,0,1,0
3,3,3.0,2020-12-29,26-288,PHI,TOR,W (+7),1,37.9,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0,61.2,1,1,0,1
4,4,4.0,2020-12-31,26-290,PHI,ORL,W (+24),1,26.7,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0,40.8,1,0,0,1


In [None]:
df_embiid['HOME_GAME'] = [1 if pd.isna(df_embiid.game_loc[gm])==True else 0 for gm in range(len(df_embiid))]
df_embiid.drop(['game_loc'], inplace=True, axis=1)
df_embiid.head()

AttributeError: ignored

In [None]:
# add back-to-back indicator
# df_embiid['back_to_back'] = [1 if ]
df_embiid['Date'] = pd.to_datetime(df_embiid['Date'])

# making column out of the difference in days. 
df_embiid['back_to_back'] = pd.to_numeric(df_embiid.Date.dt.day.diff(), downcast='integer', errors='coerce')


# need to make a column where 1 for back to back and zero for not
df_embiid['back_to_back'] = [1 if (df_embiid.back_to_back[x] == 1.0) == True else 0 for x in range(len(df_embiid))]




df_embiid.head()

Unnamed: 0.1,Unnamed: 0,G,Date,Age,Tm,Opp,outcome,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,FPTS,g,HOME_GAME,back_to_back,win
0,0,1.0,2020-12-23,26-282,PHI,WAS,W (+6),1,34.8,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0,48.8,1,1,0,1
1,1,2.0,2020-12-26,26-285,PHI,NYK,W (+20),1,30.6,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0,47.0,1,0,0,1
2,2,,2020-12-27,26-286,PHI,CLE,L (-24),Did Not Dress,0.0,,,,,,,,,,,,,,,,,,,,,,0,0,1,0
3,3,3.0,2020-12-29,26-288,PHI,TOR,W (+7),1,37.9,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0,61.2,1,1,0,1
4,4,4.0,2020-12-31,26-290,PHI,ORL,W (+24),1,26.7,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0,40.8,1,0,0,1


In [None]:
# getting wins into binary
df_embiid['win'] = [1 if df_embiid['outcome'][gm][:1] == 'W' else 0 for gm in range(len(df_embiid))]
sum(df_embiid.win)

12

In [None]:
# would like to change 'outcome' to just a positive or negative integer


In [None]:
# function to get normalized stats
def stat_per_time(stat, minutes, per=36):
  if minutes == 0:
    return 0
  else:
    val_min = stat/minutes
    return round(val_min * per, 1)

# I want to see the normalized stat for fantasy points. but this can be used for any column. 

df_embiid['fpts_per_36'] = df_embiid.apply(lambda row: stat_per_time(stat = row['FPTS'], minutes = row['MP']), axis=1)


In [None]:
df_embiid

Unnamed: 0.1,Unnamed: 0,G,Date,Age,Tm,Opp,outcome,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,FPTS,g,HOME_GAME,back_to_back,win,fpts_per_36
0,0,1.0,2020-12-23,26-282,PHI,WAS,W (+6),1,34.8,10.0,17.0,0.588,1.0,1.0,1.0,8.0,9.0,0.889,2.0,12.0,14.0,2.0,0.0,1.0,3.0,3.0,29.0,23.6,-2.0,48.8,1,1,0,1,50.5
1,1,2.0,2020-12-26,26-285,PHI,NYK,W (+20),1,30.6,10.0,20.0,0.5,2.0,4.0,0.5,5.0,7.0,0.714,4.0,6.0,10.0,2.0,1.0,1.0,1.0,3.0,27.0,21.7,18.0,47.0,1,0,0,1,55.3
2,2,,2020-12-27,26-286,PHI,CLE,L (-24),Did Not Dress,0.0,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0.0
3,3,3.0,2020-12-29,26-288,PHI,TOR,W (+7),1,37.9,7.0,17.0,0.412,1.0,6.0,0.167,14.0,16.0,0.875,4.0,12.0,16.0,4.0,2.0,2.0,5.0,1.0,29.0,26.3,15.0,61.2,1,1,0,1,58.1
4,4,4.0,2020-12-31,26-290,PHI,ORL,W (+24),1,26.7,7.0,11.0,0.636,1.0,2.0,0.5,6.0,8.0,0.75,0.0,9.0,9.0,2.0,0.0,2.0,0.0,2.0,21.0,20.0,23.0,40.8,1,0,0,1,55.0
5,5,5.0,2021-01-02,26-292,PHI,CHO,W (+15),1,36.7,7.0,11.0,0.636,1.0,2.0,0.5,4.0,6.0,0.667,3.0,11.0,14.0,4.0,1.0,1.0,5.0,2.0,19.0,17.4,19.0,42.8,1,1,0,1,42.0
6,6,6.0,2021-01-04,26-294,PHI,CHO,W (+17),1,24.9,4.0,10.0,0.4,0.0,3.0,0.0,6.0,8.0,0.75,1.0,10.0,11.0,4.0,2.0,3.0,1.0,1.0,14.0,17.0,18.0,47.2,1,1,0,1,68.2
7,7,7.0,2021-01-06,26-296,PHI,WAS,W (+5),1,36.6,11.0,20.0,0.55,3.0,4.0,0.75,13.0,13.0,1.0,0.0,8.0,8.0,5.0,3.0,3.0,5.0,1.0,38.0,34.0,14.0,68.1,1,1,0,1,67.0
8,8,8.0,2021-01-07,26-297,PHI,BRK,L (-13),1,29.9,7.0,14.0,0.5,2.0,2.0,1.0,4.0,5.0,0.8,0.0,12.0,12.0,3.0,0.0,1.0,5.0,5.0,20.0,12.0,-16.0,36.9,1,0,1,0,44.4
9,9,,2021-01-09,26-299,PHI,DEN,L (-12),Inactive,0.0,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0.0


In [None]:
df_embiid.to_csv("joelembiid_gamelog.csv")