<a href="https://colab.research.google.com/github/Sethicus-Millicus/basketball_data/blob/web-scraping-edits/pull_college_gamelogs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web scraping Tutorial

* This notebook is a quick reference on how to use beautifulsoup4 to download data from websites, specifically college basketball game logs from sports-reference.com.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from urllib.request import urlopen
import os

In [2]:
url = 'https://www.sports-reference.com/cbb/players/matthew-hurt-1/gamelog/2021'

# Download the page exactly once. The original cell fetched it twice (an unused
# requests.get plus urlopen) and never closed the urlopen response.
with urlopen(url) as response:
    html = response.read()

# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one).
soup = BeautifulSoup(html, 'html.parser')

table_head = soup.find('thead')
table_body = soup.tbody
if table_head is None or table_body is None:
    # Fail with a readable message instead of an AttributeError further down.
    raise ValueError(f'No game log table (<thead>/<tbody>) found at {url}')

# Column headers: the text of every <th> in the first <thead>.
headers = [th.get_text() for th in table_head.find_all('th')]

# Remove the extra 'Rk' header: the data rows below are built from <td> cells only.
# NOTE(review): presumably the rank cell is a <th> in each body row, so it has no
# matching <td> -- confirm against the page markup.
headers.remove('Rk')

# table_body is used to grab every <tr> in the first <tbody>.
rows = table_body.find_all('tr')

# One list of cell texts per row. Rows without any <td> (for example header rows
# repeated inside the body) are skipped so they do not become all-None rows.
player_stats = []
for row in rows:
    cells = [td.get_text() for td in row.find_all('td')]
    if cells:
        player_stats.append(cells)

# gamelog is the DataFrame putting it all together (every value is still a string here).
gamelog = pd.DataFrame(player_stats, columns=headers)



In [3]:
# NOTE: this cell is not idempotent. It renames columns by position and then drops
# one of them, so re-run the scraping cell before running this cell a second time.

# Rename the location (position 2) and result (position 5) columns.
# A new column list is assigned instead of writing into `gamelog.columns.values`:
# mutating an Index's backing array is not a supported way to rename and can fail
# (read-only array) or leave pandas' cached column lookups stale.
column_names = list(gamelog.columns)
column_names[2] = "game_loc"
column_names[5] = "W_L"
gamelog.columns = column_names


def _to_numeric_if_possible(column):
    """Return `column` converted by pd.to_numeric, or unchanged if it cannot be parsed.

    Replaces the deprecated `pd.to_numeric(..., errors='ignore')`.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


gamelog = gamelog.apply(_to_numeric_if_possible)
gamelog['Date'] = pd.to_datetime(gamelog['Date'])

# Fantasy points: weighted sum of the box-score stats (turnovers count negatively).
gamelog["FPTS"] = (gamelog.PTS) + (gamelog.TRB*1.2) + (gamelog.AST*1.5) + (gamelog.STL*3) + (gamelog.BLK*3) + (gamelog.TOV*-1)

# replace blank spaces with nan
gamelog = gamelog.replace(r'^\s*$', np.nan, regex=True)

# Home = 1 when the location cell was blank (now NaN), otherwise 0.
gamelog['Home'] = gamelog['game_loc'].isna().astype(int)
gamelog = gamelog.drop(columns=['game_loc'])

# W = 1 when the result is exactly 'W', otherwise 0.
gamelog['W'] = (gamelog['W_L'] == 'W').astype(int)


In [4]:
# function to get normalized stats
def stat_per_time(stat, minutes, per=36):
    """Scale a stat to a per-`per`-minutes rate.

    Args:
        stat: Raw stat total for the game.
        minutes: Minutes played in that game.
        per: Number of minutes to normalize to (default 36).

    Returns:
        0 when `minutes` is 0; otherwise `stat / minutes * per`
        rounded to one decimal place.
    """
    # Guard first: no minutes played means there is no rate to compute.
    if minutes == 0:
        return 0
    return round((stat / minutes) * per, 1)

# Per-36-minute fantasy points for each game. The same helper can normalize any
# other stat column.
def _fpts_per_36(game):
    """Scale one game's FPTS by its minutes played (MP) using stat_per_time."""
    return stat_per_time(stat=game['FPTS'], minutes=game['MP'])


gamelog['fpts_per_36'] = gamelog.apply(_fpts_per_36, axis=1)

In [5]:
# Preview the first rows instead of rendering the whole frame.
gamelog.head(10)

Unnamed: 0,Date,School,Opponent,Type,W_L,GS,MP,FG,FGA,FG%,...,AST,STL,BLK,TOV,PF,PTS,FPTS,Home,W,fpts_per_36
0,2020-11-28,Duke,Coppin State,REG,W,1,29,5,11,0.455,...,1,1,0,1,4,12,23.9,1,1,29.7
1,2020-12-01,Duke,Michigan State,REG,L,1,36,6,14,0.429,...,0,0,0,1,2,21,35.6,1,0,35.6
2,2020-12-04,Duke,Bellarmine,REG,W,1,32,9,12,0.75,...,2,1,1,1,3,24,39.2,1,1,44.1
3,2020-12-08,Duke,Illinois,REG,L,1,30,8,15,0.533,...,0,1,1,0,4,19,33.4,1,0,40.1
4,2020-12-16,Duke,Notre Dame,REG,W,1,38,8,17,0.471,...,3,1,2,1,2,18,36.5,0,1,34.6
5,2021-01-06,Duke,Boston College,REG,W,1,32,7,15,0.467,...,1,0,1,2,4,17,32.7,1,1,36.8
6,2021-01-09,Duke,Wake Forest,REG,W,1,31,10,15,0.667,...,2,3,0,3,4,26,42.2,1,1,49.0
7,2021-01-12,Duke,Virginia Tech,REG,L,1,38,8,16,0.5,...,0,1,2,2,3,20,40.2,0,0,38.1
8,2021-01-19,Duke,Pitt,REG,L,1,30,5,12,0.417,...,4,0,1,0,3,13,29.2,0,0,35.0
9,2021-01-23,Duke,Louisville,REG,L,1,35,9,13,0.692,...,0,0,0,3,5,24,30.6,0,0,31.5


In [6]:
# Create the output folder first: to_csv does not create missing directories,
# so on a fresh kernel/Colab session the write would otherwise fail.
os.makedirs("cbb_gamelogs", exist_ok=True)
gamelog.to_csv("cbb_gamelogs/matthew_hurt_gamelog.csv")

In [7]:
# The file was written with its row index as the first column. Read that column
# back as the index (instead of an "Unnamed: 0" data column) and parse Date as
# datetime rather than leaving it as strings.
df = pd.read_csv("cbb_gamelogs/matthew_hurt_gamelog.csv", index_col=0, parse_dates=["Date"])
