# Scraping Stats off of Pro Football Reference
#### ------------------------------------------------------------
The idea of this program is to go into Pro Football Reference (PFR) and take the team stats, game info, and final score of each game of a given team's season. All you need to do is put in the year and the team (3-letter abbreviation); the program will then create CSV files of that team's data and put them into the correct folder.

In [1]:
# Team code: lower-cased into the PFR season-page URL, compared against the
# home-team header of each box score, and used as the row label / CSV name prefix.
# NOTE(review): assumes PFR uses the same code in its URLs and box score headers — confirm per team.
currTeamAbbriev = "BUF" 
# Folder name under Stats/ that the CSVs are written into
currTeamFileName = "Bills" # e.g. Bills or Bengals
# Season to scrape; goes into the season-page URL and the CSV file names
year = 2020

In [2]:
#importing driver for webpage
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#importing parser
from bs4 import BeautifulSoup
#importing pandas for dataframes
import pandas as pd
#delay so java script has time to load
import time
#numpy is always useful
import numpy as np
#used for pandas
from functools import reduce

In [None]:
def getGameInfo(currID, driver):
    """
    Scrapes the game info off of PFR Including vegas lines,
        if the game was played outdoors, wind speeds ect.

    Every table row after the first is read as one label (its `th`) paired
    with one value (its first `td`). Pairing per row keeps labels and values
    aligned even when a row is missing one of the two cells; such rows are
    skipped instead of shifting every later column.

    Args:
        currID (string): the HTML ID corresponding to the targeted table
        driver (API): object exposing the rendered page as `page_source`
            (the Selenium Chrome webdriver in this notebook)

    Returns:
        dfStat: single-row Pandas Data-Frame whose columns are the row labels
            and whose values are the scraped strings; it has no rows when the
            table contains no label/value pairs

    Raises:
        ValueError: if the page has no element with the given ID
    """
    #reads through page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find(id=currID)
    if table is None:
        # soup.find returns None on a miss; fail with a readable message
        # instead of an AttributeError further down
        raise ValueError(f"No element with id '{currID}' found on the current page")

    labels = []
    values = []
    #the first tr is skipped, exactly as before
    for row in table.find_all('tr')[1:]:
        labelCell = row.find('th')
        valueCell = row.find('td')
        if labelCell is None or valueCell is None:
            continue
        labels.append(labelCell.text.strip())
        values.append(valueCell.text.strip())

    #one row per game; no rows at all when nothing was scraped
    rows = [values] if values else []
    dfStat = pd.DataFrame(rows, columns=labels)
    return dfStat


def returnEndScores(currID, driver):
    """
    Reads the end score of both teams off of the PFR scoring table and works out
        whether the current team of interest (`currTeamAbbriev`) is the home team.

    Prints "<team> is home" or "<team> is away" as a progress message.

    Args:
        currID (string): the HTML ID corresponding to the targeted table
        driver (API): object exposing the rendered page as `page_source`
            (the Selenium Chrome webdriver in this notebook)

    Returns:
        visitor_team (string): text of the last `vis_team_score` cell in the table body
        home_team (string): text of the last `home_team_score` cell in the table body
        homeTeamBool (bool): True when the `home_team_score` header matches
            `currTeamAbbriev` (ignoring case)

    Raises:
        ValueError: if the page has no element with the given ID
    """
    #reads through page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find(id=currID)
    if table is None:
        raise ValueError(f"No element with id '{currID}' found on the current page")

    #the header of the home score column carries the home team's abbreviation
    thead = table.find("thead")
    homeTeam = thead.find("th", {"data-stat": "home_team_score"}).text.strip()
    #case-insensitive: the rest of the script lower-cases the abbreviation for
    #the URL, so "buf" and "BUF" must both be recognised here
    #NOTE(review): assumes the header uses the same code as currTeamAbbriev - confirm per team
    homeTeamBool = homeTeam.upper() == currTeamAbbriev.upper()
    if homeTeamBool:
        print(f"{currTeamAbbriev} is home")
    else:
        print(f"{currTeamAbbriev} is away")

    #the last cell of each score column is taken as that team's end score
    tbody = table.find("tbody")
    visitor_team = tbody.find_all("td", {"data-stat": "vis_team_score"})[-1].text.strip()
    home_team = tbody.find_all("td", {"data-stat": "home_team_score"})[-1].text.strip()

    return visitor_team, home_team, homeTeamBool


def getTeamStats(currID, driver):
    """
    Scrapes the team stats table of a PFR box score into one row seen from the
        side of the current team of interest (`currTeamAbbriev`).

    Only rows made of a `th` (stat name) and exactly two `td` cells are used;
    the first `td` is treated as the visitor's value and the second as the
    home team's. Any other row (e.g. the header row) is skipped. The end
    scores and the home/away flag come from `returnEndScores("scoring", driver)`.

    Args:
        currID (string): the HTML ID corresponding to the targeted table
        driver (API): object exposing the rendered page as `page_source`
            (the Selenium Chrome webdriver in this notebook)

    Returns:
        dfStat: single-row Pandas Data-Frame indexed by `currTeamAbbriev`.
            Columns are every stat name (own team), then every stat name
            prefixed with "Opp_" (opponent), then "HomeTeam",
            "<currTeamAbbriev> Score" and "Opp Score". All scraped values
            are kept as strings.

    Raises:
        ValueError: if the page has no element with the given ID
    """
    #reads through page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find(id=currID)
    if table is None:
        raise ValueError(f"No element with id '{currID}' found on the current page")

    headers = []
    visitor_vals = []
    home_vals = []
    for row in table.find_all('tr'):
        nameCell = row.find('th')
        cols = row.find_all('td')
        #check the row shape before touching the th, so a row without one is
        #skipped instead of raising on None
        if nameCell is None or len(cols) != 2:
            continue
        headers.append(nameCell.text.strip())
        visitor_vals.append(cols[0].text.strip())
        home_vals.append(cols[1].text.strip())

    visitorScore, HomeScore, homeTeamBool = returnEndScores("scoring", driver)

    #own stats first, then the same stats for the opponent
    defensiveHeaders = headers + [f"Opp_{h}" for h in headers]

    if homeTeamBool:
        teamVals, oppVals = home_vals, visitor_vals
        teamScore, oppScore = HomeScore, visitorScore
    else:
        teamVals, oppVals = visitor_vals, home_vals
        teamScore, oppScore = visitorScore, HomeScore

    dfStat = pd.DataFrame([teamVals + oppVals],
                    index=[currTeamAbbriev],
                    columns=defensiveHeaders)
    dfStat["HomeTeam"] = homeTeamBool
    dfStat[f"{currTeamAbbriev} Score"] = teamScore
    dfStat["Opp Score"] = oppScore
    return dfStat

In [None]:
lowercaseTeamAbbriev = currTeamAbbriev.lower()
# Set up headless browser which stops any GUI from appearing
options = Options()
options.add_argument('--headless')
#stops Graphics processing unit from rendering the web content
options.add_argument('--disable-gpu')

#sets up the browser session that is used to load and scrape every page
driver = webdriver.Chrome(options=options)

#per-game frames; joined once after the loop instead of re-concatenating
#the growing master frames on every game
gameInfoFrames = []
gameStatsFrames = []

#try/finally so the Chrome process is always shut down, even when a page
#fails to parse half way through the season
try:
    #season page of the chosen team and year
    url = f'https://www.pro-football-reference.com/teams/{lowercaseTeamAbbriev}/{year}.htm'
    driver.get(url)

    #reads through page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find(id = "games")
    if table is None:
        raise ValueError(f"No 'games' table found at {url}")

    #finds each boxscore link on the team's season page; rows without a
    #link are skipped
    boxscore_links = []
    for row in table.find_all('tr')[1:]:
        boxscore_cell = row.find('td', {'data-stat': 'boxscore_word'})
        anchor = boxscore_cell.find('a') if boxscore_cell is not None else None
        if anchor is not None:
            #the href is site-relative, so prefix the host
            boxscore_links.append(f"https://www.pro-football-reference.com{anchor['href']}")

    #NOTE(review): `time` is imported for a page-load delay but no delay is
    #applied between requests - confirm whether one is needed
    for i, boxscore_link in enumerate(boxscore_links):
        driver.get(boxscore_link)

        gameInfoDF = getGameInfo("game_info", driver)
        gameStatsDF = getTeamStats("team_stats", driver)

        #tag every column with its game so the games can sit side by side
        gameInfoDF.columns = pd.MultiIndex.from_product([[f"Game{i}"], gameInfoDF.columns])
        gameStatsDF.columns = pd.MultiIndex.from_product([[f"Game{i}"], gameStatsDF.columns])

        gameInfoFrames.append(gameInfoDF)
        gameStatsFrames.append(gameStatsDF)
        print(f"Week {i} is complete")
finally:
    driver.quit()

#pd.concat raises on an empty list, so fall back to an empty frame
masterInfoDF = pd.concat(gameInfoFrames, axis=1) if gameInfoFrames else pd.DataFrame()
masterStatsDF = pd.concat(gameStatsFrames, axis=1) if gameStatsFrames else pd.DataFrame()

#moves the inner column level (the stat / info names) into the row index,
#leaving one column per game
masterInfoDF = masterInfoDF.stack()
masterStatsDF = masterStatsDF.stack()




['Bills (deferred)', 'outdoors', 'astroturf', '3:10', '67 degrees, relative humidity 93%, wind 15 mph', 'Buffalo Bills -6.5', '39.5 (over)']
BUF is home
Week 0 is complete
['Dolphins (deferred)', 'outdoors', 'grass', '3:40', '11,075', '90 degrees, relative humidity 68%, wind 5 mph', 'Buffalo Bills -5.5', '42.5 (over)']
BUF is away
Week 1 is complete
['Rams (deferred)', 'outdoors', 'astroturf', '3:04', '79 degrees, relative humidity 52%, wind 16 mph', 'Buffalo Bills -1.5', '46.5 (over)']
BUF is home
Week 2 is complete
['Raiders (deferred)', 'dome', 'grass', '3:12', 'Buffalo Bills -3.0', '53.0 (push)']
BUF is away
Week 3 is complete
['Titans (deferred)', 'outdoors', 'grass', '2:55', '8,403', '73 degrees, relative humidity 32%, wind 3 mph', 'Buffalo Bills -3.0', '52.0 (over)']
BUF is away
Week 4 is complete
['Chiefs (deferred)', 'outdoors', 'astroturf', '2:53', '51 degrees, relative humidity 92%, wind 6 mph', 'Kansas City Chiefs -5.5', '55.0 (under)']
BUF is home
Week 5 is complete
['Jets

  masterInfoDF = masterInfoDF.stack()
  masterStatsDF = masterStatsDF.stack()


In [5]:
from pathlib import Path

#to_csv does not create missing folders, so make sure Stats/<team>/ exists
#before writing the two season files into it
outputDir = Path("Stats") / currTeamFileName
outputDir.mkdir(parents=True, exist_ok=True)

masterStatsDF.to_csv(outputDir / f'{currTeamAbbriev}TeamStats{year}.csv', index=True)
masterInfoDF.to_csv(outputDir / f'{currTeamAbbriev}GameInfo{year}.csv', index=True)