# Scraping Stats off of Pro Football Refrence
#### ------------------------------------------------------------
The idea of this program is to go into PFR, and take the team stats, game info, and final score of each game of a given teams season. All you need to do is put in the year and team (3 letter abbrievation), then the program will create a csv of that given teams data. and put it into the correct file

In [6]:
currTeamAbbriev = "sfo" 
currTeamFileName = "49ers" #IE Bills or Bengals
year = 2017

In [7]:
#importing driver for webpage
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#importing parser
from bs4 import BeautifulSoup
#importing pandas for dataframes
import pandas as pd
#delay so java script has time to load
import time
#numpy is always useful
import numpy as np
#used for pandas
from functools import reduce

import time



In [8]:
def getGameInfo(currID, driver):
    """
    Scrapes the game info off of PFR Including vegas lines,
        if the game was played outdoors, wind speeds ect.

    Args:
        currID (string): the HTML ID corrosponding to the targeted table
        driver (API): Selenium webdriver, specifically for chrome 

    Returns:
        dfStat: Pandas Data-Frame of all the scraped information
    """
    headersArr = []
    #reads through page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    #get the passed in id in link form
    table = soup.find(id = currID)
    #strips *table header* elements
    headers = [th.text.strip() for th in table.find_all('th')]

    headersArr.append(headers)
    columnArr = []
    rows = []
    #strips all rows for each team
    for row in table.find_all('tr')[1:]:
        #td holds info, tr is the stucture
        for td in row.find_all('td'):
            columnArr.append(td.text.strip())
    if columnArr:
        rows.append(columnArr)

    dfStat = pd.DataFrame(rows, columns=np.transpose(headers))
    return dfStat


def returnEndScores(currID, driver):
    """
    Scrapes the final score of the game and whether the team of interest was home.

    Args:
        currID (string): the HTML ID corrosponding to the targeted table
        driver (API): Selenium webdriver, specifically for chrome 

    Returns:
        visitor_team (int): final score of the visiting team
        home_team (int): final score of the home team
        homeTeamBool (Bool): if the current team of interest is home. 
    """

    homeTeamBool = False
    headersArr = []
    #reads through page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    #get the passed in id in link form
    table = soup.find(id = currID)

    #strips *table header* elements
    headers = [th.text.strip() for th in table.find_all('th')]
    #appends it to an empty array
    headersArr.append(headers)

    #strips all rows for each team
    headers = []
    
    #Pulling the team name of the home team.
    #Used to determine is current team is home or away. 
    thead = table.find("thead")
    homeTeam = thead.find("th", {"data-stat": "home_team_score"}).text.strip()
    if homeTeam == currTeamAbbriev:
        homeTeamBool = True
        print(f"{currTeamAbbriev} is home")
    else:
        print(f"{currTeamAbbriev} is away")

    #pulling the last entry into the score table, meaning the final score of both the 
    # home and visitors final scores 
    tbody = table.find("tbody")
    visitor_team = tbody.find_all("td", {"data-stat": "vis_team_score"})[-1].text.strip()
    home_team = tbody.find_all("td", {"data-stat": "home_team_score"})[-1].text.strip()

    return visitor_team, home_team, homeTeamBool


def getTeamStats(currID, driver):  
   
    """
    Scrapes the game stats off of PFR for both the team of interest and their opponent
        Includes yards, turnovers, possesion time, ect. 

    Args:
        currID (string): the HTML ID corrosponding to the targeted table
        driver (API): Selenium webdriver, specifically for chrome 

    Returns:
        dfStat: Pandas Data-Frame of all the scraped information
    """
    defensiveHeaders = []
    headersArr = []
    #reads through page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    #get the passed in id in link form
    table = soup.find(id = currID)

    #strips *table header* elements
    headers = [th.text.strip() for th in table.find_all('th')]
    
    headersArr.append(headers)

    #gets the name of each column in the table
    headers = []
    #will hold all of the values for the vistor team 
    visitor_vals = [] 
    #will hold all of the values for the home team 
    home_vals = []


    #For each entry in the table
    for row in table.find_all('tr'):
        #look at the header of the row to get what stat is representing
        stat_name = row.find('th').text.strip()
        #Get the actual information if it exists
        cols = row.find_all('td')

        #If the information exists then append relevant info to corosponding array
        if len(cols) == 2:
            #append what the stat means
            headers.append(stat_name)
            #the visitor value
            visitor_vals.append(cols[0].text.strip())
            #the home team values 
            home_vals.append(cols[1].text.strip())

    #Calls return end scores and holds it there, returns if team of interest was home
    visitorScore, HomeScore, homeTeamBool = returnEndScores("scoring", driver)
    
    # Want all the data in one row so make an opponent version of the headers. 
    defensiveHeaders = headers + [f"Opp_{h}" for h in headers]  # now double headers


    #Team of interest is always printed first 

    #if team of interest was home, 
    if homeTeamBool:
        #combine two rows of data into one 
        home_vals = home_vals + visitor_vals

        #creates the datafram 
        dfStat = pd.DataFrame([home_vals],
                        index=[currTeamAbbriev],
                        columns=defensiveHeaders)
        
        #Stores if team of interest was home
        dfStat["HomeTeam"] = True

        #How much the team of interest Scored
        dfStat[f"{currTeamAbbriev} Score"] = HomeScore

        #how much the opponents scored
        dfStat[f"Opp Score"] = visitorScore
    else:
        #switches stuff around specifically so team of interest is 
        #always first, makes the ML easier. Otherwise same as above
        visitor_vals = visitor_vals + home_vals
        dfStat = pd.DataFrame([visitor_vals],
                        index= [currTeamAbbriev],
                        columns=defensiveHeaders)
        dfStat["HomeTeam"] = False
        dfStat[f"{currTeamAbbriev} Score"] = visitorScore
        dfStat[f"Opp Score"] = HomeScore
    #return fininalized dataframe. 
    return dfStat

In [9]:
lowercaseTeamAbbriev = currTeamAbbriev.lower()
#options used later to simplify arguments
# Set up headless browser which stops any GUI from apearing
options = Options()
options.add_argument('--headless')
#stops Graphics processing unit from rendering the web content
options.add_argument('--disable-gpu')

#sets up remote control interface to instruct the behavior of web browsers
#basically sets up functionallity to scrape web
driver = webdriver.Chrome(options=options)

#url that changes bassed on the inputed information that is easy to change. 
url = f'https://www.pro-football-reference.com/teams/{lowercaseTeamAbbriev}/{year}.htm'
driver.get(url)

headersArr = []

#reads through page
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find(id = "games")
#strips *table header* elements
headers = [th.text.strip() for th in table.find_all('th')]
headersArr.append(headers)
#strips all rows for each team
rows = []
boxscore_links = []
for row in table.find_all('tr')[1:]:
    # finds each boxscore link on a teams season page
    boxscore_cell = row.find('td', {'data-stat': 'boxscore_word'})
    
    if boxscore_cell and boxscore_cell.find('a'):
        text = boxscore_cell.find('a').get_text(strip=True).lower()
        link = boxscore_cell.find('a')['href']
        #makes the full link then appends it to list
        if text == "boxscore":
            full_link = f"https://www.pro-football-reference.com{link}"
            boxscore_links.append(full_link)


#Creats the dataframes that will contain a whole season. 
masterInfoDF = pd.DataFrame()
masterStatsDF = pd.DataFrame()

#for each game in the season
for i in range(len(boxscore_links)):
    #Set up driver API to go into each game in the season
    driver.get(boxscore_links[i])
    time.sleep(2)

    #Calls for each games info
    gameInfoDF = getGameInfo("game_info", driver)
    gameStatsDF = getTeamStats("team_stats", driver)
    
    #Combine these into a multindex to store every game by the week it happend for a team.
    gameInfoDF.columns = pd.MultiIndex.from_product([[f"Game{i}"], gameInfoDF.columns])
    gameStatsDF.columns = pd.MultiIndex.from_product([[f"Game{i}"], gameStatsDF.columns])

    #combining dataframes into the master one
    masterInfoDF = pd.concat([masterInfoDF, gameInfoDF], axis=1)
    masterStatsDF = pd.concat([masterStatsDF, gameStatsDF], axis=1)
    print(f"Week {i} is complete")

#Stacking each masterDF such that it is easier to view. 
masterInfoDF = masterInfoDF.stack()
masterStatsDF = masterStatsDF.stack()
 
#saves this to the correct space in the Stats File. 
masterStatsDF.to_csv(f'Stats/{currTeamFileName}/{currTeamAbbriev}TeamStats{year}.csv', index=True)
masterInfoDF.to_csv(f'Stats/{currTeamFileName}/{currTeamAbbriev}GameInfo{year}.csv', index=True)


sfo is away
Week 0 is complete
sfo is away
Week 1 is complete
sfo is away
Week 2 is complete
sfo is away
Week 3 is complete
sfo is away
Week 4 is complete
sfo is away
Week 5 is complete
sfo is away
Week 6 is complete
sfo is away
Week 7 is complete
sfo is away
Week 8 is complete
sfo is away
Week 9 is complete
sfo is away
Week 10 is complete
sfo is away
Week 11 is complete
sfo is away
Week 12 is complete
sfo is away
Week 13 is complete
sfo is away
Week 14 is complete
sfo is away
Week 15 is complete


  masterInfoDF = masterInfoDF.stack()
  masterStatsDF = masterStatsDF.stack()


In [10]:
driver.quit()