# scraper for getting football game data
- Sadly, and surprisingly, there aren't any free APIs or public dataset files for this.
- All I'm interested in are end of game stats (win/loss, score, date).

@Author: [Jeff Lockhart](http://www-personal.umich.edu/~jwlock/)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time

Example URL: 
https://www.sports-reference.com/cfb/schools/michigan/2017-schedule.html

In [None]:
def get_table(page):
    """Extract the games listed in a season schedule page.

    Finds the element with id ``schedule`` in the HTML, reads every ``<td>``
    of every ``<tr>`` into a dict keyed by the cell's ``data-stat``
    attribute, and assembles those dicts into a DataFrame.

    Args:
        page: HTML markup of the page (text or bytes), passed straight to
            ``BeautifulSoup``.

    Returns:
        A ``pandas.DataFrame`` with one row per table row that contained
        ``<td>`` cells, plus a ``date`` column parsed from ``date_game``.
        The ``broadcaster``, ``date_game`` and ``day_name`` columns are
        removed when present. An empty DataFrame is returned when the page
        has no ``schedule`` element or no row carries a ``date_game`` cell.
    """
    #parse page with bs4
    soup = BeautifulSoup(page, 'html.parser')
    #select just the table of interest
    table = soup.find(id='schedule')
    if table is None:
        #find() returns None when nothing matches; report "no games"
        #instead of failing on table.find_all below
        return pd.DataFrame()

    #each row is a game; each cell is labelled by its data-stat attribute
    data = [
        {d['data-stat']: d.text for d in r.find_all('td')}
        for r in table.find_all('tr')
    ]
    #convert our findings to a dataframe
    df = pd.DataFrame(data)
    #rows without any <td> (e.g. header rows) became all-null; drop them
    df = df.dropna(axis=0, how='all')
    if 'date_game' not in df.columns:
        #nothing to build the date column from, so there are no games
        return pd.DataFrame()
    #get a real date object
    df['date'] = pd.to_datetime(df['date_game'])
    #drop these mostly empty and useless columns if they exist
    df = df.drop(columns=['broadcaster', 'date_game', 'day_name'],
                 errors='ignore')

    return df

In [None]:
#Download every season's schedule page and stack the parsed games into df.
#Per-season frames are collected in a list and concatenated once at the end,
#which avoids re-copying the growing frame on every iteration.
frames = []

years = range(1880, 2019)
base_url = 'https://www.sports-reference.com/cfb/schools/michigan/'
end_url = '-schedule.html'

for y in years:
    #get the page for this year
    url = base_url+str(y)+end_url
    try:
        #a timeout keeps one stalled connection from hanging the whole crawl
        r = requests.get(url, timeout=30)
    except requests.RequestException as err:
        #network trouble for one year should not discard the years already
        #collected; report it and move on.
        print('Error with', y, err)
    else:
        #if the page exists
        if r.status_code == 200:
            print('Processing', y)
            tmp = get_table(r.content)
            if not tmp.empty:
                frames.append(tmp)
        else:
            #some years don't have data. Ignore them and move on.
            print('Error with', y)

    #wait to be a polite lil spider
    time.sleep(2)

#pd.concat raises on an empty list, so fall back to an empty frame
df = pd.concat(frames) if frames else pd.DataFrame()

df.shape

In [None]:
#order the games chronologically, then peek at the first few rows.
df = df.sort_values('date')
df.head()

In [None]:
#write the games to a tab-separated file (no index column) for later reuse.
out_path = '../data/UM_football.tsv'
df.to_csv(out_path, sep='\t', index=False)

In [None]:
#fun global stats: how many games ended with each value of game_result
df['game_result'].value_counts()