In [1]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import csv
from random import randint
from time import sleep
import lxml
import re

def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup tree."""
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails; the scrape makes
        # hundreds of requests and should not leak a socket per page.
        response.close()
    # lxml is imported by this notebook, so it is the parser BeautifulSoup
    # would auto-select anyway; naming it keeps parsing deterministic across
    # machines and avoids the "no parser specified" warning.
    return BeautifulSoup(html, "lxml")


def _injury_type(cell):
    """
    Return the injury text stored in a status cell's tooltip, or None.

    The text is the second piece of the span's ``tip`` attribute when split
    on ": " (presumably "<status>: <injury>" -- confirm against the page).
    None is returned when the cell has no span, the span has no ``tip``, or
    the tip contains no ": " separator.
    """
    span = cell.find("span")
    if span is None:
        return None
    tip = span.attrs.get('tip')
    if tip is None:
        return None
    pieces = str.split(tip.encode('utf-8'), ": ")
    return pieces[1] if len(pieces) > 1 else None


def getInjuries(team, year):
    """
    Scrape one team-season of injury reports and join it to the team roster.

    Parameters
    ----------
    team : str
        Team code as used in pro-football-reference.com URLs (e.g. "crd").
    year : str
        Season, as a string; it is concatenated directly into the URL.

    Returns
    -------
    pandas.DataFrame
        One row per player per game column of the injury table, with the
        columns team, year, game, date, opponent, name, status, injury,
        played, player_url and game_url, left-joined on ``name`` to the
        columns of the roster table for the same team and year.

    Raises
    ------
    ValueError
        If the injury page has no table, or the roster page has fewer than
        two tables.
    """
    base_url = "http://www.pro-football-reference.com/teams/" + team + "/" + year

    # ---- Injury report page -------------------------------------------
    injuries_url = base_url + "_injuries.htm"
    table = _fetch_soup(injuries_url).find('table')
    if table is None:
        raise ValueError("No injury table found at " + injuries_url)

    # Header cells: only headings that link to a game are kept. The game
    # number is the heading's position among *all* <th> cells, which lines
    # up with the body's column index only if the non-game headings come
    # first -- NOTE(review): confirm against the page layout.
    opponent = []
    header_game = []
    date = []
    game_url = []
    for num, heading in enumerate(table.findAll("th")):
        links = heading.find_all("a", href=True)
        if not links:
            continue
        game_url.append(links[0].get('href'))

        head = heading.get_text()
        header_game.append(num)
        date.append(head[0:5])
        opponent.append(head[9:12])

    headers = pd.DataFrame({'opponent': opponent, 'game': header_game,
                            'date': date, 'game_url': game_url})

    # Body cells: the first <td> of a row is the player, every following
    # <td> is that player's status for one game column.
    names = []
    player_url = []
    status = []
    played = []
    injury = []
    body_game = []

    for row in table.findAll("tr"):
        cells = row.findAll("td")
        if len(cells) == 0:
            continue

        # Row-level values are looked up once rather than once per game.
        player_name = cells[0].get_text()
        anchor = cells[0].a
        player_link = anchor.get('href') if anchor is not None else None

        for i in range(1, len(cells)):
            cell = cells[i]
            names.append(player_name)
            player_url.append(player_link)
            body_game.append(i)
            status.append(cell.get_text())

            # NOTE(review): as written, `played` is True when the cell does
            # NOT carry the 'played' CSS class -- confirm this polarity is
            # the intended one.
            css_classes = cell.attrs.get('class')
            if css_classes is not None:
                played.append('played' not in css_classes)
            else:
                played.append(None)

            injury.append(_injury_type(cell))

    body = pd.DataFrame({'name': names, 'status': status, 'injury': injury,
                         'played': played, 'game': body_game, 'year': year,
                         'team': team, 'player_url': player_url})

    # Attach date/opponent/game_url to every player-game row.
    both = pd.merge(headers, body, on='game')
    cols = ['team', 'year', 'game', 'date', 'opponent', 'name', 'status',
            'injury', 'played', 'player_url', 'game_url']
    sort_keys = ['year', 'team', 'name', 'game']
    both = both[cols]
    if hasattr(both, 'sort_values'):
        # DataFrame.sort was deprecated and later removed from pandas.
        both = both.sort_values(by=sort_keys)
    else:
        # Older pandas releases that predate sort_values.
        both = both.sort(columns=sort_keys)

    # ---- Roster page --------------------------------------------------
    roster_url = base_url + "_roster.htm"
    tables = _fetch_soup(roster_url).find_all('table')
    if len(tables) < 2:
        raise ValueError("No roster table found at " + roster_url)
    # The roster is taken from the second table on the page.
    roster = pd.read_html(str(tables[1]))[0]

    # Rename the second column to 'name' so it can serve as the join key;
    # work on a copy of the labels instead of mutating the index in place.
    roster_columns = list(roster.columns)
    roster_columns[1] = 'name'
    roster.columns = roster_columns

    # Drop the '*' and '+' markers appended to some roster names so they
    # match the names scraped from the injury table.
    roster['name'] = [re.sub('[*]|[+]', '', player) for player in roster['name']]

    # Left join: keep every injury row even without a roster match.
    final = pd.merge(both, roster, how="left", on="name")
    return final

In [5]:
## Get all pages to scrape
# Team codes as they appear in pro-football-reference.com URLs.
teams = ["crd", "atl", "rav", "buf","car","chi","cin","cle","dal","den","det","gnb","htx","clt","jax","kan","mia","min",
           "nwe","nor","nyg","nyj","rai","phi","pit","sdg","sfo","sea","ram","tam","oti","was"]
years = ['2009','2010','2011','2012','2013','2014']
# Smaller batches, presumably for splitting the scrape into two runs:
# years1 = ['2009','2010','2011']
# years2 = ['2012','2013','2014']
team_years = [(x,y) for x in teams for y in years]

## Scrape each of the pages
# Collect one frame per team-season and concatenate once at the end;
# growing a DataFrame inside the loop re-copies all earlier rows on every
# iteration.
frames = []
for team, year in team_years:
    # Progress log; this form prints "crd 2012" on Python 2 and Python 3.
    print("%s %s" % (team, year))
    # Politeness delay between requests: 2-6 seconds, the same range as the
    # two back-to-back 1-3 second sleeps it replaces.
    sleep(randint(2, 6))
    frames.append(getInjuries(team, year))
output = pd.concat(frames)

# Having some problems with drafted column, so I've taken that out
output = output.drop('Drafted (tm/rnd/yr)', axis=1)

output.to_csv("output.csv", index = False)

crd 2012
crd 2013
crd 2014
atl 2012
atl 2013
atl 2014
rav 2012
rav 2013
rav 2014
buf 2012
buf 2013
buf 2014
car 2012
car 2013
car 2014
chi 2012
chi 2013
chi 2014
cin 2012
cin 2013
cin 2014
cle 2012
cle 2013
cle 2014
dal 2012
dal 2013
dal 2014
den 2012
den 2013
den 2014
det 2012
det 2013
det 2014
gnb 2012
gnb 2013
gnb 2014
htx 2012
htx 2013
htx 2014
clt 2012
clt 2013
clt 2014
jax 2012
jax 2013
jax 2014
kan 2012
kan 2013
kan 2014
mia 2012
mia 2013
mia 2014
min 2012
min 2013
min 2014
nwe 2012
nwe 2013
nwe 2014
nor 2012
nor 2013
nor 2014
nyg 2012
nyg 2013
nyg 2014
nyj 2012
nyj 2013
nyj 2014
rai 2012
rai 2013
rai 2014
phi 2012
phi 2013
phi 2014
pit 2012
pit 2013
pit 2014
sdg 2012
sdg 2013
sdg 2014
sfo 2012
sfo 2013
sfo 2014
sea 2012
sea 2013
sea 2014
ram 2012
ram 2013
ram 2014
tam 2012
tam 2013
tam 2014
oti 2012
oti 2013
oti 2014
was 2012
was 2013
was 2014
