In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
class InjuryAggregator:
    """Collapse player-week rows into one record per injury.

    Rows are fed in order through ``update``; consecutive rows with the same
    name, year and injury are merged into a single record that counts how
    many rows it spanned, how many of them were played, and how often each
    report status appeared. Call ``last_update`` after the final row and
    ``get_df`` to obtain the result.

    Rows only need to support ``row[key]`` lookups (a pandas Series from
    ``iterrows`` or a plain dict both work).
    """

    def __init__(self):
        # Every field stored per injury record. 'P' ... 'S' are counters that
        # are incremented using the row's 'status' value as the key.
        self.keys = ['name', 'team', 'year', 'duration',
                     'injury', 'startWeek', 'endWeek', 'team_caused',
                     'position', 'age', 'yrs', 'stadium', 'played',
                     'P', 'Q', 'D', 'IR', 'O', 'PUP', 'S', 'Wt', 'Ht',
                     'roof', 'start_time', 'surface', 'degrees', 'humidity_pct']

        # Column name -> list of values, one entry per finished injury.
        self.values = defaultdict(list)
        # The injury currently being accumulated; 'injury' == 0 means "none".
        self.cur_injury = dict.fromkeys(self.keys, 0)

        # Tricky - you want to know who they played the game _before_ the injury
        # to see who and where they were injured
        self.last_name = "None"
        self.last_year = "None"
        self._reset_last_game()

    def _reset_last_game(self):
        """Forget the previous game's opponent, venue and conditions."""
        self.last_opponent = "None"
        self.last_stadium = "None"
        self.last_surface = "None"
        self.last_roof = "None"
        self.last_start_time = "None"
        self.last_degrees = "None"
        self.last_humidity = "None"

    def _writeout_cur_injury(self):
        """Append the cached injury to ``values`` and reset the cache to zeros."""
        for key in self.keys:
            self.values[key].append(self.cur_injury[key])
            self.cur_injury[key] = 0

    def _count_row(self, row):
        """Add one row's worth of duration / played / status counts."""
        self.cur_injury['duration'] += 1
        self.cur_injury['played'] += (1 if row['played'] == True else 0)
        # pd.notnull instead of an identity test against np.nan: a NaN that is
        # not the numpy singleton would otherwise be used as a dictionary key.
        if pd.notnull(row['status']):
            self.cur_injury[row['status']] += 1

    def _checkrow_without_held_injury(self, row):
        """Handle the current row if you're not tracking an injury from before"""

        # Nothing to start if this row reports no injury.
        if not pd.notnull(row['injury']):
            return

        self.cur_injury['name'] = row['name']
        self.cur_injury['team'] = row['team']
        self.cur_injury['year'] = row['year']
        self.cur_injury['injury'] = row['injury']
        self.cur_injury['startWeek'] = row['game']
        self.cur_injury['endWeek'] = row['game']
        self.cur_injury['position'] = row['position']
        self.cur_injury['age'] = row['Age']
        self.cur_injury['yrs'] = row['Yrs']
        # These keys must match the capitalised names in self.keys; lowercase
        # 'wt'/'ht' were never written out, leaving the columns at 0.
        self.cur_injury['Wt'] = row['Wt']
        self.cur_injury['Ht'] = row['Ht']

        # Context comes from the previously seen row, not the current one.
        self.cur_injury['team_caused'] = self.last_opponent
        self.cur_injury['stadium'] = self.last_stadium
        self.cur_injury['surface'] = self.last_surface
        self.cur_injury['roof'] = self.last_roof
        self.cur_injury['start_time'] = self.last_start_time
        self.cur_injury['degrees'] = self.last_degrees
        self.cur_injury['humidity_pct'] = self.last_humidity

        self._count_row(row)

    def _checkrow_with_held_injury(self, row):
        """Handle the current row if you're tracking an injury from before"""

        same_injury = (row['name'] == self.cur_injury['name'] and
                       row['year'] == self.cur_injury['year'] and
                       row['injury'] == self.cur_injury['injury'])

        if same_injury:
            # Same injury continues: extend it by one row.
            self.cur_injury['endWeek'] = row['game']
            self._count_row(row)
        else:
            # Different (or no) injury: flush the held one, then treat this
            # row as if nothing were being tracked.
            self._writeout_cur_injury()
            self._checkrow_without_held_injury(row)

    def last_update(self):
        """If holding an injury at the end, output it"""
        if (self.cur_injury['injury'] != 0):
            self._writeout_cur_injury()

    def update(self, row):
        """Given the current row in the dataframe, update the aggregator"""

        # If new player or year, then it's the start of the season
        # and there was no last opponent or stadium
        if (row['year'] != self.last_year) or (row['name'] != self.last_name):
            self._reset_last_game()

        # Handle the current row
        if (self.cur_injury['injury'] != 0):
            self._checkrow_with_held_injury(row)
        else:
            self._checkrow_without_held_injury(row)

        # Remember this row so the next one can use it as "the game before".
        self.last_year = row['year']
        self.last_name = row['name']
        self.last_opponent = row['opponent']
        # Characters -7..-4 of the URL, i.e. the three characters preceding a
        # four-character suffix such as ".htm".
        self.last_stadium = row['game_url'][-7:-4]
        self.last_surface = row['surface']
        self.last_roof = row['roof']
        self.last_start_time = row['start_time']
        self.last_degrees = row['degrees']
        self.last_humidity = row['humidity_pct']

    def get_df(self):
        """Return the collected injuries as a DataFrame in a fixed column order.

        Returns an empty frame with the expected columns when no injury has
        been written out yet.
        """
        cols = ['team', 'year', 'name', 'injury', 'startWeek', 'endWeek',
                'duration', 'position', 'age', 'yrs', 'Wt', 'Ht', 'played',
                'D', 'IR', 'O', 'P', 'PUP', 'Q', 'S',
                'team_caused', 'stadium', 'surface', 'roof',
                'start_time', 'degrees', 'humidity_pct']

        if not self.values:
            return pd.DataFrame(columns=cols)

        df = pd.DataFrame(self.values)
        return df[cols]

def get_injury_level_df(df):
    """Given a dataframe at the player-game level,
    spit out an injury level dataframe.

    The rows are ordered by team, year, name and game and then fed one at a
    time to an InjuryAggregator; the input frame itself is not modified.
    """
    ia = InjuryAggregator()

    # DataFrame.sort() no longer exists in pandas; sort_values() is the
    # replacement and likewise returns a new, sorted frame.
    ordered = df.sort_values(['team', 'year', 'name', 'game'])
    for _, row in ordered.iterrows():
        ia.update(row)

    # Flush an injury that is still open after the final row.
    ia.last_update()

    return ia.get_df()

In [6]:
# Load the three input tables. All paths are relative to the notebook's
# working directory.
# df: the player-game rows that the later cells filter and aggregate.
df = pd.read_csv("raw_data.csv")
# brad_pos: merged in later on name/year/team (presumably player positions - confirm).
brad_pos = pd.read_csv("brad_positions.csv") #Removed Greg Jones in 2012 from JAX (ILB and FB)
# game_df: merged in later on game_url.
game_df = pd.read_csv("game_info.csv") # Combined game_info_1 and game_info_2

In [7]:
# Remove player-seasons with too few non-DNP weeks.
# NOTE(review): the filter at the end of this cell keeps rows with
# count > MIN_WEEKS_PLAYED, i.e. strictly more than 3 weeks, which is not the
# same as "at least 3 games" - confirm which threshold is intended.

MIN_WEEKS_PLAYED = 3

def make_not_DNP(injury, played):
    """Return 1 if the row should count as a non-DNP week, else 0.

    A row counts when no injury is listed at all, or when an injury is listed
    but ``played`` is True. A listed injury with ``played`` missing or False
    yields 0.
    """
    # pd.isnull / pd.notnull recognise every missing-value marker (np.nan, any
    # float NaN, None); an identity test against np.nan only catches the
    # numpy singleton.
    if pd.isnull(injury):
        return 1
    if pd.notnull(played) and played == True:
        return 1
    return 0

# Flag every row, then total the flags per (name, team, year).
df['not_DNP'] = df.apply(lambda r: make_not_DNP(r['injury'], r['played']), axis=1)
games_played = (
    df.groupby(['name','team','year'])['not_DNP']
      .aggregate(np.sum)
      .to_frame('count')
      .reset_index()
)

# Attach the per-season total to each row and keep only the seasons whose
# total exceeds the threshold.
df2 = (
    df.merge(games_played, how='left', on=['name','team','year'])
      .loc[lambda merged: merged['count'] > MIN_WEEKS_PLAYED]
)

Unnamed: 0,team,year,game,date,opponent,name,status,injury,played,player_url,...,G,GS,Wt,Ht,College/Univ,BirthDate,Yrs,AV,not_DNP,count
0,crd,2009,1,13-Sep,SFO,Adrian Wilson,,,,/players/W/WilsAd99.htm,...,16,16,222,3-Jun,North Carolina St.,10/12/79,8,13,1,18
1,crd,2009,2,20-Sep,JAX,Adrian Wilson,,,,/players/W/WilsAd99.htm,...,16,16,222,3-Jun,North Carolina St.,10/12/79,8,13,1,18
2,crd,2009,3,27-Sep,IND,Adrian Wilson,,,,/players/W/WilsAd99.htm,...,16,16,222,3-Jun,North Carolina St.,10/12/79,8,13,1,18
3,crd,2009,4,11-Oct,HOU,Adrian Wilson,,,,/players/W/WilsAd99.htm,...,16,16,222,3-Jun,North Carolina St.,10/12/79,8,13,1,18
4,crd,2009,5,18-Oct,SEA,Adrian Wilson,,,,/players/W/WilsAd99.htm,...,16,16,222,3-Jun,North Carolina St.,10/12/79,8,13,1,18


In [13]:
# Player-seasons dropped by the df2 filter. Uses the same MIN_WEEKS_PLAYED
# threshold so the two selections stay exact complements if it changes.
stripped = games_played[games_played['count'] <= MIN_WEEKS_PLAYED]
stripped.to_csv("stripped.csv")

In [6]:
# Merge in player positions
# brad_pos identifies teams by the upper-case abbreviations on the left; the
# main table uses the lower-case codes on the right (e.g. 'crd').
team_dict ={
    'ATL': 'atl','BUF': 'buf','CAR': 'car','CHI': 'chi',
    'CIN': 'cin','CLE': 'cle','IND': 'clt','ARI': 'crd',
    'DAL': 'dal','DEN': 'den','DET': 'det','GB': 'gnb',
    'HOU': 'htx','JAC': 'jax','KC': 'kan','MIA': 'mia',
    'MIN': 'min','NO': 'nor','NE': 'nwe','NYG': 'nyg',
    'NYJ': 'nyj','TEN': 'oti','PHI': 'phi','PIT': 'pit',
    'OAK': 'rai','STL': 'ram','BAL': 'rav','SD': 'sdg',
    'SEA': 'sea','SF': 'sfo','TB': 'tam','WAS': 'was'
}

# Recode the team column. Values that are already recoded pass through
# unchanged, so re-running this cell is harmless; anything that is neither a
# known abbreviation nor a known code still fails loudly.
brad_pos['team'] = brad_pos['team'].map(lambda abbr: team_dict.get(abbr, abbr))
unmapped_teams = set(brad_pos['team']) - set(team_dict.values())
if unmapped_teams:
    raise KeyError("Unrecognized team values in brad_pos: %r" % (unmapped_teams,))

df3 = pd.merge(df2, brad_pos, how='left', on=['name','year','team'])

In [7]:
# Merge in game data
df4 = pd.merge(df3, game_df, how='left', on="game_url")

# Pull two numbers out of the free-text weather column: the digits at the very
# start of the string, and the digits following the word "humidity ".
# Raw strings keep \d from being read as an (invalid) string escape, and
# expand=False makes extract return a Series rather than a one-column frame.
# NOTE(review): \d* also matches zero digits, so a non-missing weather string
# without leading digits yields '' rather than NaN for degrees.
df4['degrees'] = df4.weather.str.extract(r'^(\d*)', expand=False)
df4['humidity_pct'] = df4.weather.str.extract(r'humidity (\d*)', expand=False)

In [8]:
# Limit number of injury types
MAX_INJURY_TYPES = 32

# Lower-case the labels so spellings that differ only in case are counted
# together, then keep the MAX_INJURY_TYPES most frequent labels.
df4['injury'] = df4['injury'].str.lower()
injuries = df4['injury'].value_counts().head(MAX_INJURY_TYPES).to_frame()
top32 = injuries.index.tolist()

def clean_injury(injury, allowed=None):
    """Map an injury label onto the restricted set of injury types.

    Parameters
    ----------
    injury : str or missing value
        The injury label for one row.
    allowed : container of str, optional
        Labels to keep as-is. Defaults to the module-level ``top32`` list.

    Returns
    -------
    ``np.nan`` when ``injury`` is missing, ``injury`` itself when it is in
    ``allowed``, and ``'other'`` otherwise.
    """
    # pd.isnull catches every missing-value marker, not just the np.nan
    # singleton; otherwise a missing label would be relabelled 'other'.
    if pd.isnull(injury):
        return np.nan
    if allowed is None:
        allowed = top32
    return injury if injury in allowed else 'other'

# Only the injury column is needed, so map over that Series directly instead
# of building a full row object for every record with apply(axis=1).
df4['injury'] = df4['injury'].map(clean_injury)

In [12]:
# Save as week level dataset
# (to_csv is called with its defaults, so the frame's index is written as the
# first, unnamed column.)

df4.to_csv("raw_data_clean.csv")

# Convert to injury level dataset
# get_injury_level_df sorts the rows and merges consecutive rows that share
# name, year and injury into a single record.

il_df = get_injury_level_df(df4)
il_df.to_csv("injuries_unique_jkg_clean.csv")

In [13]:
# dx = df4.reset_index(drop=True)
# df_gpby = dx.groupby(list(dx.columns))
# idx = [x[0] for x in df_gpby.groups.values() if len(x) > 1]
# dx.loc[idx,:].sort()

# pd.DataFrame(pd.Series(df3.game_url).unique()).to_csv("game_urls.csv")

# sum(injuries.ix[:,0])
# df4.shape