In [1]:
import numpy as np
import pandas as pd
import python_scripts.football_utility_functions as file_utils
import python_scripts.utils.data_parsing_defense as parsing_utils_defense


def _flag_is_set(play, column):
    """Return a truthy value when ``play[column]`` is neither 0 nor missing (NaN/None)."""
    value = play[column]
    return value != 0 and not pd.isna(value)


def read_defensive_play_by_play_data(year):
    """Build per-player defensive statistics from one season of play-by-play data.

    Reads ``../data_gitignore/reg_pbp_<year>.csv`` through ``file_utils.read_csv``
    and walks over it play by play. Plays whose ``timeout``, ``penalty`` or
    ``replay_or_challenge`` flag is set are ignored. For every remaining play,
    each stat flag that is set triggers the matching ``parsing_utils_defense``
    handler; the rows produced for the play are then sanitized and merged into
    the running statistics via ``parsing_utils_defense.add_row_to_stats``.

    Args:
        year: Season to load; only used (via ``str(year)``) to build the file name.

    Returns:
        A DataFrame initialised with ``parsing_utils_defense.defense_columns``
        plus a ``total_tackles`` column equal to ``solo_tackles + tackles_assist``.
    """
    # https://github.com/ryurko/nflscrapR-data/tree/master/legacy_data
    play_by_play_data = file_utils.read_csv('../data_gitignore/reg_pbp_' + str(year) + '.csv')
    player_statistics = pd.DataFrame(columns=parsing_utils_defense.defense_columns)

    # Flags that cause a play to be skipped entirely.
    skip_flag_columns = ('timeout', 'penalty', 'replay_or_challenge')

    # (flag column, handler) pairs, applied in exactly this order. Each handler is
    # called as handler(play, player_statistics, current_row) and returns the
    # updated current_row.
    stat_handlers = (
        ('interception', parsing_utils_defense.handle_interceptions),
        ('tackled_for_loss', parsing_utils_defense.handle_tackles_for_loss),  # TODO too few
        ('solo_tackle', parsing_utils_defense.handle_solo_tackles),
        ('assist_tackle', parsing_utils_defense.handle_assisted_tackles),
        ('fumble_forced', parsing_utils_defense.handle_forced_fumbles),
        ('sack', parsing_utils_defense.handle_sacks),
        ('qb_hit', parsing_utils_defense.handle_qb_hits),
        ('incomplete_pass', parsing_utils_defense.handle_incomplete_passes),
    )

    for _, play in play_by_play_data.iterrows():
        # Skip plays that involve a timeout, a penalty or a replay/challenge.
        if any(_flag_is_set(play, column) for column in skip_flag_columns):
            continue

        # Several flags can be set on the same play, so every matching handler
        # contributes to the same per-play frame.
        current_row = pd.DataFrame(columns=parsing_utils_defense.defense_columns)
        for flag_column, handler in stat_handlers:
            if _flag_is_set(play, flag_column):
                current_row = handler(play, player_statistics, current_row)

        # Fumble recovery (fumble happened and was lost by the offensive team).
        # This handler does not take the running statistics as an argument.
        if _flag_is_set(play, 'fumble') and _flag_is_set(play, 'fumble_lost'):
            current_row = parsing_utils_defense.handle_fumble_recoveries(play, current_row)

        # Clean statistics for the current play and add them to the overall player statistics.
        for _, stat in current_row.iterrows():
            stat = parsing_utils_defense.sanitize_stat_row(stat)
            player_statistics = parsing_utils_defense.add_row_to_stats(player_statistics, stat)

    player_statistics['total_tackles'] = player_statistics['solo_tackles'] + player_statistics['tackles_assist']
    return player_statistics
    #%%
    # MAYBE: return yards on turnovers
    # LATER: total tackles (note: a `total_tackles` column is already derived just before the return above)

    # TODO: get rid of duplicate solo/assisted tackles, etc.

In [2]:
def save_defensive_data_to_files():
    """Compute the defensive statistics of every season and write one CSV per season.

    Covers the seasons 2009 through 2019 (the upper bound 2020 is exclusive).
    Each season is parsed with ``read_defensive_play_by_play_data`` and stored as
    ``../data_gitignore/yearly_def_stats/def_stats_<year>.csv`` using the default
    ``DataFrame.to_csv`` settings (so the index is written as the first column).
    """
    first_season, season_after_last = 2009, 2020
    for season in range(first_season, season_after_last):
        season_stats = read_defensive_play_by_play_data(season)
        target_path = '../data_gitignore/yearly_def_stats/def_stats_' + str(season) + '.csv'
        season_stats.to_csv(target_path)

In [3]:
def read_defensive_data_from_files():
    """Load the yearly defensive statistics (2009-2019) and total them per player.

    Every ``../data_gitignore/yearly_def_stats/def_stats_<year>.csv`` file is read
    through ``file_utils.read_csv`` and tagged with ``season_helper = 1`` on each
    row, so the summed ``season_helper`` counts how many yearly rows were
    aggregated for a player. The frames are stacked, the first column is dropped
    (presumably the index column written by ``to_csv`` — TODO confirm against
    ``file_utils.read_csv``), and the rest is summed per
    ``(player_id, player_name)``.

    Returns:
        A DataFrame with one row per ``(player_id, player_name)`` pair holding
        the summed statistics.
    """
    yearly_frames = []
    for year in range(2009, 2020):
        yearly_stats = file_utils.read_csv('../data_gitignore/yearly_def_stats/def_stats_' + str(year) + '.csv')
        yearly_stats['season_helper'] = 1
        yearly_frames.append(yearly_stats)

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration.
    all_stats = pd.concat(yearly_frames)

    all_stats = all_stats.iloc[:, 1:].groupby(['player_id', 'player_name'], as_index=False).sum()
    return all_stats


# Executed when the cell runs: load the per-player totals aggregated from the yearly CSV files.
all_stats = read_defensive_data_from_files()

In [4]:
def assign_position_from_id(row, rosters):
    """Look up the roster position of the player described by ``row``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing ``'player_id'``.
        rosters: DataFrame with ``'gsis_id'`` and ``'position'`` columns.

    Returns:
        The ``position`` of the first roster entry whose ``gsis_id`` equals
        ``row['player_id']``, or ``np.nan`` when no entry matches.
    """
    id_matches = rosters['gsis_id'] == row['player_id']
    matching_positions = rosters.loc[id_matches, 'position']
    if len(matching_positions) == 0:
        return np.nan
    return matching_positions.iloc[0]

In [5]:
rosters = pd.read_csv('../data_gitignore/nfl_rosters_2009_2019.csv')
# Keep the first roster entry per gsis_id and drop the first column
# (presumably a saved index column — TODO confirm against the CSV).
rosters = rosters.drop_duplicates(subset='gsis_id').iloc[:, 1:]

# gsis_id is unique after drop_duplicates, so it can serve as a lookup index.
# A single vectorised map yields the same result as calling
# assign_position_from_id for every row (NaN where the id is not on any roster)
# without rescanning the whole roster per player.
position_by_player_id = rosters.set_index('gsis_id')['position']
all_stats['position'] = all_stats['player_id'].map(position_by_player_id)

In [6]:
# Normalise the 'position' column via the project helper, then keep only the rows
# that have a non-missing 'unit_key' (presumably added by the helper — TODO confirm).
all_stats_with_positions = (
    file_utils.normalize_positions_of_players(all_stats, 'position')
    .dropna(subset=['unit_key'])
)
# Persist the result; the DataFrame index is written as the first CSV column (to_csv default).
all_stats_with_positions.to_csv('../unprocessed_nfl_data/statistics/defensive_statistics.csv')