In [21]:
import pandas as pd
import numpy as np
import re

In [22]:
# Creamos diccionario de los equipos de la NFL
nfl_teams = pd.read_csv('Datos/nfl_teams.csv')
names = dict(zip(nfl_teams["team_name"], nfl_teams["team_id"]))

# Prueba
names["Jacksonville Jaguars"]

'JAX'

In [23]:
# Filtramos los datos por los años con mayor disponibilidad de los datos
nfl_scores = pd.read_csv('Datos/spreadspoke_scores.csv', encoding='utf-8')
nfl_scores = nfl_scores[nfl_scores["schedule_season"] >= 1999]
nfl_scores = nfl_scores.reset_index()
nfl_scores.head()

Unnamed: 0,index,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,7095,9/12/1999,1999,1,False,Atlanta Falcons,14,17,Minnesota Vikings,MIN,-4.0,49.0,Georgia Dome,False,72.0,0.0,,indoor
1,7096,9/12/1999,1999,1,False,Chicago Bears,20,17,Kansas City Chiefs,KC,-3.0,38.0,Soldier Field,False,68.0,10.0,69.0,
2,7097,9/12/1999,1999,1,False,Cleveland Browns,0,43,Pittsburgh Steelers,PIT,-6.0,37.0,FirstEnergy Stadium,False,70.0,10.0,54.0,
3,7098,9/12/1999,1999,1,False,Green Bay Packers,28,24,Oakland Raiders,GB,-9.0,43.0,Lambeau Field,False,61.0,9.0,84.0,
4,7099,9/12/1999,1999,1,False,Indianapolis Colts,31,14,Buffalo Bills,BUF,-3.0,45.5,RCA Dome,False,72.0,0.0,,indoor


In [24]:
def sum_minsec(times):
    total = 0
    for time in times:
        m, s = map(int, time.split(":"))
        total += 60*m + s
    return total

In [25]:
# Número de partidos a completar los datos
n_games = nfl_scores.shape[0]
print(n_games)

# Inicializamos arreglos para llenar datos
td_home = np.zeros(n_games)
td_away = np.zeros(n_games)
extra_home = np.zeros(n_games)
extra_away = np.zeros(n_games)
conv_home = np.zeros(n_games)
conv_away = np.zeros(n_games)
goal_home = np.zeros(n_games)
goal_away = np.zeros(n_games)
safety_home = np.zeros(n_games)
safety_away = np.zeros(n_games)
avgyd_home = np.zeros(n_games)
avgyd_away = np.zeros(n_games)
effdown_home = np.zeros(n_games)
effdown_away = np.zeros(n_games)
penalized_home = np.zeros(n_games)
penalized_away = np.zeros(n_games)
timepos_home = np.zeros(n_games)
timepos_away = np.zeros(n_games)
drives =  np.zeros(n_games)
overtime = np.zeros(n_games)

6421


In [26]:
pre_url = 'https://github.com/nflverse/nflverse-data/releases/download/pbp/play_by_play_'
end_url = '.csv'
urls = []

for i in range(1999, 2023):
    urls.append( pre_url + str(i) + end_url)

In [27]:
for url in urls:
    year = re.findall('[0-9]+', url)[0]
    
    plays_data = pd.read_csv(url, encoding='utf-8', low_memory=False)

    # Obtenemos sólo las columnas que necesitamos
    plays_data = plays_data[["game_date", "season", "home_team", "touchdown", "posteam", "extra_point_result",
     "two_point_conv_result", "safety", "field_goal_result", "penalty_team", "penalty",
     "drive", "drive_first_downs", "drive_play_count", "drive_time_of_possession",
     "ydsnet", "qtr"]]
    plays_data['game_date'] =  pd.to_datetime(plays_data['game_date'])

    i = 0
    for row in nfl_scores[nfl_scores["schedule_season"] == int(year)].to_numpy():
        team_home = names[row[5]]
        team_away = names[row[8]]
        plays = plays_data[(plays_data["game_date"].dt.strftime('%#m/%#d/%Y') == row[1]) &
                          (plays_data["home_team"] == team_home)]

        # Touchdowns
        td_home[i] = len(plays[(plays["touchdown"] == 1) & (plays["posteam"] == team_home)])
        td_away[i] = len(plays[(plays["touchdown"] == 1) & (plays["posteam"] == team_away)])
        
        # Punto extra
        extra_home[i] = len(plays[(plays["extra_point_result"] == "good") & (plays["posteam"] == team_home)])
        extra_away[i] = len(plays[(plays["extra_point_result"] == "good") & (plays["posteam"] == team_away)])

        # Conversiones
        conv_home[i] = len(plays[(plays["two_point_conv_result"] == "success") & (plays["posteam"] == team_home)])
        conv_away[i] = len(plays[(plays["two_point_conv_result"] == "success") & (plays["posteam"] == team_away)])

        # Goles de campo
        goal_home[i] = len(plays[(plays["field_goal_result"] == "made") & (plays["posteam"] == team_home)])
        goal_away[i] = len(plays[(plays["field_goal_result"] == "made") & (plays["posteam"] == team_away)])
        
        # Safety
        safety_home[i] = len(plays[(plays["safety"] == 1) & (plays["posteam"] == team_away)])
        safety_away[i] = len(plays[(plays["safety"] == 1) & (plays["posteam"] == team_home)])
        
        # Penalizaciones
        penalized_home[i] = len(plays[(plays["penalty_team"] == team_home) & (plays["penalty"] == 1)])
        penalized_away[i] = len(plays[(plays["penalty_team"] == team_away) & (plays["penalty"] == 1)])

        # Tiempo Extra
        overtime[i] = int(plays["qtr"].max() == 5)

        # Número de series
        drives[i] = plays["drive"].max()

        last_rows = plays.drop_duplicates(
            subset = ["drive"])
        
        drives_home = last_rows[(last_rows["posteam"] == team_home) & (last_rows["drive_play_count"] > 0)]
        drives_away = last_rows[(last_rows["posteam"] == team_away) & (last_rows["drive_play_count"] > 0)]

        if len(drives_home) != 0 or len(drives_away) != 0:
            # Probabilidad de que se convierta en primero y 10
            effdown_home[i] = drives_home["drive_first_downs"].sum() / drives_home["drive_play_count"].sum()
            effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()

            # Porcentaje de posesión
            sec_home = sum_minsec(drives_home["drive_time_of_possession"].values)
            sec_away = sum_minsec(drives_away["drive_time_of_possession"].values)
            
            timepos_home[i] = sec_home/(sec_home + sec_away)
            timepos_away[i] = sec_away/(sec_home + sec_away)

            # Promedio de yardas avanzadas por jugada
            avgyd_home[i] = np.nanmean(drives_home["ydsnet"].values / drives_home["drive_play_count"].values)
            avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


        if i % 100 == 0:
            print(i, i/n_games, '%')
        i += 1

0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %
100 0.01557389814670612 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %
100 0.01557389814670612 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %
0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %


In [28]:
columns = ["td_home","td_away","extra_home","extra_away","conv_home","conv_away",
           "goal_home","goal_away","safety_home","safety_away","avgyd_home",
           "avgyd_away","effdown_home","effdown_away","penalized_home","penalized_away",
           "timepos_home","timepos_away","drives","overtime"]
new_data = np.matrix([
    td_home, td_away, extra_home, extra_away, conv_home, conv_away, goal_home, 
    goal_away, safety_home, safety_away, avgyd_home, avgyd_away, effdown_home, 
    effdown_away, penalized_home, penalized_away, timepos_home, timepos_away, 
    drives , overtime 
]).T
nfl_scores_new = nfl_scores.values
new_data = np.hstack((nfl_scores_new, new_data))

new_data_nfl = pd.DataFrame(new_data, columns= (nfl_scores.columns.to_list()+columns))
new_data_nfl

Unnamed: 0,index,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,...,avgyd_home,avgyd_away,effdown_home,effdown_away,penalized_home,penalized_away,timepos_home,timepos_away,drives,overtime
0,7095,9/12/1999,1999,1,False,Atlanta Falcons,14,17,Minnesota Vikings,MIN,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,7096,9/12/1999,1999,1,False,Chicago Bears,20,17,Kansas City Chiefs,KC,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,7097,9/12/1999,1999,1,False,Cleveland Browns,0,43,Pittsburgh Steelers,PIT,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3,7098,9/12/1999,1999,1,False,Green Bay Packers,28,24,Oakland Raiders,GB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
4,7099,9/12/1999,1999,1,False,Indianapolis Colts,31,14,Buffalo Bills,BUF,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6416,13511,1/22/2023,2022,Division,True,Buffalo Bills,10,27,Cincinnati Bengals,BUF,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6417,13512,1/22/2023,2022,Division,True,San Francisco 49ers,19,12,Dallas Cowboys,SF,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6418,13513,1/29/2023,2022,Conference,True,Kansas City Chiefs,23,20,Cincinnati Bengals,KC,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6419,13514,1/29/2023,2022,Conference,True,Philadelphia Eagles,31,7,San Francisco 49ers,PHI,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
new_data_nfl.to_csv("Datos/nfl_scores.csv")