In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Creamos diccionario de los equipos de la NFL
nfl_teams = pd.read_csv('Datos/nfl_teams.csv')
names = dict(zip(nfl_teams["team_name"], nfl_teams["team_id"]))

# Prueba
names["Jacksonville Jaguars"]

'JAX'

In [3]:
# Filtramos los datos por los años con mayor disponibilidad de los datos
nfl_scores = pd.read_csv('Datos/spreadspoke_scores.csv', encoding='utf-8')
nfl_scores = nfl_scores[nfl_scores["schedule_season"] >= 1999]
nfl_scores = nfl_scores.reset_index()
nfl_scores.head()

Unnamed: 0,index,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,7095,9/12/1999,1999,1,False,Atlanta Falcons,14,17,Minnesota Vikings,MIN,-4.0,49.0,Georgia Dome,False,72.0,0.0,,indoor
1,7096,9/12/1999,1999,1,False,Chicago Bears,20,17,Kansas City Chiefs,KC,-3.0,38.0,Soldier Field,False,68.0,10.0,69.0,
2,7097,9/12/1999,1999,1,False,Cleveland Browns,0,43,Pittsburgh Steelers,PIT,-6.0,37.0,FirstEnergy Stadium,False,70.0,10.0,54.0,
3,7098,9/12/1999,1999,1,False,Green Bay Packers,28,24,Oakland Raiders,GB,-9.0,43.0,Lambeau Field,False,61.0,9.0,84.0,
4,7099,9/12/1999,1999,1,False,Indianapolis Colts,31,14,Buffalo Bills,BUF,-3.0,45.5,RCA Dome,False,72.0,0.0,,indoor


In [4]:
def sum_minsec(times):
    total = 0
    for time in times:
        m, s = map(int, time.split(":"))
        total += 60*m + s
    return total

In [5]:
# Número de partidos a completar los datos
n_games = nfl_scores.shape[0]
print(n_games)

# Inicializamos arreglos para llenar datos
td_home = np.zeros(n_games)
td_away = np.zeros(n_games)
extra_home = np.zeros(n_games)
extra_away = np.zeros(n_games)
conv_home = np.zeros(n_games)
conv_away = np.zeros(n_games)
goal_home = np.zeros(n_games)
goal_away = np.zeros(n_games)
safety_home = np.zeros(n_games)
safety_away = np.zeros(n_games)
avgyd_home = np.zeros(n_games)
avgyd_away = np.zeros(n_games)
effdown_home = np.zeros(n_games)
effdown_away = np.zeros(n_games)
penalized_home = np.zeros(n_games)
penalized_away = np.zeros(n_games)
timepos_home = np.zeros(n_games)
timepos_away = np.zeros(n_games)
drives =  np.zeros(n_games)
overtime = np.zeros(n_games)

6421


In [6]:
pre_url = 'https://github.com/nflverse/nflverse-data/releases/download/pbp/play_by_play_'
end_url = '.csv'
urls = []

for i in range(1999, 2023):
    urls.append( pre_url + str(i) + end_url)

In [7]:
i = 0
for url in urls:
    year = re.findall('[0-9]+', url)[0]
    
    plays_data = pd.read_csv(url, encoding='utf-8', low_memory=False)

    # Obtenemos sólo las columnas que necesitamos
    plays_data = plays_data[["game_date", "season", "home_team", "touchdown", "posteam", "extra_point_result",
     "two_point_conv_result", "safety", "field_goal_result", "penalty_team", "penalty",
     "drive", "drive_first_downs", "drive_play_count", "drive_time_of_possession",
     "ydsnet", "qtr"]]
    plays_data['game_date'] =  pd.to_datetime(plays_data['game_date'])

    for row in nfl_scores[nfl_scores["schedule_season"] == int(year)].to_numpy():
        team_home = names[row[5]]
        team_away = names[row[8]]
        plays = plays_data[(plays_data["game_date"].dt.strftime('%#m/%#d/%Y') == row[1]) &
                          (plays_data["home_team"] == team_home)]

        # Touchdowns
        td_home[i] = len(plays[(plays["touchdown"] == 1) & (plays["posteam"] == team_home)])
        td_away[i] = len(plays[(plays["touchdown"] == 1) & (plays["posteam"] == team_away)])
        
        # Punto extra
        extra_home[i] = len(plays[(plays["extra_point_result"] == "good") & (plays["posteam"] == team_home)])
        extra_away[i] = len(plays[(plays["extra_point_result"] == "good") & (plays["posteam"] == team_away)])

        # Conversiones
        conv_home[i] = len(plays[(plays["two_point_conv_result"] == "success") & (plays["posteam"] == team_home)])
        conv_away[i] = len(plays[(plays["two_point_conv_result"] == "success") & (plays["posteam"] == team_away)])

        # Goles de campo
        goal_home[i] = len(plays[(plays["field_goal_result"] == "made") & (plays["posteam"] == team_home)])
        goal_away[i] = len(plays[(plays["field_goal_result"] == "made") & (plays["posteam"] == team_away)])
        
        # Safety
        safety_home[i] = len(plays[(plays["safety"] == 1) & (plays["posteam"] == team_away)])
        safety_away[i] = len(plays[(plays["safety"] == 1) & (plays["posteam"] == team_home)])
        
        # Penalizaciones
        penalized_home[i] = len(plays[(plays["penalty_team"] == team_home) & (plays["penalty"] == 1)])
        penalized_away[i] = len(plays[(plays["penalty_team"] == team_away) & (plays["penalty"] == 1)])

        # Tiempo Extra
        overtime[i] = int(plays["qtr"].max() == 5)

        # Número de series
        drives[i] = plays["drive"].max()

        last_rows = plays.drop_duplicates(
            subset = ["drive"])
        
        drives_home = last_rows[(last_rows["posteam"] == team_home) & (last_rows["drive_play_count"] > 0)]
        drives_away = last_rows[(last_rows["posteam"] == team_away) & (last_rows["drive_play_count"] > 0)]

        if len(drives_home) != 0 or len(drives_away) != 0:
            # Probabilidad de que se convierta en primero y 10
            effdown_home[i] = drives_home["drive_first_downs"].sum() / drives_home["drive_play_count"].sum()
            effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()

            # Porcentaje de posesión
            sec_home = sum_minsec(drives_home["drive_time_of_possession"].values)
            sec_away = sum_minsec(drives_away["drive_time_of_possession"].values)
            
            timepos_home[i] = sec_home/(sec_home + sec_away)
            timepos_away[i] = sec_away/(sec_home + sec_away)

            # Promedio de yardas avanzadas por jugada
            avgyd_home[i] = np.nanmean(drives_home["ydsnet"].values / drives_home["drive_play_count"].values)
            avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


        if i % 100 == 0:
            print(i, i/n_games, '%')
        i += 1

0 0.0 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


100 0.01557389814670612 %
200 0.03114779629341224 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


300 0.04672169444011836 %
400 0.06229559258682448 %
500 0.0778694907335306 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


600 0.09344338888023672 %
700 0.10901728702694284 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


800 0.12459118517364896 %
900 0.14016508332035507 %
1000 0.1557389814670612 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


1100 0.17131287961376732 %
1200 0.18688677776047344 %
1300 0.20246067590717956 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


1400 0.21803457405388568 %
1500 0.2336084722005918 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


1600 0.24918237034729793 %
1700 0.2647562684940041 %
1800 0.28033016664071014 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


1900 0.29590406478741627 %
2000 0.3114779629341224 %
2100 0.3270518610808285 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


2200 0.34262575922753463 %
2300 0.35819965737424075 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


2400 0.3737735555209469 %
2500 0.389347453667653 %
2600 0.4049213518143591 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


2700 0.42049524996106524 %
2800 0.43606914810777136 %
2900 0.4516430462544775 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


3000 0.4672169444011836 %
3100 0.48279084254788973 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


3200 0.49836474069459585 %
3300 0.513938638841302 %
3400 0.5295125369880082 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


3500 0.5450864351347142 %
3600 0.5606603332814203 %
3700 0.5762342314281265 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


3800 0.5918081295748325 %
3900 0.6073820277215387 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


4000 0.6229559258682448 %
4100 0.638529824014951 %
4200 0.654103722161657 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


4300 0.6696776203083632 %
4400 0.6852515184550693 %
4500 0.7008254166017754 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


4600 0.7163993147484815 %
4700 0.7319732128951877 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


4800 0.7475471110418938 %
4900 0.7631210091885999 %
5000 0.778694907335306 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


5100 0.7942688054820122 %
5200 0.8098427036287182 %
5300 0.8254166017754244 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


5400 0.8409904999221305 %
5500 0.8565643980688367 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


5600 0.8721382962155427 %
5700 0.8877121943622489 %
5800 0.903286092508955 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


5900 0.9188599906556612 %
6000 0.9344338888023672 %
6100 0.9500077869490734 %


  effdown_away[i] = drives_away["drive_first_downs"].sum() / drives_away["drive_play_count"].sum()
  avgyd_away[i] = np.nanmean(drives_away["ydsnet"].values / drives_away["drive_play_count"].values)


6200 0.9655816850957795 %
6300 0.9811555832424856 %
6400 0.9967294813891917 %


In [8]:
columns = ["td_home","td_away","extra_home","extra_away","conv_home","conv_away",
           "goal_home","goal_away","safety_home","safety_away","avgyd_home",
           "avgyd_away","effdown_home","effdown_away","penalized_home","penalized_away",
           "timepos_home","timepos_away","drives","overtime"]
new_data = np.matrix([
    td_home, td_away, extra_home, extra_away, conv_home, conv_away, goal_home, 
    goal_away, safety_home, safety_away, avgyd_home, avgyd_away, effdown_home, 
    effdown_away, penalized_home, penalized_away, timepos_home, timepos_away, 
    drives , overtime 
]).T
nfl_scores_new = nfl_scores.values
new_data = np.hstack((nfl_scores_new, new_data))

new_data_nfl = pd.DataFrame(new_data, columns= (nfl_scores.columns.to_list()+columns))
new_data_nfl

Unnamed: 0,index,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,...,avgyd_home,avgyd_away,effdown_home,effdown_away,penalized_home,penalized_away,timepos_home,timepos_away,drives,overtime
0,7095,9/12/1999,1999,1,False,Atlanta Falcons,14,17,Minnesota Vikings,MIN,...,9.396693,6.481902,0.37931,0.279412,4.0,9.0,0.450636,0.549364,20.0,0.0
1,7096,9/12/1999,1999,1,False,Chicago Bears,20,17,Kansas City Chiefs,KC,...,1.339015,2.119453,0.301887,0.268657,6.0,7.0,0.45683,0.54317,23.0,0.0
2,7097,9/12/1999,1999,1,False,Cleveland Browns,0,43,Pittsburgh Steelers,PIT,...,0.746667,5.567066,0.071429,0.365591,4.0,4.0,0.200167,0.799833,22.0,0.0
3,7098,9/12/1999,1999,1,False,Green Bay Packers,28,24,Oakland Raiders,GB,...,3.246723,,0.357143,,7.0,0.0,1.0,0.0,25.0,0.0
4,7099,9/12/1999,1999,1,False,Indianapolis Colts,31,14,Buffalo Bills,BUF,...,6.366402,3.13475,0.293103,0.283333,7.0,13.0,0.549321,0.450679,24.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6416,13511,1/22/2023,2022,Division,True,Buffalo Bills,10,27,Cincinnati Bengals,BUF,...,4.259077,5.647663,0.285714,0.410959,8.0,2.0,0.435,0.565,17.0,0.0
6417,13512,1/22/2023,2022,Division,True,San Francisco 49ers,19,12,Dallas Cowboys,SF,...,4.140659,4.113333,0.313433,0.241935,3.0,7.0,0.548056,0.451944,19.0,0.0
6418,13513,1/29/2023,2022,Conference,True,Kansas City Chiefs,23,20,Cincinnati Bengals,KC,...,4.577976,3.080568,0.333333,0.276923,4.0,9.0,0.547957,0.452043,21.0,0.0
6419,13514,1/29/2023,2022,Conference,True,Philadelphia Eagles,31,7,San Francisco 49ers,PHI,...,3.717381,3.07,0.352113,0.244444,4.0,11.0,0.623889,0.376111,20.0,0.0


In [9]:
new_data_nfl.to_csv("Datos/nfl_scores.csv")