# Best X finder - Construction

Construiremos el modelo descrito en el artículo. Sin embargo, el objetivo final es diseñar una función que encuentre la variable independiente, $X$, con mayor significancia estadística para la variable dependiente $Y$. Este algoritmo funciona independientemente de si se trata de un agente libre o no.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
from termcolor import colored
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import pandas as pd
import numpy as np
import array
import math
import os
import warnings
print('Modulos importados')

Modulos importados


In [2]:
# Notebook-wide settings
# Silence every warning raised from this point on.
warnings.simplefilter('ignore')
# Limit how many rows pandas renders when displaying a frame.
pd.set_option('display.max_rows', 15)

In [3]:
# Show the current working directory before changing it.
print(os.getcwd())
# The data paths used later in the notebook are relative, so move to the
# project root. The location can be overridden with the MLB_HN_ROOT
# environment variable; the original hard-coded path stays as the default.
path = os.environ.get('MLB_HN_ROOT', '/home/usuario/Documentos/Github/Proyectos/MLB_HN')
os.chdir(path)

/home/usuario/Documentos/Github/Proyectos/MLB_HN/Models/Linear_models/Free_agent/Best_X_finder


In [4]:
from matplotlib import rcParams

print("Para modificar el tamaño de todos los gráficos")
# Default (width, height) applied to every figure created afterwards.
rcParams['figure.figsize'] = (15, 9)

Para modificar el tamaño de todos los gráficos


In [5]:
# Show the working directory currently in effect.
print(os.getcwd())
# Move to the project root. The location can be overridden with the
# MLB_HN_ROOT environment variable; the original hard-coded path stays as the
# default.
path = os.environ.get('MLB_HN_ROOT', '/home/usuario/Documentos/Github/Proyectos/MLB_HN')
os.chdir(path)
# os.chdir() returns None, so report the directory that is actually active
# instead of printing the call's return value.
print("Nuevo directorio de trabajo: " + os.getcwd())

/home/usuario/Documentos/Github/Proyectos/MLB_HN
Nuevo directorio de trabajo: None


## Exploración de datos

Estudiaremos de manera general si hay datos faltantes, así como el tipo de datos que contiene

In [6]:
# Analysis window: `period` consecutive seasons starting at `starting_year`.
starting_year, period = 2012, 11
csv = '.csv'
# Path prefixes of the per-season files; the season year and `csv` are
# appended to them when the files are read.
hitters_path = 'ETL_Data/Agent/First_Two_Years_Contract/Period_t_1/Free_Agent/Hitters/free_agents_batters_'
pitchers_path = 'ETL_Data/Agent/First_Two_Years_Contract/Period_t_1/Free_Agent/Pitchers/free_agents_pitchers_'
# One slot per season: frames as read, plus a separate list for their copies.
df_pitchers = [None for _ in range(period)]
df_pitchers_copy = [None for _ in range(period)]
df_hitters = [None for _ in range(period)]
df_hitters_copy = [None for _ in range(period)]

Leamos todas las bases de datos correspondientes

In [7]:
# Read one CSV per season for each group and keep a separate copy of each frame.
for year in range(period):
    season_file = str(starting_year + year) + csv

    pitchers_season = pd.read_csv(pitchers_path + season_file)
    df_pitchers[year] = pitchers_season
    df_pitchers_copy[year] = pitchers_season.copy()

    hitters_season = pd.read_csv(hitters_path + season_file)
    df_hitters[year] = hitters_season
    df_hitters_copy[year] = hitters_season.copy()

Observemos el contenido de las bases de datos

In [8]:
# Pitchers: (rows, columns) of every seasonal frame
for year, pitchers_frame in enumerate(df_pitchers_copy[:period]):
    print(pitchers_frame.shape)

(1, 136)
(47, 136)
(70, 136)
(83, 136)
(86, 136)
(63, 136)
(68, 136)
(64, 136)
(67, 136)
(69, 136)
(107, 136)


In [9]:
list(enumerate(df_pitchers_copy[5].columns))

[(0, 'Acronimo_t'),
 (1, 'Acronimo_t_1'),
 (2, 'Altura_t'),
 (3, 'Altura_t_1'),
 (4, 'Anio_de_agente_libre_t'),
 (5, 'Anio_de_agente_libre_t_1'),
 (6, 'Anio_t'),
 (7, 'Anios_de_contrato'),
 (8, 'Anios_de_contrato_t'),
 (9, 'Anios_de_contrato_t_1'),
 (10, 'Antiguedad_t'),
 (11, 'Antiguedad_t_1'),
 (12, 'Bateos_en_contra_2_t'),
 (13, 'Bateos_en_contra_2_t_1'),
 (14, 'Bateos_en_contra_t'),
 (15, 'Bateos_en_contra_t_1'),
 (16, 'Bono_por_firma_t'),
 (17, 'Bono_por_firma_t_1'),
 (18, 'Cantidad_agentes_libres_t'),
 (19, 'Cantidad_agentes_libres_t_1'),
 (20, 'Cantidad_de_equipos_t'),
 (21, 'Cantidad_de_equipos_t_1'),
 (22, 'Carreras_en_contra_2_t'),
 (23, 'Carreras_en_contra_2_t_1'),
 (24, 'Carreras_en_contra_t'),
 (25, 'Carreras_en_contra_t_1'),
 (26, 'Carreras_ganadas_2_t'),
 (27, 'Carreras_ganadas_2_t_1'),
 (28, 'Carreras_ganadas_t'),
 (29, 'Carreras_ganadas_t_1'),
 (30, 'Comando_2_t'),
 (31, 'Comando_2_t_1'),
 (32, 'Comando_t'),
 (33, 'Comando_t_1'),
 (34, 'Control_2_t'),
 (35, 'Control_2_

In [10]:
# Hitters: (rows, columns) of every seasonal frame
for year, hitters_frame in enumerate(df_hitters_copy[:period]):
    print(hitters_frame.shape)

(0, 126)
(70, 126)
(91, 126)
(94, 126)
(110, 126)
(82, 126)
(70, 126)
(70, 126)
(48, 126)
(42, 126)
(65, 126)


Veamos las columnas con sus índices

In [11]:
list(enumerate(df_hitters_copy[2].columns))

[(0, 'Acronimo_t'),
 (1, 'Acronimo_t_1'),
 (2, 'Altura_t'),
 (3, 'Altura_t_1'),
 (4, 'Anio_de_agente_libre_t'),
 (5, 'Anio_de_agente_libre_t_1'),
 (6, 'Anio_t'),
 (7, 'Anios_de_contrato'),
 (8, 'Anios_de_contrato_t'),
 (9, 'Anios_de_contrato_t_1'),
 (10, 'Antiguedad_t'),
 (11, 'Antiguedad_t_1'),
 (12, 'At_bats_2_t'),
 (13, 'At_bats_2_t_1'),
 (14, 'At_bats_t'),
 (15, 'At_bats_t_1'),
 (16, 'Bateos_2_t'),
 (17, 'Bateos_2_t_1'),
 (18, 'Bateos_promedio_2_t'),
 (19, 'Bateos_promedio_2_t_1'),
 (20, 'Bateos_promedio_t'),
 (21, 'Bateos_promedio_t_1'),
 (22, 'Bateos_t'),
 (23, 'Bateos_t_1'),
 (24, 'Bono_por_firma_t'),
 (25, 'Bono_por_firma_t_1'),
 (26, 'Cantidad_agentes_libres_t'),
 (27, 'Cantidad_agentes_libres_t_1'),
 (28, 'Cantidad_de_equipos_t'),
 (29, 'Cantidad_de_equipos_t_1'),
 (30, 'Dobles_2_t'),
 (31, 'Dobles_2_t_1'),
 (32, 'Dobles_t'),
 (33, 'Dobles_t_1'),
 (34, 'Edad_al_firmar_t'),
 (35, 'Edad_al_firmar_t_1'),
 (36, 'Edad_t'),
 (37, 'Equipo_anterior'),
 (38, 'Equipo_t'),
 (39, 'Equipo

Veamos si hay bases de datos con variables distintas entre sí (comparando cada año con el siguiente)

In [12]:
# Count consecutive seasons whose pitcher frames expose different column sets
# (column order is ignored because the labels are compared as sets).
k = 0
for year in range(period - 1):
    current_columns = set(df_pitchers_copy[year].columns)
    following_columns = set(df_pitchers_copy[year + 1].columns)
    k += current_columns != following_columns
print("Número de dataframes con datos distintos: ",colored(k, "cyan"))

Número de dataframes con datos distintos:  [36m0[0m


In [13]:
# Count consecutive seasons whose hitter frames expose different column sets
# (column order is ignored because the labels are compared as sets).
k = 0
for year in range(period - 1):
    current_columns = set(df_hitters_copy[year].columns)
    following_columns = set(df_hitters_copy[year + 1].columns)
    k += current_columns != following_columns
print("Número de dataframes con datos distintos: ",colored(k, "cyan"))

Número de dataframes con datos distintos:  [36m0[0m


### Construcción de la base de datos para el modelo

#### Base de datos de los equipos

Como no hay ninguna base de datos distinta, hagamos un algoritmo que construya un dataframe que solo contenga a las variables $X$ y $Y$. Partamos de ver los datos que contiene cualquiera de los dataframes para orientar el algoritmo a uno de búsqueda dado el nombre de la variable; esto se hará tanto para los *pitchers* como para los *hitters*.

In [14]:
# Column labels of the first seasonal frame of each group (pitchers, hitters).
pitchers_variables = df_pitchers_copy[0].columns
hitters_variables = df_hitters_copy[0].columns

Lo que haremos es inicializar dos variables para cada dataframe: Nombre de la variable a buscar ($X$ y $Y$) y un dataframe vacío.

In [15]:
# Raw column names: the statistic used as X for each group, the salary used
# as Y, and the player / season identifier columns.
X_raw_hitter, X_raw_pitcher = 'Home_runs_t_1', 'Inning_pitched_t_1'
Y_raw = 'Sueldo_ajustado_t'
jugador, anio = 'Jugador', 'Anio_t'
# One slot per season for the reduced (player, season, Y, X) frames.
YX_pitcher_raw = [None for _ in range(period)]
YX_hitter_raw = [None for _ in range(period)]

In [16]:
# Keep only player, season, Y and X for every season.
# Explicit copies are taken because these frames are modified by later cells
# (salary adjustment and renames); without them pandas treats the writes as
# chained assignment on a derived frame (SettingWithCopyWarning).
for year in range(period):
    YX_pitcher_raw[year] = df_pitchers_copy[year][[jugador, anio, Y_raw, X_raw_pitcher]].copy()

    YX_hitter_raw[year] = df_hitters_copy[year][[jugador, anio, Y_raw, X_raw_hitter]].copy()

In [17]:
# Hitters: (rows, columns) of every reduced seasonal frame
for year, hitters_frame in enumerate(YX_hitter_raw[:period]):
    print(hitters_frame.shape)

(0, 4)
(70, 4)
(91, 4)
(94, 4)
(110, 4)
(82, 4)
(70, 4)
(70, 4)
(48, 4)
(42, 4)
(65, 4)


In [18]:
# Pitchers: (rows, columns) of every reduced seasonal frame
for year, pitchers_frame in enumerate(YX_pitcher_raw[:period]):
    print(pitchers_frame.shape)

(1, 4)
(47, 4)
(70, 4)
(83, 4)
(86, 4)
(63, 4)
(68, 4)
(64, 4)
(67, 4)
(69, 4)
(107, 4)


Ajustemos los precios por la inflación y cambiemos los nombres para facilitar el proceso *ETL*

In [19]:
# Per-season factor by which that season's salaries are multiplied.
# NOTE(review): 2022 maps to 1, presumably the base year -- confirm.
cpi = dict(zip(
    range(2012, 2023),
    (1.31, 1.29, 1.27, 1.27, 1.26, 1.23, 1.20, 1.18, 1.15, 1.14, 1),
))

In [20]:
def _adjust_salary_and_tag_season(frame, x_name, season):
    """Return a new frame with the salary inflation-adjusted and season-tagged columns.

    Parameters
    ----------
    frame : DataFrame holding the `anio`, `Y_raw` and `x_name` columns.
    x_name : raw name of the statistic column to tag.
    season : calendar year; selects the `cpi` factor and forms the suffix.

    Returns
    -------
    A copy of `frame` whose `Y_raw` column is multiplied by `cpi[season]` and
    whose `anio`, `Y_raw` and `x_name` columns carry a `_<season>` suffix. If
    `Y_raw` is no longer present (the cell was already executed) the frame is
    returned unchanged, so re-running neither fails nor adjusts twice.
    """
    if Y_raw not in frame.columns:
        return frame
    suffix = '_' + str(season)
    adjusted = frame.copy()
    adjusted[Y_raw] = adjusted[Y_raw] * cpi[season]
    return adjusted.rename(columns = {Y_raw: Y_raw + suffix,
                                      anio: anio + suffix,
                                      x_name: x_name + suffix})


for year in range(period):
    season = starting_year + year
    YX_hitter_raw[year] = _adjust_salary_and_tag_season(YX_hitter_raw[year], X_raw_hitter, season)
    YX_pitcher_raw[year] = _adjust_salary_and_tag_season(YX_pitcher_raw[year], X_raw_pitcher, season)

Observemos los resultados

In [21]:
# Hitters: per-season schema summary.
# DataFrame.info() writes its report directly and returns None, so it is
# called on its own; wrapping it in print() appended a stray "None" per frame.
for year in range(period):
    YX_hitter_raw[year].info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Jugador                 0 non-null      object
 1   Anio_t_2012             0 non-null      object
 2   Sueldo_ajustado_t_2012  0 non-null      object
 3   Home_runs_t_1_2012      0 non-null      object
dtypes: object(4)
memory usage: 0.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Jugador                 70 non-null     object 
 1   Anio_t_2013             70 non-null     int64  
 2   Sueldo_ajustado_t_2013  70 non-null     float64
 3   Home_runs_t_1_2013      70 non-null     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 2.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to

In [22]:
# Pitchers: per-season schema summary.
# DataFrame.info() writes its report directly and returns None, so it is
# called on its own; wrapping it in print() appended a stray "None" per frame.
for year in range(period):
    YX_pitcher_raw[year].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Jugador                  1 non-null      object 
 1   Anio_t_2012              1 non-null      int64  
 2   Sueldo_ajustado_t_2012   1 non-null      float64
 3   Inning_pitched_t_1_2012  1 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 160.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Jugador                  47 non-null     object 
 1   Anio_t_2013              47 non-null     int64  
 2   Sueldo_ajustado_t_2013   47 non-null     float64
 3   Inning_pitched_t_1_2013  47 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.6+ KB
None
<class 'pan

### Filtrado de datos

Construiremos dos bases de datos tipo panel con el objetivo de facilitar la limpieza de datos

In [23]:
# Outer-join the seasonal pitcher frames on the player name, one season at a
# time, so each player ends up with one set of columns per season.
YX_pitcher_panel_raw = YX_pitcher_raw[0].merge(YX_pitcher_raw[1], on = 'Jugador', how = 'outer')

for year in range(2, period):
    next_season = YX_pitcher_raw[year]
    YX_pitcher_panel_raw = YX_pitcher_panel_raw.merge(next_season, on = 'Jugador', how = 'outer')

In [24]:
# One row per player, rows ordered by player name with a fresh positional
# index, and columns ordered alphabetically by label.
YX_pitcher_panel_raw = (
    YX_pitcher_panel_raw
    .drop_duplicates(subset = 'Jugador')
    .sort_values(by = 'Jugador', ascending = True)
    .reset_index(drop = True)
    .sort_index(axis = 1)
)
YX_pitcher_panel_raw.tail()

Unnamed: 0,Anio_t_2012,Anio_t_2013,Anio_t_2014,Anio_t_2015,Anio_t_2016,Anio_t_2017,Anio_t_2018,Anio_t_2019,Anio_t_2020,Anio_t_2021,...,Sueldo_ajustado_t_2013,Sueldo_ajustado_t_2014,Sueldo_ajustado_t_2015,Sueldo_ajustado_t_2016,Sueldo_ajustado_t_2017,Sueldo_ajustado_t_2018,Sueldo_ajustado_t_2019,Sueldo_ajustado_t_2020,Sueldo_ajustado_t_2021,Sueldo_ajustado_t_2022
298,,2013.0,2014.0,2015.0,2016.0,2017.0,,,,,...,107076.45,1079500.0,5715000.0,2203277.58,6765000.0,,,,,
299,,,2014.0,,,,,,,,...,,461501.49,,,,,,,,
300,,,,,,,,2019.0,2020.0,2021.0,...,,,,,,,15340000.0,5537037.25,14820000.0,
301,,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,,,,...,24510000.0,33020000.0,31750000.0,42840000.0,41820000.0,40800000.0,,,,13000000.0
302,,,,,,,,,2020.0,,...,,,,,,,,9157407.45,,


Para facilitar los algoritmos, movamos la columna que contiene el nombre del jugador en la primera posición.

In [25]:
# Reorder the columns so that the player name comes first.
player_column_pitcher = YX_pitcher_panel_raw['Jugador']
other_columns = [label for label in YX_pitcher_panel_raw.columns if label != 'Jugador']
YX_pitcher_panel_raw = YX_pitcher_panel_raw[['Jugador'] + other_columns]
YX_pitcher_panel_raw.tail()

Unnamed: 0,Jugador,Anio_t_2012,Anio_t_2013,Anio_t_2014,Anio_t_2015,Anio_t_2016,Anio_t_2017,Anio_t_2018,Anio_t_2019,Anio_t_2020,...,Sueldo_ajustado_t_2013,Sueldo_ajustado_t_2014,Sueldo_ajustado_t_2015,Sueldo_ajustado_t_2016,Sueldo_ajustado_t_2017,Sueldo_ajustado_t_2018,Sueldo_ajustado_t_2019,Sueldo_ajustado_t_2020,Sueldo_ajustado_t_2021,Sueldo_ajustado_t_2022
298,Zach Duke,,2013.0,2014.0,2015.0,2016.0,2017.0,,,,...,107076.45,1079500.0,5715000.0,2203277.58,6765000.0,,,,,
299,Zach Putnam,,,2014.0,,,,,,,...,,461501.49,,,,,,,,
300,Zack Britton,,,,,,,,2019.0,2020.0,...,,,,,,,15340000.0,5537037.25,14820000.0,
301,Zack Greinke,,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,,,...,24510000.0,33020000.0,31750000.0,42840000.0,41820000.0,40800000.0,,,,13000000.0
302,Zack Wheeler,,,,,,,,,2020.0,...,,,,,,,,9157407.45,,


In [26]:
# Hitters: per-season schema summary.
# DataFrame.info() writes its report directly and returns None, so it is
# called on its own; wrapping it in print() appended a stray "None" per frame.
for year in range(period):
    YX_hitter_raw[year].info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Jugador                 0 non-null      object
 1   Anio_t_2012             0 non-null      object
 2   Sueldo_ajustado_t_2012  0 non-null      object
 3   Home_runs_t_1_2012      0 non-null      object
dtypes: object(4)
memory usage: 0.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Jugador                 70 non-null     object 
 1   Anio_t_2013             70 non-null     int64  
 2   Sueldo_ajustado_t_2013  70 non-null     float64
 3   Home_runs_t_1_2013      70 non-null     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 2.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to

In [27]:
# Pitchers: per-season schema summary (the previous comment said "Hitters",
# but the loop reads the pitcher frames).
# DataFrame.info() writes its report directly and returns None, so it is
# called on its own; wrapping it in print() appended a stray "None" per frame.
for year in range(period):
    YX_pitcher_raw[year].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Jugador                  1 non-null      object 
 1   Anio_t_2012              1 non-null      int64  
 2   Sueldo_ajustado_t_2012   1 non-null      float64
 3   Inning_pitched_t_1_2012  1 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 160.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Jugador                  47 non-null     object 
 1   Anio_t_2013              47 non-null     int64  
 2   Sueldo_ajustado_t_2013   47 non-null     float64
 3   Inning_pitched_t_1_2013  47 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.6+ KB
None
<class 'pan

Lo que ahora queremos es dividir el dataframe en la variable dependiente e independiente, para ello crearemos un número guía que guarde el índice de la columna a partir de la cual se hará la división. Este índice de guía será cierto para todo tipo de variables mientras el periodo de análisis sea el mismo

In [28]:
# Position where the second variable block starts: one player column, then
# `period` season columns, then `period` columns of the first variable.
split_column_pitcher = 2*period + 1
panel_columns = YX_pitcher_panel_raw.columns
for heading, content in (
    ("Nombres ordenados: ", panel_columns),
    ("Con índice: ", list(enumerate(panel_columns))),
):
    print(heading, "\n", colored(content, "green"), "\n")
print("Índice de columna guía: ", colored(split_column_pitcher, "blue"), '\n')

Nombres ordenados:  
 [32mIndex(['Jugador', 'Anio_t_2012', 'Anio_t_2013', 'Anio_t_2014', 'Anio_t_2015',
       'Anio_t_2016', 'Anio_t_2017', 'Anio_t_2018', 'Anio_t_2019',
       'Anio_t_2020', 'Anio_t_2021', 'Anio_t_2022', 'Inning_pitched_t_1_2012',
       'Inning_pitched_t_1_2013', 'Inning_pitched_t_1_2014',
       'Inning_pitched_t_1_2015', 'Inning_pitched_t_1_2016',
       'Inning_pitched_t_1_2017', 'Inning_pitched_t_1_2018',
       'Inning_pitched_t_1_2019', 'Inning_pitched_t_1_2020',
       'Inning_pitched_t_1_2021', 'Inning_pitched_t_1_2022',
       'Sueldo_ajustado_t_2012', 'Sueldo_ajustado_t_2013',
       'Sueldo_ajustado_t_2014', 'Sueldo_ajustado_t_2015',
       'Sueldo_ajustado_t_2016', 'Sueldo_ajustado_t_2017',
       'Sueldo_ajustado_t_2018', 'Sueldo_ajustado_t_2019',
       'Sueldo_ajustado_t_2020', 'Sueldo_ajustado_t_2021',
       'Sueldo_ajustado_t_2022'],
      dtype='object')[0m 

Con índice:  
 [32m[(0, 'Jugador'), (1, 'Anio_t_2012'), (2, 'Anio_t_2013'), (3, 'Anio_

Esta columna guía es a partir donde iniciará la otra variable.

Guardaremos los índices de los jugadores que contengan menos de dos datos

In [29]:
# NOTE: despite its name, `max_nan` acts as the minimum number of non-missing
# seasons a player needs: later cells drop the rows holding more than
# `period - max_nan` NaN values. The printed message is worded accordingly.
max_nan = 2
print("Mínimo número de datos no nulos que debe tener un renglón: ", colored(max_nan, "red"))

Máximo número de 'nan' que pueden haber en un renglón:  [31m2[0m


In [30]:
# Number of players in the panel before any filtering.
pitchers_ini_len = len(YX_pitcher_panel_raw.index)
print("Cantidad de renglones iniciales para la variable independiente: ", colored(pitchers_ini_len, "green"))

Cantidad de renglones iniciales para la variable independiente:  [32m303[0m


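La posición de las columnas de cada variable dentro del panel depende del orden alfabético relativo entre **X_raw_pitcher** y **Y_raw**, ya que las columnas se ordenaron alfabéticamente.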

In [31]:
# Select the seasonal X columns by name instead of by position.
# The positional slices were only right when the X name sorted after the
# 'Anio_t_*' columns; a statistic sorting before them (e.g. a name starting
# with 'Al') would have returned the season columns instead.
X_pitcher_columns = [X_raw_pitcher + '_' + str(starting_year + offset) for offset in range(period)]
X_pitcher_drop_etl = YX_pitcher_panel_raw.loc[:, X_pitcher_columns]

In [32]:
X_pitcher_drop_etl.head()

Unnamed: 0,Inning_pitched_t_1_2012,Inning_pitched_t_1_2013,Inning_pitched_t_1_2014,Inning_pitched_t_1_2015,Inning_pitched_t_1_2016,Inning_pitched_t_1_2017,Inning_pitched_t_1_2018,Inning_pitched_t_1_2019,Inning_pitched_t_1_2020,Inning_pitched_t_1_2021,Inning_pitched_t_1_2022
0,,,191.0,213.7,,,,,,,
1,,179.7,,204.3,,,,,,,
2,,,,,,,,77.7,66.3,18.3,62.0
3,,,,,,,,,171.7,65.7,
4,,,,,,,,51.7,,,


In [33]:
# Rows with more than `nulls_quantity` missing seasons do not have enough data.
nulls_quantity = period - max_nan
print(nulls_quantity)
# Keep those rows through a boolean mask. The previous `.iloc[<index labels>]`
# used labels as positions, which is only valid on a 0..n-1 index and picked
# wrong rows (or raised) when the cell was executed a second time.
X_pitcher_too_sparse = X_pitcher_drop_etl.isnull().sum(axis = 1) > nulls_quantity
X_pitcher_drop_etl = X_pitcher_drop_etl.loc[X_pitcher_too_sparse]

9


In [34]:
X_pitcher_drop_etl.head()

Unnamed: 0,Inning_pitched_t_1_2012,Inning_pitched_t_1_2013,Inning_pitched_t_1_2014,Inning_pitched_t_1_2015,Inning_pitched_t_1_2016,Inning_pitched_t_1_2017,Inning_pitched_t_1_2018,Inning_pitched_t_1_2019,Inning_pitched_t_1_2020,Inning_pitched_t_1_2021,Inning_pitched_t_1_2022
4,,,,,,,,51.7,,,
5,,,,,,,76.0,,,,
9,,,,,,,,,62.0,,
10,,,,25.0,,,,,,,
11,,,,,187.0,,,,,,


In [35]:
print("Cantidad de jugadores que no cumplen la condición para la variable independiente: \n", colored(X_pitcher_drop_etl.shape[0], "red"))

Cantidad de jugadores que no cumplen la condición para la variable independiente: 
 [31m121[0m


Crearemos un *array* para guardar los índices de los renglones que estén en el dataframe *X_pitcher_drop_etl*. Son los que no cumplen la condición mencionada anteriormente

In [37]:
# Row labels of the players held in X_pitcher_drop_etl (rows to be discarded).
X_pitcher_index_drop = X_pitcher_drop_etl.index

Repetiremos el mismo proceso para la variable del salario ajustado de los pitcher

In [38]:
# Select the seasonal salary columns by name instead of by position, so the
# result does not depend on how the X name sorts relative to the other labels.
Y_pitcher_columns = [Y_raw + '_' + str(starting_year + offset) for offset in range(period)]
Y_pitcher_drop_etl = YX_pitcher_panel_raw.loc[:, Y_pitcher_columns]

In [39]:
Y_pitcher_drop_etl.head()

Unnamed: 0,Sueldo_ajustado_t_2012,Sueldo_ajustado_t_2013,Sueldo_ajustado_t_2014,Sueldo_ajustado_t_2015,Sueldo_ajustado_t_2016,Sueldo_ajustado_t_2017,Sueldo_ajustado_t_2018,Sueldo_ajustado_t_2019,Sueldo_ajustado_t_2020,Sueldo_ajustado_t_2021,Sueldo_ajustado_t_2022
0,,,14287500.0,10795000.0,,,,,,,
1,,7095000.0,,6350000.0,,,,,,,
2,,,,,,,,10620000.0,4557407.45,8236550.16,5000000.0
3,,,,,,,,,3407407.45,9120000.0,
4,,,,,,,,2360000.0,,,


In [40]:
# Keep the rows with more than `nulls_quantity` missing salaries, using a
# boolean mask rather than `.iloc[<index labels>]` (labels are not positions
# once the index is no longer 0..n-1, e.g. when the cell is run twice).
Y_pitcher_too_sparse = Y_pitcher_drop_etl.isnull().sum(axis = 1) > nulls_quantity
Y_pitcher_drop_etl = Y_pitcher_drop_etl.loc[Y_pitcher_too_sparse]
Y_pitcher_index_drop = Y_pitcher_drop_etl.index

Luego, eliminemos esos índices del dataframe del panel data

In [41]:
# Row labels flagged by either the Y filter or the X filter.
pitcher_index_drop = list(set(Y_pitcher_index_drop) | set(X_pitcher_index_drop))

In [42]:
print("Cantidad de jugadores que se eliminarán: \n", colored(len(pitcher_index_drop), "red"))

Cantidad de jugadores que se eliminarán: 
 [31m121[0m


In [43]:
# Remove the flagged players, then sort by name and only afterwards rebuild
# the index. Resetting before sorting (as done previously) left an index that
# is not guaranteed to match row positions.
# NOTE: run once per panel build -- the labels in `pitcher_index_drop` refer to
# the index the panel had before this cell.
YX_pitcher_panel_raw = (
    YX_pitcher_panel_raw
    .drop(index = pitcher_index_drop)
    .sort_values(by = 'Jugador', ascending = True)
    .reset_index(drop = True)
)
print("Cantidad de jugadores restantes en total: \n", colored(YX_pitcher_panel_raw.shape, "cyan"))

Cantidad de jugadores restantes en total: 
 [36m(182, 34)[0m


In [44]:
YX_pitcher_panel_raw.head()

Unnamed: 0,Jugador,Anio_t_2012,Anio_t_2013,Anio_t_2014,Anio_t_2015,Anio_t_2016,Anio_t_2017,Anio_t_2018,Anio_t_2019,Anio_t_2020,...,Sueldo_ajustado_t_2013,Sueldo_ajustado_t_2014,Sueldo_ajustado_t_2015,Sueldo_ajustado_t_2016,Sueldo_ajustado_t_2017,Sueldo_ajustado_t_2018,Sueldo_ajustado_t_2019,Sueldo_ajustado_t_2020,Sueldo_ajustado_t_2021,Sueldo_ajustado_t_2022
0,A.J. Burnett,,,2014.0,2015.0,,,,,,...,,14287500.0,10795000.0,,,,,,,
1,Aaron Harang,,2013.0,,2015.0,,,,,,...,7095000.0,,6350000.0,,,,,,,
2,Adam Ottavino,,,,,,,,2019.0,2020.0,...,,,,,,,10620000.0,4557407.45,8236550.16,5000000.0
3,Adam Wainwright,,,,,,,,,2020.0,...,,,,,,,,3407407.45,9120000.0,
4,Alex Cobb,,,,,,,2018.0,2019.0,2020.0,...,,,,,,9000000.0,11210000.0,5269515.05,,9000000.0


In [45]:
print("Tamaño: \n", colored(YX_pitcher_panel_raw.shape, "cyan"))

Tamaño: 
 [36m(182, 34)[0m


In [46]:
YX_pitcher_panel_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 0 to 181
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Jugador                  182 non-null    object 
 1   Anio_t_2012              1 non-null      float64
 2   Anio_t_2013              37 non-null     float64
 3   Anio_t_2014              58 non-null     float64
 4   Anio_t_2015              64 non-null     float64
 5   Anio_t_2016              75 non-null     float64
 6   Anio_t_2017              56 non-null     float64
 7   Anio_t_2018              61 non-null     float64
 8   Anio_t_2019              54 non-null     float64
 9   Anio_t_2020              55 non-null     float64
 10  Anio_t_2021              58 non-null     float64
 11  Anio_t_2022              44 non-null     float64
 12  Inning_pitched_t_1_2012  1 non-null      float64
 13  Inning_pitched_t_1_2013  37 non-null     float64
 14  Inning_pitched_t_1_2014  5

Debido a que puede haber estadísticas deportivas que sean iguales a cero, es necesario tratar dichos datos. En el caso de las estadísticas analizadas, no hay valores negativos, por lo que no tenemos que tratar dichos datos. **POR EL MOMENTO SOLO SE ELIMINARÁN DICHOS RENGLONES, PERO DESPUÉS SE HARÁ UNA TRANSFORMACIÓN PARA NO PERDER ESOS DATOS.** Creemos una columna que almacene los máximos por renglón de la estadística deportiva para después filtrar los renglones cuyo máximo sea $0$.

In [47]:
# Per-player maximum of the seasonal X columns, stored as a helper column.
# The columns are addressed by name so the result does not depend on how the
# X name sorts relative to the other column labels.
X_pitcher_columns = [X_raw_pitcher + '_' + str(starting_year + offset) for offset in range(period)]
YX_pitcher_panel_raw[X_raw_pitcher + '_max'] = YX_pitcher_panel_raw[X_pitcher_columns].max(axis = 1)

In [48]:
# Players whose row maximum of the statistic equals 0.
YX_pitcher_max_0 = YX_pitcher_panel_raw[YX_pitcher_panel_raw[X_raw_pitcher + '_max'] == 0]

In [49]:
YX_pitcher_max_0.head()

Unnamed: 0,Jugador,Anio_t_2012,Anio_t_2013,Anio_t_2014,Anio_t_2015,Anio_t_2016,Anio_t_2017,Anio_t_2018,Anio_t_2019,Anio_t_2020,...,Sueldo_ajustado_t_2014,Sueldo_ajustado_t_2015,Sueldo_ajustado_t_2016,Sueldo_ajustado_t_2017,Sueldo_ajustado_t_2018,Sueldo_ajustado_t_2019,Sueldo_ajustado_t_2020,Sueldo_ajustado_t_2021,Sueldo_ajustado_t_2022,Inning_pitched_t_1_max


Independientemente de que haya o no filas que cumplan esta condición, podremos ejecutar la instrucción para borrar dichas filas

In [50]:
# The helper '<X>_max' column is no longer needed in the panel; remove it in place.
YX_pitcher_panel_raw.drop(columns = X_raw_pitcher + '_max', inplace = True)

In [51]:
# Row labels of the players whose statistic maximum is 0.
YX_pitcher_max_0_index_drop = YX_pitcher_max_0.index

In [52]:
# Discard the zero-maximum players and rebuild a positional index.
YX_pitcher_panel_raw = (
    YX_pitcher_panel_raw
    .drop(index = YX_pitcher_max_0_index_drop)
    .reset_index(drop = True)
)
print("Cantidad de jugadores restantes en total: \n", colored(YX_pitcher_panel_raw.shape, "cyan"))

Cantidad de jugadores restantes en total: 
 [36m(182, 34)[0m


In [53]:
YX_pitcher_panel_raw.head()

Unnamed: 0,Jugador,Anio_t_2012,Anio_t_2013,Anio_t_2014,Anio_t_2015,Anio_t_2016,Anio_t_2017,Anio_t_2018,Anio_t_2019,Anio_t_2020,...,Sueldo_ajustado_t_2013,Sueldo_ajustado_t_2014,Sueldo_ajustado_t_2015,Sueldo_ajustado_t_2016,Sueldo_ajustado_t_2017,Sueldo_ajustado_t_2018,Sueldo_ajustado_t_2019,Sueldo_ajustado_t_2020,Sueldo_ajustado_t_2021,Sueldo_ajustado_t_2022
0,A.J. Burnett,,,2014.0,2015.0,,,,,,...,,14287500.0,10795000.0,,,,,,,
1,Aaron Harang,,2013.0,,2015.0,,,,,,...,7095000.0,,6350000.0,,,,,,,
2,Adam Ottavino,,,,,,,,2019.0,2020.0,...,,,,,,,10620000.0,4557407.45,8236550.16,5000000.0
3,Adam Wainwright,,,,,,,,,2020.0,...,,,,,,,,3407407.45,9120000.0,
4,Alex Cobb,,,,,,,2018.0,2019.0,2020.0,...,,,,,,9000000.0,11210000.0,5269515.05,,9000000.0


Puesto que ya nos encargamos de toda la limpieza y tratamiento, ya podremos crear las variables $X$ y $Y$ para la regresión.

### Variable $X$

$$
X = \left( -1 \right)^{I_{t}^{-}}\frac{y_{t}}{\sqrt{y_H}}
$$

Obtengamos la variable $X$. Hallemos primero los máximos y mínimos por renglón.

In [54]:
# Seasonal X columns, selected by name rather than by position so the result
# does not depend on how the X name sorts relative to the other labels.
# An explicit copy is taken because new columns are added to this frame later
# on; writing into a derived slice triggers SettingWithCopyWarning.
X_pitcher_columns = [X_raw_pitcher + '_' + str(starting_year + offset) for offset in range(period)]
X_pitcher_etl = YX_pitcher_panel_raw.loc[:, X_pitcher_columns].copy()

In [55]:
# Largest and smallest value of each row of X_pitcher_etl (pandas skips NaN by default).
pitcher_max_element = X_pitcher_etl.max(axis = 1)
pitcher_min_element = X_pitcher_etl.min(axis = 1)

In [56]:
# Midpoint between each row's maximum and minimum.
# NOTE(review): not referenced again in the visible part of the notebook -- confirm it is needed.
pitcher_max_min = (pitcher_max_element + pitcher_min_element)/2

Generemos las listas que contengan los nombres de las nuevas variables: La dummy auxiliar $I$ y $X$.

In [57]:
# Column names for the auxiliary dummy I and for X, one per season.
season_labels = [str(starting_year + offset) for offset in range(period)]
dummy_pitcher_names = ["I_" + label for label in season_labels]
X_pitcher_names = ["X_" + label for label in season_labels]

In [58]:
# Show both name lists separated by one blank line.
print(dummy_pitcher_names, X_pitcher_names, sep = "\n\n")

['I_2012', 'I_2013', 'I_2014', 'I_2015', 'I_2016', 'I_2017', 'I_2018', 'I_2019', 'I_2020', 'I_2021', 'I_2022']

['X_2012', 'X_2013', 'X_2014', 'X_2015', 'X_2016', 'X_2017', 'X_2018', 'X_2019', 'X_2020', 'X_2021', 'X_2022']


Ahora, creemos la variable dummy $I$

In [59]:
# Dummy I per season: 0 where the season holds the row maximum, 1 where it
# holds any other observed value, NaN where the season is missing.
X_pitcher_conditions_values = [0, 1]

for year in range(period):
    season_values = X_pitcher_etl.iloc[:, year]
    observed = season_values.notna()

    X_pitcher_conditions = [
        (season_values == pitcher_max_element) & observed,
        (season_values != pitcher_max_element) & observed,
    ]

    X_pitcher_etl[dummy_pitcher_names[year]] = np.select(
        X_pitcher_conditions, X_pitcher_conditions_values, default = np.nan
    )

In [60]:
X_pitcher_etl.columns

Index(['Inning_pitched_t_1_2012', 'Inning_pitched_t_1_2013',
       'Inning_pitched_t_1_2014', 'Inning_pitched_t_1_2015',
       'Inning_pitched_t_1_2016', 'Inning_pitched_t_1_2017',
       'Inning_pitched_t_1_2018', 'Inning_pitched_t_1_2019',
       'Inning_pitched_t_1_2020', 'Inning_pitched_t_1_2021',
       'Inning_pitched_t_1_2022', 'I_2012', 'I_2013', 'I_2014', 'I_2015',
       'I_2016', 'I_2017', 'I_2018', 'I_2019', 'I_2020', 'I_2021', 'I_2022'],
      dtype='object')

Veamos las columnas de las dummies

In [61]:
X_pitcher_etl.iloc[:,period:]

Unnamed: 0,I_2012,I_2013,I_2014,I_2015,I_2016,I_2017,I_2018,I_2019,I_2020,I_2021,I_2022
0,,,1.0,0.0,,,,,,,
1,,1.0,,0.0,,,,,,,
2,,,,,,,,0.0,1.0,1.0,1.0
3,,,,,,,,,0.0,1.0,
4,,,,,,,0.0,1.0,1.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...
177,,,,,,,0.0,1.0,1.0,,
178,,,,,1.0,,1.0,0.0,,1.0,
179,,1.0,1.0,1.0,0.0,1.0,,,,,
180,,,,,,,,1.0,0.0,1.0,


Para facilitar los algoritmos, separaremos las variables sobre las estadísticas deportivas y dummies en dos dataframes distintos. Debido a cómo se construyeron estas variables en la base de datos, no es necesario preocuparnos por el orden alfabético relativo entre los nombres de las variables.

**Variables estadística:**

In [62]:
# First `period` columns: the raw seasonal statistic.
# Copied explicitly because a 'Max' column is added to this frame later;
# adding it to a bare slice raises SettingWithCopyWarning.
X_pitcher_aux = X_pitcher_etl.iloc[:,:period].copy()
X_pitcher_aux.head()

Unnamed: 0,Inning_pitched_t_1_2012,Inning_pitched_t_1_2013,Inning_pitched_t_1_2014,Inning_pitched_t_1_2015,Inning_pitched_t_1_2016,Inning_pitched_t_1_2017,Inning_pitched_t_1_2018,Inning_pitched_t_1_2019,Inning_pitched_t_1_2020,Inning_pitched_t_1_2021,Inning_pitched_t_1_2022
0,,,191.0,213.7,,,,,,,
1,,179.7,,204.3,,,,,,,
2,,,,,,,,77.7,66.3,18.3,62.0
3,,,,,,,,,171.7,65.7,
4,,,,,,,179.3,152.3,12.3,,93.3


**Variables dummie:**

In [63]:
# Columns from position `period` onwards: the I_<season> dummies.
Dummy_pitcher = X_pitcher_etl.iloc[:,period:]
Dummy_pitcher.head()

Unnamed: 0,I_2012,I_2013,I_2014,I_2015,I_2016,I_2017,I_2018,I_2019,I_2020,I_2021,I_2022
0,,,1.0,0.0,,,,,,,
1,,1.0,,0.0,,,,,,,
2,,,,,,,,0.0,1.0,1.0,1.0
3,,,,,,,,,0.0,1.0,
4,,,,,,,0.0,1.0,1.0,,1.0


Usemos la variable dummy para crear finalmente a la variable $X$. Sin embargo, primero creemos una serie auxiliar que contenga el máximo de cada renglón

In [64]:
# Row maximum of the raw statistic. 'Max' is excluded if it already exists so
# that re-running the cell cannot fold a previous sqrt(max) into the maximum.
X_pitcher_max = X_pitcher_aux.drop(columns = 'Max', errors = 'ignore').max(axis = 1)
# Store sqrt(max) as a regular column on a new frame. This avoids writing
# into a slice of X_pitcher_etl and the list-style `df[['Max']] = <Series>`.
# NOTE(review): that form reportedly raises "Columns must be same length as
# key" on newer pandas releases -- confirm against the pinned version.
X_pitcher_aux = X_pitcher_aux.assign(Max = np.sqrt(X_pitcher_max))

In [65]:
# X_<season> = raw statistic times (-1)**I: the sign flips wherever the dummy is 1.
for year in range(period):
    season = str(starting_year + year)
    sign = (-1)**Dummy_pitcher['I_' + season]
    X_pitcher_etl['X_' + season] = X_pitcher_aux[X_raw_pitcher + '_' + season]*sign

In [66]:
# Divide every row by the value stored for it in X_pitcher_aux['Max'].
# This applies to all current columns of X_pitcher_etl, and executing the
# cell again divides the already scaled values once more.
X_pitcher_etl = X_pitcher_etl.div(X_pitcher_aux['Max'].values, axis = 0)

Como ya no nos interesa la variable sobre la estadística deportiva en crudo, solo conservaremos las columnas de la variables $X$

In [67]:
# Keep only the columns located after the first 2*period positions.
X_pitcher_etl = X_pitcher_etl.iloc[:,2*period:]

In [68]:
X_pitcher_etl.head()

Unnamed: 0,X_2012,X_2013,X_2014,X_2015,X_2016,X_2017,X_2018,X_2019,X_2020,X_2021,X_2022
0,,,-13.065653,14.618481,,,,,,,
1,,-12.572276,,14.293355,,,,,,,
2,,,,,,,,8.81476,-7.521475,-2.076063,-7.033657
3,,,,,,,,,13.103435,-5.013953,
4,,,,,,,13.390295,-11.373909,-0.918576,,-6.967733


Recorramos hasta la izquierda todos los valores del dataframe para facilitar los algoritmos

In [69]:
def _left_align_values(frame, prefix):
    """Pack the non-missing values of every row to the left.

    Parameters
    ----------
    frame : DataFrame of numeric values, possibly containing NaN.
    prefix : text placed before the 1-based position in the new column names.

    Returns
    -------
    A new DataFrame with the same index and shape as `frame`, in which each
    row lists its observed values in their original order followed by NaN,
    and whose columns are named `<prefix>1 ... <prefix>n`.
    """
    values = frame.to_numpy(dtype = float)
    # A stable sort of the NaN mask moves observed entries (False) ahead of
    # missing ones (True) without changing their relative order.
    order = np.argsort(np.isnan(values), axis = 1, kind = 'stable')
    packed = np.take_along_axis(values, order, axis = 1)
    # Names follow the actual number of columns instead of a fixed list of 11.
    names = [prefix + str(position) for position in range(1, frame.shape[1] + 1)]
    return pd.DataFrame(packed, index = frame.index, columns = names)


X_pitcher_etl = _left_align_values(X_pitcher_etl, 'X_')

In [70]:
X_pitcher_etl.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11
0,-13.065653,14.618481,,,,,,,,,
1,-12.572276,14.293355,,,,,,,,,
2,8.81476,-7.521475,-2.076063,-7.033657,,,,,,,
3,13.103435,-5.013953,,,,,,,,,
4,13.390295,-11.373909,-0.918576,-6.967733,,,,,,,


Como tiene las mismas filas que el dataframe **YX_pitcher_panel_raw**, podemos añadir la columna de jugadores sin problemas

In [71]:
# Attach the player names (aligned on the row index) and place them first.
X_pitcher_etl['Jugador'] = YX_pitcher_panel_raw['Jugador']
X_player_column_pitcher = X_pitcher_etl.pop('Jugador')
X_pitcher_etl.insert(loc = 0, column = 'Jugador', value = X_player_column_pitcher)
X_pitcher_etl.head()

Unnamed: 0,Jugador,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11
0,A.J. Burnett,-13.065653,14.618481,,,,,,,,,
1,Aaron Harang,-12.572276,14.293355,,,,,,,,,
2,Adam Ottavino,8.81476,-7.521475,-2.076063,-7.033657,,,,,,,
3,Adam Wainwright,13.103435,-5.013953,,,,,,,,,
4,Alex Cobb,13.390295,-11.373909,-0.918576,-6.967733,,,,,,,


### Variable $Y$

$$
Y = \sqrt{\omega_{t+1}} - \sqrt{\omega_{t}}
$$

Usaremos de nuevo un dataframe auxiliar.

In [72]:
# Select the salary (Y) columns of the panel. Which side of
# split_column_pitcher they are taken from is decided by comparing the two
# column-name prefixes as strings.
# presumably the panel columns are sorted alphabetically, as is done for the hitters panel — TODO confirm
# NOTE(review): the else branch starts at 1 + period, while the equivalent
# hitters code uses 1:split_column_hitter; if split_column_pitcher equals
# 1 + period this slice is empty — confirm against the cell that defines it.
if Y_raw > X_raw_pitcher:
    Y_pitcher_etl = YX_pitcher_panel_raw.iloc[:, split_column_pitcher:YX_pitcher_panel_raw.shape[1]]
else:
    Y_pitcher_etl = YX_pitcher_panel_raw.iloc[:, 1 + period:split_column_pitcher]

In [73]:
# Preview the selected salary columns (one per season).
Y_pitcher_etl.head()

Unnamed: 0,Sueldo_ajustado_t_2012,Sueldo_ajustado_t_2013,Sueldo_ajustado_t_2014,Sueldo_ajustado_t_2015,Sueldo_ajustado_t_2016,Sueldo_ajustado_t_2017,Sueldo_ajustado_t_2018,Sueldo_ajustado_t_2019,Sueldo_ajustado_t_2020,Sueldo_ajustado_t_2021,Sueldo_ajustado_t_2022
0,,,14287500.0,10795000.0,,,,,,,
1,,7095000.0,,6350000.0,,,,,,,
2,,,,,,,,10620000.0,4557407.45,8236550.16,5000000.0
3,,,,,,,,,3407407.45,9120000.0,
4,,,,,,,9000000.0,11210000.0,5269515.05,,9000000.0


In [74]:
# Pack each row's non-NaN salaries into the leftmost columns, keeping their
# chronological order, so that column w_n holds every player's n-th salary
# and the freed positions on the right are NaN.
# A stable argsort of the NaN mask does this for all rows at once: entries that
# are not NaN (False) sort before NaN (True) and keep their relative order.
# A new frame is built instead of writing cell by cell into Y_pitcher_etl,
# which is a slice of YX_pitcher_panel_raw; the panel is therefore left untouched.
salary_pitcher_values = Y_pitcher_etl.to_numpy(dtype = float)
salary_pitcher_order = np.argsort(np.isnan(salary_pitcher_values), axis = 1, kind = 'stable')

Y_pitcher_etl = pd.DataFrame(
    np.take_along_axis(salary_pitcher_values, salary_pitcher_order, axis = 1),
    index = Y_pitcher_etl.index,
    # w_1 ... w_n, one name per existing column instead of a fixed list of 11
    columns = ['w_' + str(n) for n in range(1, salary_pitcher_values.shape[1] + 1)],
)

In [75]:
# Attach the player names taken from the panel, then move 'Jugador' to the
# first column position (pop removes it, insert puts it back at index 0).
Y_pitcher_etl[['Jugador']] = YX_pitcher_panel_raw[['Jugador']]
Y_player_column_pitcher = Y_pitcher_etl.pop('Jugador')
Y_pitcher_etl.insert(0, 'Jugador', Y_player_column_pitcher)
Y_pitcher_etl.head()

Unnamed: 0,Jugador,w_1,w_2,w_3,w_4,w_5,w_6,w_7,w_8,w_9,w_10,w_11
0,A.J. Burnett,14287500.0,10795000.0,,,,,,,,,
1,Aaron Harang,7095000.0,6350000.0,,,,,,,,,
2,Adam Ottavino,10620000.0,4557407.45,8236550.16,5000000.0,,,,,,,
3,Adam Wainwright,3407407.45,9120000.0,,,,,,,,,
4,Alex Cobb,9000000.0,11210000.0,5269515.05,9000000.0,,,,,,,


In [76]:
# Work on a copy so the salary table (w_1 ... w_11) stays intact while the
# copy is overwritten with the Y values; keep the player names at hand.
Y_pitcher_etl_copy = Y_pitcher_etl.copy()
y_etl_names = Y_pitcher_etl_copy['Jugador']

Apliquemos el algoritmo para calcular $Y$.

In [77]:
# Y_n = sqrt(w_{n+1}) - sqrt(w_n): change in the square root of the salary
# between a player's consecutive observations.
# The w_* columns are already packed to the left (a row has no gaps between
# its salaries), so subtracting neighbouring columns gives one Y per
# consecutive pair; wherever the next salary is missing the result is NaN,
# and the last column is always NaN because it has no successor.
# This replaces the row loop, which used integer keys on a label-indexed Series
# (row[j]) and reused `k` both as the counter and as a loop variable.
salary_roots = np.sqrt(Y_pitcher_etl_copy.drop(columns = 'Jugador').to_numpy(dtype = float))
salary_changes = np.full_like(salary_roots, np.nan)
salary_changes[:, :-1] = salary_roots[:, 1:] - salary_roots[:, :-1]

Y_pitcher_etl_copy = pd.concat(
    [
        Y_pitcher_etl_copy[['Jugador']],
        pd.DataFrame(
            salary_changes,
            index = Y_pitcher_etl_copy.index,
            # Y_1 ... Y_n, one name per salary column instead of a fixed list of 11
            columns = ['Y_' + str(n) for n in range(1, salary_roots.shape[1] + 1)],
        ),
    ],
    axis = 1,
)

In [78]:
# From here on Y_pitcher_etl names the table of Y values
# (it is the same object as Y_pitcher_etl_copy, not a new copy).
Y_pitcher_etl = Y_pitcher_etl_copy
Y_pitcher_etl.head()

Unnamed: 0,Jugador,Y_1,Y_2,Y_3,Y_4,Y_5,Y_6,Y_7,Y_8,Y_9,Y_10,Y_11
0,A.J. Burnett,-494.306419,,,,,,,,,,
1,Aaron Harang,-143.723487,,,,,,,,,,
2,Adam Ottavino,-1124.02562,735.130524,-633.871074,,,,,,,,
3,Adam Wainwright,1174.017349,,,,,,,,,,
4,Alex Cobb,348.133809,-1052.591378,704.45757,,,,,,,,


Verifiquemos que ambos dataframes contengan la misma cantidad de filas

In [79]:
# Print the (rows, columns) shape of both tables, coloured cyan, to check
# that they have the same number of rows.
print("Y_pitcher_etl: \n", colored(Y_pitcher_etl.shape, "cyan"))
print("X_pitcher_etl: \n", colored(X_pitcher_etl.shape, "cyan"))

Y_pitcher_etl: 
 [36m(182, 12)[0m
X_pitcher_etl: 
 [36m(182, 12)[0m


La estrategia para construir la base de datos para la regresión consiste en el siguiente proceso:
- Crear dataframes que contengan la columna del nombre del jugador y la variable de $X$ y $Y$ del mismo subíndice.
- Hacer merge de las bases de datos que contengan el mismo subíndice.
- Concatenar todas las bases de datos.
- Eliminar las filas que contengan datos faltantes.

In [80]:
# One placeholder per subscript 1 ... period-1; each slot is filled later
# with the (Jugador, Y, X) table of that subscript.
regression_pitcher_list = [None]*(period-1)

In [81]:
# For every subscript n, join Y_n and X_n on the column both tables share
# ('Jugador') and store the result under the common names Jugador / Y / X.
for slot in range(len(regression_pitcher_list)):
    column = slot + 1  # column 0 holds 'Jugador'; subscript n sits at position n
    merge = pd.merge(
        Y_pitcher_etl.iloc[:, [0, column]],
        X_pitcher_etl.iloc[:, [0, column]],
    )
    merge.columns = ['Jugador', 'Y', 'X']
    regression_pitcher_list[slot] = merge

In [82]:
# Stack the per-subscript tables into one long table with a single concat,
# in list order. Growing the frame pair by pair copied all accumulated rows
# on every iteration; the resulting rows and index are the same.
df_article_regression_pitcher = pd.concat(regression_pitcher_list)

In [83]:
# Order the rows by player name and renumber them from 0, then display.
df_article_regression_pitcher = (
    df_article_regression_pitcher
    .sort_values(by = 'Jugador', ascending = True)
    .reset_index(drop = True)
)
df_article_regression_pitcher

Unnamed: 0,Jugador,Y,X
0,A.J. Burnett,-494.306419,-13.065653
1,A.J. Burnett,,
2,A.J. Burnett,,
3,A.J. Burnett,,
4,A.J. Burnett,,
...,...,...,...
1815,Zack Greinke,910.513434,-13.556132
1816,Zack Greinke,-111.589325,-11.907685
1817,Zack Greinke,795.545642,-14.226232
1818,Zack Greinke,,


Por último, eliminemos las filas que contengan datos *NaN*.

In [84]:
# Discard every row with a missing Y or X and renumber the rows from 0,
# then display the final regression table.
df_article_regression_pitcher = (
    df_article_regression_pitcher
    .dropna()
    .reset_index(drop = True)
)
df_article_regression_pitcher

Unnamed: 0,Jugador,Y,X
0,A.J. Burnett,-494.306419,-13.065653
1,Aaron Harang,-143.723487,-12.572276
2,Adam Ottavino,-1124.025620,8.814760
3,Adam Ottavino,-633.871074,-2.076063
4,Adam Ottavino,735.130524,-7.521475
...,...,...,...
376,Zack Greinke,-2781.936494,-13.556132
377,Zack Greinke,-78.388783,14.923136
378,Zack Greinke,910.513434,-13.556132
379,Zack Greinke,-111.589325,-11.907685


In [85]:
# Print the (rows, columns) shape of the regression table, coloured cyan.
# NOTE(review): the label says "jugadores", but a player can contribute several
# rows, so the first number counts observations rather than distinct players.
print("Cantidad de jugadores para la regresión: \n", colored(df_article_regression_pitcher.shape, "cyan"))

Cantidad de jugadores para la regresión: 
 [36m(381, 3)[0m


## Replicación del algoritmo

Ahora, repetiremos en un solo bloque de código el mismo proceso para las estadísticas deportivas de los hitters

In [86]:
# Settings for the hitters run.
# starting_year: year attached to the first season's columns (starting_year + 0).
starting_year = 2012
# X_raw_hitter: column used as the raw sporting statistic.
X_raw_hitter = 'Home-runs'
# Y_raw: salary column (it is the one adjusted for inflation below).
Y_raw = 'Sueldo'
# jugador: player-name column, selected together with the two above.
jugador = 'Jugador'

In [82]:
# Hitters pipeline: builds df_article_regression_hitter with one row per
# player and subscript n, holding Y_n (change in sqrt of salary) and X_n
# (signed, scaled statistic). It mirrors the pitchers cells above.

# Per-season tables holding only the columns needed for the regression
YX_hitter_raw = [None]*period
for year in range(0, period):
    # .copy() gives an independent frame, so the salary adjustment and the
    # renames below never act on a slice of df_hitters_copy
    YX_hitter_raw[year] = df_hitters_copy[year][[jugador, Y_raw, X_raw_hitter]].copy()

# Salary inflation adjustment: each season's salary is multiplied by its factor
cpi = {2012:1.31, 2013:1.29,
       2014:1.27, 2015:1.27,
       2016:1.26, 2017:1.23,
       2018:1.20, 2019:1.18,
       2020:1.15, 2021:1.14,
       2022:1}

for year in range(0, period):
    YX_hitter_raw[year][[Y_raw]] = YX_hitter_raw[year][[Y_raw]]*cpi[starting_year + year]
    # Suffix both columns with the season. (The former rename keyed on `anio`
    # was dropped: that name is not defined in this section and no such column
    # is among the three selected above.)
    YX_hitter_raw[year].rename(columns = {Y_raw: Y_raw + '_' + str(starting_year + year)}, inplace = True)
    YX_hitter_raw[year].rename(columns = {X_raw_hitter: X_raw_hitter + '_' + str(starting_year + year)}, inplace = True)

# Panel: outer-merge all seasons on the player name
YX_hitter_panel_raw = pd.merge(YX_hitter_raw[0], YX_hitter_raw[1], how = 'outer', on = 'Jugador')

for year in range(2,period):
    YX_hitter_panel_raw = pd.merge(YX_hitter_panel_raw, YX_hitter_raw[year], how = 'outer', on = 'Jugador')

YX_hitter_panel_raw.drop_duplicates(subset = 'Jugador', inplace = True)
YX_hitter_panel_raw = YX_hitter_panel_raw.sort_values(by = 'Jugador', ascending = True)
YX_hitter_panel_raw.reset_index(drop = True, inplace = True)
# Alphabetical column order: groups the columns of each variable together
YX_hitter_panel_raw.sort_index(axis = 1, inplace = True)

# Move 'Jugador' to the first column
player_column_hitter = YX_hitter_panel_raw.pop('Jugador')
YX_hitter_panel_raw.insert(0, 'Jugador', player_column_hitter)

# Auxiliary tables used to find the players with too many NaN.
# After the column sort, the variable whose name sorts first occupies columns
# 1 ... period and the other one the remaining columns; `Y_raw > X_raw_hitter`
# tells which is which.
split_column_hitter = 1 + period
max_nan = 2

if Y_raw > X_raw_hitter:
    X_hitter_drop_etl = YX_hitter_panel_raw.iloc[:, 1:split_column_hitter]
else:
    # Upper bound comes from the panel (X_hitter_drop_etl is not assigned yet
    # at this point, so it cannot be read here)
    X_hitter_drop_etl = YX_hitter_panel_raw.iloc[:, split_column_hitter:YX_hitter_panel_raw.shape[1]]

# A player is dropped when more than period - max_nan values are missing
nulls_quantity = period - max_nan
X_hitter_drop_etl = X_hitter_drop_etl.iloc[X_hitter_drop_etl[(X_hitter_drop_etl.isnull().sum(axis = 1) > nulls_quantity)].index]
X_hitter_index_drop = X_hitter_drop_etl.index

if Y_raw > X_raw_hitter:
    Y_hitter_drop_etl = YX_hitter_panel_raw.iloc[:, split_column_hitter:YX_hitter_panel_raw.shape[1]]
else:
    Y_hitter_drop_etl = YX_hitter_panel_raw.iloc[:, 1:split_column_hitter]
Y_hitter_drop_etl = Y_hitter_drop_etl.iloc[Y_hitter_drop_etl[(Y_hitter_drop_etl.isnull().sum(axis = 1) > nulls_quantity)].index]
Y_hitter_index_drop = Y_hitter_drop_etl.index

# Index of the players failing the NaN threshold in either variable
hitter_index_drop = list(set(X_hitter_index_drop).union(set(Y_hitter_index_drop)))

YX_hitter_panel_raw.drop(index = hitter_index_drop, inplace = True)
YX_hitter_panel_raw.reset_index(drop = True, inplace = True)
YX_hitter_panel_raw = YX_hitter_panel_raw.sort_values(by = 'Jugador', ascending = True)

# Drop the players whose maximum statistic equals 0
if Y_raw > X_raw_hitter:
    YX_hitter_panel_raw[X_raw_hitter + '_max'] = YX_hitter_panel_raw.iloc[:, 1:split_column_hitter].max(axis = 1)
else:
    YX_hitter_panel_raw[X_raw_hitter + '_max'] = YX_hitter_panel_raw.iloc[:, split_column_hitter:YX_hitter_panel_raw.shape[1]].max(axis = 1)
YX_hitter_max_0 = YX_hitter_panel_raw[YX_hitter_panel_raw[X_raw_hitter + '_max'] == 0]
YX_hitter_max_0_index_drop = YX_hitter_max_0.index
YX_hitter_panel_raw.drop(columns = X_raw_hitter + '_max', inplace = True)
YX_hitter_panel_raw.drop(index = YX_hitter_max_0_index_drop, inplace = True)
YX_hitter_panel_raw.reset_index(drop = True, inplace = True)

# Variable X
# .copy(): columns are added below, which must not happen on a slice of the panel
if Y_raw > X_raw_hitter:
    X_hitter_etl = YX_hitter_panel_raw.iloc[:, 1:split_column_hitter].copy()
else:
    X_hitter_etl = YX_hitter_panel_raw.iloc[:, split_column_hitter:YX_hitter_panel_raw.shape[1]].copy()
hitter_max_element = X_hitter_etl.max(axis = 1)

# Dummy names follow the same years as the renamed columns (starting_year + year)
dummy_hitter_names = []
for year in range(0,period):
    dummy_hitter_names.append("I_" + str(starting_year + year))

# I_<year> = 0 when that season holds the player's maximum, 1 when it holds
# any other value, NaN when the season is missing
for year in range(0,period):
    X_hitter_conditions = [
    (X_hitter_etl.iloc[:,year] == hitter_max_element) & ~(X_hitter_etl.iloc[:,year].isnull()),
    (X_hitter_etl.iloc[:,year] != hitter_max_element) & ~(X_hitter_etl.iloc[:,year].isnull())
    ]
    X_hitter_conditions_values = [0,1]
    X_hitter_etl[dummy_hitter_names[year]] = np.select(X_hitter_conditions, X_hitter_conditions_values, default = np.nan)

X_hitter_aux = X_hitter_etl.iloc[:,:period].copy()
Dummy_hitter = X_hitter_etl.iloc[:,period:]
X_hittter_max = X_hitter_aux.max(axis = 1)
# Plain single-column assignment of the Series
X_hitter_aux['Max'] = np.sqrt(X_hittter_max)

# X_<year> = statistic * (-1)**I_<year>, later divided by sqrt(max statistic)
for year in range(0,period):
    X_hitter_etl['X_' + str(starting_year + year)] = \
    (X_hitter_aux[X_raw_hitter + '_' + str(starting_year + year)]*
    ((-1)**Dummy_hitter['I_' + str(starting_year + year)]))
X_hitter_etl = X_hitter_etl.div(X_hitter_aux['Max'].values, axis = 0)
# Keep only the X_<year> columns (drop the raw statistic and the dummies)
X_hitter_etl = X_hitter_etl.iloc[:,2*period:]

# Pack each row's non-NaN values to the left, keeping their order: a stable
# argsort of the NaN mask puts non-NaN entries first without reordering them
X_hitter_values = X_hitter_etl.to_numpy(dtype = float)
X_hitter_order = np.argsort(np.isnan(X_hitter_values), axis = 1, kind = 'stable')
X_hitter_etl = pd.DataFrame(
    np.take_along_axis(X_hitter_values, X_hitter_order, axis = 1),
    index = X_hitter_etl.index,
    columns = ['X_' + str(n) for n in range(1, X_hitter_values.shape[1] + 1)],
)
X_player_column_hitter = YX_hitter_panel_raw['Jugador']
X_hitter_etl.insert(0, 'Jugador', X_player_column_hitter)

# Variable Y
if Y_raw > X_raw_hitter:
    Y_hitter_etl = YX_hitter_panel_raw.iloc[:, split_column_hitter:YX_hitter_panel_raw.shape[1]]
else:
    Y_hitter_etl = YX_hitter_panel_raw.iloc[:, 1:split_column_hitter]

# Salaries packed to the left (w_1 is each player's first observed salary);
# a new frame is built, so the panel itself is not written to
salary_hitter_values = Y_hitter_etl.to_numpy(dtype = float)
salary_hitter_order = np.argsort(np.isnan(salary_hitter_values), axis = 1, kind = 'stable')
salary_hitter_packed = np.take_along_axis(salary_hitter_values, salary_hitter_order, axis = 1)
Y_hitter_etl = pd.DataFrame(
    salary_hitter_packed,
    index = Y_hitter_etl.index,
    columns = ['w_' + str(n) for n in range(1, salary_hitter_values.shape[1] + 1)],
)
Y_player_column_hitter = YX_hitter_panel_raw['Jugador']
Y_hitter_etl.insert(0, 'Jugador', Y_player_column_hitter)

# Y_n = sqrt(w_{n+1}) - sqrt(w_n); NaN wherever the next salary is missing,
# and always NaN in the last column, which has no successor
salary_hitter_roots = np.sqrt(salary_hitter_packed)
salary_hitter_changes = np.full_like(salary_hitter_roots, np.nan)
salary_hitter_changes[:, :-1] = salary_hitter_roots[:, 1:] - salary_hitter_roots[:, :-1]
Y_hitter_etl_copy = pd.DataFrame(
    salary_hitter_changes,
    index = Y_hitter_etl.index,
    columns = ['Y_' + str(n) for n in range(1, salary_hitter_roots.shape[1] + 1)],
)
Y_hitter_etl_copy.insert(0, 'Jugador', Y_hitter_etl['Jugador'])
y_etl_names = Y_hitter_etl_copy['Jugador']
Y_hitter_etl = Y_hitter_etl_copy

# Regression table: pair Y_n with X_n for n = 1 ... period-1 (joined on
# 'Jugador', the only column both tables share), stack, sort, drop NaN rows
regression_hitter_list = [None]*(period-1)
for i in range(0,len(regression_hitter_list)):
    merge = pd.merge(Y_hitter_etl.iloc[:,[0,i+1]], X_hitter_etl.iloc[:,[0,i+1]])
    regression_hitter_list[i] = merge
    regression_hitter_list[i].columns = ['Jugador', 'Y', 'X']
# Single concat instead of growing the frame pair by pair
df_article_regression_hitter = pd.concat(regression_hitter_list)
df_article_regression_hitter = df_article_regression_hitter.sort_values(by = 'Jugador', ascending = True)
df_article_regression_hitter.reset_index(drop = True, inplace = True)
df_article_regression_hitter.dropna(inplace = True)
df_article_regression_hitter.reset_index(drop = True, inplace = True)

In [83]:
# Display the hitters regression table (columns Jugador, Y, X).
df_article_regression_hitter

Unnamed: 0,Jugador,Y,X
0,A.J. Burnett,-471.040165,-0.0
1,A.J. Pierzynski,1218.568225,-1.212678
2,A.J. Pierzynski,199.175332,-2.182821
3,A.J. Pierzynski,-2515.731853,4.123106
4,A.J. Pollock,300.847677,-2.630384
...,...,...,...
356,Zack Greinke,-8.745908,-0.0
357,Zack Greinke,-64.819869,-0.0
358,Zack Greinke,849.383256,-1.154701
359,Zack Greinke,-108.975642,-0.57735


Vemos que el experimento fue un éxito, por lo que ya podemos pasar a una función este algoritmo.

## Función de ETL

Puesto que ya funcionó el algoritmo, ahora podemos pasarlo a una función que se pueda utilizar para cualquier tipo de base de datos. Por otro lado, haremos una modificación para que regrese la base de datos generada para la regresión y a la vez que también regrese al modelo con los estimadores

In [84]:
def _pack_left(frame, prefix):
    """Return `frame` with each row's non-NaN values moved to the leftmost columns.

    The relative order of the values inside a row is kept, the freed positions
    on the right are NaN, and the columns are renamed `<prefix>1 ... <prefix>n`.
    The row index of `frame` is preserved.
    """
    values = frame.to_numpy(dtype = float)
    # Stable sort of the NaN mask: non-NaN entries (False) come first and keep
    # their original order.
    order = np.argsort(np.isnan(values), axis = 1, kind = 'stable')
    return pd.DataFrame(
        np.take_along_axis(values, order, axis = 1),
        index = frame.index,
        columns = [prefix + str(n) for n in range(1, values.shape[1] + 1)],
    )


def etl_regression(X_raw, Y_raw, df_copy, first_year = 2011, inflation_pct = None):
    """Build the regression table for one statistic and fit Y ~ const + X by OLS.

    For every player, the salaries and the statistic of the seasons in which
    they are observed are packed in chronological order (n = 1, 2, ...) and
        Y_n = sqrt(w_{n+1}) - sqrt(w_n)          (w = inflation-adjusted salary)
        X_n = s_n * sign / sqrt(max statistic)   (s = raw statistic)
    where sign is +1 for the season(s) holding the player's maximum statistic
    and -1 otherwise. Rows with a missing Y or X are discarded.

    Parameters
    ----------
    X_raw : str
        Name of the statistic column in each season table.
    Y_raw : str
        Name of the salary column in each season table.
    df_copy : sequence of pandas.DataFrame
        One table per season, indexed 0 ... period-1, each containing the
        columns 'Jugador', `Y_raw` and `X_raw`. The tables are not modified.
    first_year : int, optional
        Year assigned to `df_copy[0]`; season i is labelled `first_year + i`.
        Defaults to 2011, the value previously hard-coded.
    inflation_pct : dict, optional
        Maps each year to a percentage p; the salary of that season is
        multiplied by (1 + p/100). Defaults to the built-in 2011-2021 table.

    Returns
    -------
    list
        `[df_article_regression, model]`: the table with columns
        'Jugador', 'Y', 'X' and the fitted statsmodels OLS results.

    Raises
    ------
    KeyError
        If a season's year is missing from `inflation_pct` or a table lacks
        one of the three required columns.

    Notes
    -----
    Reads the notebook-level `period` (number of seasons) and `sm`.
    NOTE(review): the defaults label the seasons 2011-2021, while the analysis
    cells of this notebook use starting_year = 2012 with a different
    adjustment table (`cpi`). Presumably callers should pass
    `first_year = starting_year` and a matching `inflation_pct` — confirm
    which calendar `df_copy` follows.
    """
    jugador = 'Jugador'
    if inflation_pct is None:
        inflation_pct = {2011:20.46, 2012:18.02, 2013:16.32, 2014:14.46,
                         2015:14.33, 2016:12.90, 2017:10.55, 2018:7.86,
                         2019:5.99, 2020:4.70, 2021:1}

    years = [first_year + offset for offset in range(period)]
    x_columns = [X_raw + '_' + str(year) for year in years]
    y_columns = [Y_raw + '_' + str(year) for year in years]

    # One table per season: adjust the salary and suffix both columns with the year
    YX_raw = []
    for offset, year in enumerate(years):
        season = df_copy[offset][[jugador, Y_raw, X_raw]].copy()
        season[Y_raw] = season[Y_raw]*(1 + inflation_pct[year]*0.01)
        YX_raw.append(season.rename(columns = {Y_raw: Y_raw + '_' + str(year),
                                               X_raw: X_raw + '_' + str(year)}))

    # Panel: one row per player, outer-merged over all seasons
    YX_panel_raw = YX_raw[0]
    for season in YX_raw[1:]:
        YX_panel_raw = pd.merge(YX_panel_raw, season, how = 'outer', on = jugador)
    YX_panel_raw = (
        YX_panel_raw
        .drop_duplicates(subset = jugador)
        .sort_values(by = jugador, ascending = True)
        .reset_index(drop = True)
    )

    # Drop the players with more than period - 2 missing values in either
    # variable. Columns are addressed by name, so no assumption about their
    # position in the panel is needed.
    max_nan = 2
    nulls_quantity = period - max_nan
    too_sparse = (
        (YX_panel_raw[x_columns].isnull().sum(axis = 1) > nulls_quantity)
        | (YX_panel_raw[y_columns].isnull().sum(axis = 1) > nulls_quantity)
    )
    YX_panel_raw = YX_panel_raw[~too_sparse].reset_index(drop = True)

    # Drop the players whose maximum statistic equals 0 (X would divide by zero)
    nonzero_max = YX_panel_raw[x_columns].max(axis = 1) != 0
    YX_panel_raw = YX_panel_raw[nonzero_max].reset_index(drop = True)

    # Variable X: statistic * sign / sqrt(max); a missing season stays NaN
    # because the statistic itself is NaN there
    stats = YX_panel_raw[x_columns]
    stat_max = stats.max(axis = 1)
    sign = np.where(stats.eq(stat_max, axis = 0), 1.0, -1.0)
    X_etl = _pack_left(stats.mul(sign).div(np.sqrt(stat_max), axis = 0), 'X_')
    X_etl.insert(0, jugador, YX_panel_raw[jugador])

    # Variable Y: differences of the square roots of consecutive packed salaries;
    # NaN wherever the next salary is missing, and always in the last column
    salary_roots = np.sqrt(_pack_left(YX_panel_raw[y_columns], 'w_').to_numpy())
    salary_changes = np.full_like(salary_roots, np.nan)
    salary_changes[:, :-1] = salary_roots[:, 1:] - salary_roots[:, :-1]
    Y_etl = pd.DataFrame(
        salary_changes,
        index = YX_panel_raw.index,
        columns = ['Y_' + str(n) for n in range(1, salary_roots.shape[1] + 1)],
    )
    Y_etl.insert(0, jugador, YX_panel_raw[jugador])

    # Regression table: pair Y_n with X_n for n = 1 ... period-1, stack the
    # pairs with a single concat, sort by player and drop incomplete rows
    regression_list = []
    for column in range(1, period):
        paired = pd.merge(Y_etl.iloc[:, [0, column]], X_etl.iloc[:, [0, column]], on = jugador)
        paired.columns = ['Jugador', 'Y', 'X']
        regression_list.append(paired)
    df_article_regression = (
        pd.concat(regression_list)
        .sort_values(by = 'Jugador', ascending = True)
        .dropna()
        .reset_index(drop = True)
    )

    # Fit the regression with an intercept
    Y = df_article_regression['Y'].tolist()
    X = df_article_regression['X'].tolist()
    X = sm.add_constant(X)
    model = sm.OLS(Y, X).fit()

    # Return the table together with the fitted model
    return [df_article_regression, model]

In [85]:
# Arguments for the etl_regression call on the pitchers data:
# statistic column, salary column and the list of per-season tables.
X_raw_pitcher = 'Inning_pitched'
Y_raw = 'Sueldo'
df_copy = df_pitchers_copy

In [86]:
# Returns [regression table, fitted OLS results]; the list is shown as the
# cell output because the result is not assigned to a name.
etl_regression(X_raw_pitcher, Y_raw, df_copy)

[           Jugador           Y          X
 0     A.J. Burnett -471.040165  14.618481
 1     Aaron Harang  647.699528  13.405223
 2     Aaron Harang -138.427417 -10.689863
 3    Adam Ottavino -368.340054  -2.042177
 4    Adam Ottavino  -18.852808  -7.398706
 ..             ...         ...        ...
 420   Zack Greinke    6.483182 -13.528012
 421   Zack Greinke  -75.049658 -13.113161
 422   Zack Greinke  -64.819869 -10.286993
 423   Zack Greinke  740.407614 -13.113161
 424   Zack Greinke  754.087221  -11.51858
 
 [425 rows x 3 columns],
 <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f43c68fea90>]