# Modelo lineal del artículo

En este modelo que es de la siguiente forma:

necesitaremos la base de datos que contruimos con toda la información relevante de los equipos!

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import math
import os
import warnings
print('Modulos importados')

Modulos importados


In [2]:
# Configuraciones
warnings.filterwarnings('ignore')
# Reduzcamos el número de línea a leer
pd.options.display.max_rows = 15

In [3]:
# Veamos el directorio actual de trabajo
print(os.getcwd())
# El directorio anterior es el correcto, pero si no lo fuese, hacemos lo sigueinte:
path = '/home/usuario/Documentos/Github/Proyectos/MLB_HN'
os.chdir(path)

/home/usuario/Documentos/Github/Proyectos/MLB_HN/Models/Linear_models


Leamos los archivos correspondientes

In [4]:
# Años de análisis
period = 11
csv = '.csv'
# Directorios
teams_path = 'Data/New_Data/Teams/free_agents_team_'
names_path = 'Data/Teams/team_acronym'
# Originales:
names = pd.read_csv(names_path + csv)
df_team = [None]*period
df_team_copy = [None]*period
df_regression_year = [None]*(period-1)

In [5]:
for i in range(0,period):
    df_team[i] = pd.read_csv(teams_path + str(2011 + i) + csv)
    df_team_copy[i] = df_team[i].copy()

Observemos el contenido de las bases de datos

In [6]:
names.head()

Unnamed: 0,Acronimo,Equipo
0,ARI,Arizona Diamondbacks
1,ATL,Atlanta Braves
2,BAL,Baltimore Orioles
3,BOS,Boston Red Sox
4,CHC,Chicago Cubs


In [7]:
df_team_copy[9].head()

Unnamed: 0,Equipo,Cantidad_agentes_libres,Valor_contrato,Acronimo,Victorias,Juegos totales,Promedio_victorias
0,New York Yankees,2,336500000,NYY,33,60,0.55
1,Washington Nationals,10,316750000,WSH,26,62,0.419355
2,Los Angeles Angels,3,260850000,LAA,26,60,0.433333
3,Chicago White Sox,5,151500000,CHW,35,60,0.583333
4,Cincinnati Reds,4,144825000,CIN,31,59,0.525424


## Construcción de la base de datos para el modelo

### Variables endógenas

Primero, tenemos que determinar cuál es el mínimo y máximo del promedio de victorias de cada equipo a lo largo del periodo de análisis. Primero, creemos un diccionario que guarde en una lista ambos valores cuya llave asociada sea el nombre del equipo. Se tomará el promedio y no el número de victorias puesto que no todos los equipos juegan la misma cantidad de partidos en una temporada y habría fallos en el análisis en años como el 2020 por la pandemia que provocó que la cantidad de juegos por temporada fuiera mucho menor  a la usual, siendo alrededor de 60 juegos en dicha temporada.

Para crear este diccionario usaremos la base de datos de los acrónimos puesto que contiene todos los equipos que puedan haber en las bases de datos de los equipos en el periodo de análisis

In [8]:
minmax_victories = {}
length_names = len(names['Equipo'])
# CReemos las llaves con los nombres de los equipos
for i in range(0,length_names):
    minmax_victories[str(names['Equipo'].iloc[i])] = [None]*2

Como valores de comparación para las victorias usaremos cualquier base de datos que contenga la misma cantidad de filas que la base de datos *names*. Obtengamos el índice de las bases de datos que cumplan esa condición.

In [9]:
for i in range(0,period):
    if len(df_team_copy[i]['Equipo']) == length_names:
        print(i)

2
3
4
5
6
8
10


In [10]:
# Usaremos la tercera base de datos
team_name = list(minmax_victories.keys())
for i in range(length_names):
    for j in range(length_names):
        if team_name[i] == df_team_copy[2]['Equipo'].iloc[j]:
            minmax_victories[team_name[i]][0] = df_team_copy[2]['Promedio_victorias'].iloc[j]
            minmax_victories[team_name[i]][1] = df_team_copy[2]['Promedio_victorias'].iloc[j]

Ahora, recorreremos el promedio de victorias de todas las bases de datos para hallar el menor y mayor número de victorias que ha tenido cada equipo en el periodo de análisis.

In [11]:
for year in range(period):
    # Nombres de los equipos en la base de datos
    team_year_name = list(df_team_copy[year]['Equipo'])
    
    for i in range(length_names):
        # Mínimo actual
        min_victory = minmax_victories[team_name[i]][0]
        # Máximo actual:
        max_victory = minmax_victories[team_name[i]][1]
        
        for j in range(len(team_year_name)):
            
            if team_name[i] == team_year_name[j]:
                year_victory = df_team_copy[year]['Promedio_victorias'].iloc[j]
                
                # Mínimo nuevo:
                if year_victory < min_victory:
                    minmax_victories[team_name[i]][0] = year_victory
                # Máximo nuevo:
                elif year_victory > max_victory:
                    minmax_victories[team_name[i]][1] = year_victory

Los valores de la lista ahora son de la siguiente forma:

In [12]:
print(minmax_victories[team_name[1]])

[0.40625, 0.6024844720496895]


A continuación, se creará una columna que indique si la victoria de la temporada -correspondiente a la base de datos donde se situe- está más cercana del promedio más alto de victorias o del menor. 1 indicará que está más cercano a la mayor victoria, y 0 en otro caso -en caso de empate, se eligirá al 1.

In [13]:
for year in range(period):
    # Nombres de los equipos en la base de datos
    team_year_name = list(df_team_copy[year]['Equipo'])
    # COlumna dummy:
    df_team_copy[year]['Dummy'] = -1
    
    for i in range(length_names):
        # Mínimo actual
        min_victory = minmax_victories[team_name[i]][0]
        # Máximo actual:
        max_victory = minmax_victories[team_name[i]][1]
        
        for j in range(len(team_year_name)):
            
            if team_name[i] == team_year_name[j]:
                year_victory = df_team_copy[year]['Promedio_victorias'].iloc[j]
                
                # Distancia al mínimo:
                dist_min = year_victory - min_victory
                # Distancia al máximo:
                dist_max = max_victory - year_victory
                
                # Mínimo nuevo:
                if dist_min < dist_max:
                    df_team_copy[year]['Dummy'].iloc[j] = 1
                # Máximo nuevo
                else:  
                    df_team_copy[year]['Dummy'].iloc[j] = 0

In [14]:
df_team_copy[6].tail()

Unnamed: 0,Equipo,Cantidad_agentes_libres,Valor_contrato,Acronimo,Victorias,Juegos totales,Promedio_victorias,Dummy
24,Milwaukee Brewers,3,7550000,MIL,86,162,0.530864,0
25,Chicago White Sox,1,6000000,CHW,67,162,0.41358,1
26,Boston Red Sox,1,5500000,BOS,93,162,0.574074,0
27,Cincinnati Reds,2,5300000,CIN,68,162,0.419753,1
28,Detroit Tigers,1,2000000,DET,64,162,0.395062,1


Usemos la variable dummy para crear la columna que contenga el siguiente dato:

$$
\left( -1 \right)^{I_{t}^{-}}\frac{y_{t}}{\sqrt{y_H}}
$$

In [15]:
for year in range(period):
    # Nombres de los equipos en la base de datos
    team_year_name = list(df_team_copy[year]['Equipo'])
    # Columna de la nueva variable
    df_team_copy[year]['X'] = -1
    
    for i in range(length_names):        
        for j in range(len(team_year_name)):
            
            if team_name[i] == team_year_name[j]:
                # Variables auxiliares
                mean_victory = df_team_copy[year]['Promedio_victorias'].iloc[j]
                dummy = df_team_copy[year]['Dummy'].iloc[j]
                s_max_victory = np.power(minmax_victories[team_name[i]][1],0.5)
                if dummy == 0:
                    parity = 1
                else:
                    parity = -1
                
                # Variable dummy nueva
                df_team_copy[year]['X'].iloc[j] = parity*mean_victory/s_max_victory

In [16]:
df_team_copy[6].tail()

Unnamed: 0,Equipo,Cantidad_agentes_libres,Valor_contrato,Acronimo,Victorias,Juegos totales,Promedio_victorias,Dummy,X
24,Milwaukee Brewers,3,7550000,MIL,86,162,0.530864,0,0.689613
25,Chicago White Sox,1,6000000,CHW,67,162,0.41358,1,-0.541504
26,Boston Red Sox,1,5500000,BOS,93,162,0.574074,0,0.703094
27,Cincinnati Reds,2,5300000,CIN,68,162,0.419753,1,-0.542457
28,Detroit Tigers,1,2000000,DET,64,162,0.395062,1,-0.521412


In [17]:
df_team_copy[6].tail()

Unnamed: 0,Equipo,Cantidad_agentes_libres,Valor_contrato,Acronimo,Victorias,Juegos totales,Promedio_victorias,Dummy,X
24,Milwaukee Brewers,3,7550000,MIL,86,162,0.530864,0,0.689613
25,Chicago White Sox,1,6000000,CHW,67,162,0.41358,1,-0.541504
26,Boston Red Sox,1,5500000,BOS,93,162,0.574074,0,0.703094
27,Cincinnati Reds,2,5300000,CIN,68,162,0.419753,1,-0.542457
28,Detroit Tigers,1,2000000,DET,64,162,0.395062,1,-0.521412


Por otro lado, tenemos que traer los valores de contrato -en dolares- a valor presente del 2021 ajustado por la inflación. Usaremos la página [CPI Inflation Calculator](https://www.in2013dollars.com/)

Guardaremos los porcentajes de cambio de cada año en un diccionario.

In [18]:
change_inflation_percentage = {2011:20.46, 2012:18.02, 2013:16.32, 2014:14.46, 2015:14.33, 2016:12.90, 2017:10.55, 2018:7.86, 2019:5.99, 2020:4.70, 2021:1}

In [19]:
for year in range(period):
    # Nombres de los equipos en la base de datos
    team_year_name = list(df_team_copy[year]['Equipo'])
    # Columna de la nueva variable
    df_team_copy[year]['Contrato_ajustado_2021'] = -1
    
    for i in range(length_names):        
        for j in range(len(team_year_name)):
            
            if team_name[i] == team_year_name[j]:
                
                # Variables auxiliares
                df_team_copy[year]['Contrato_ajustado_2021'].iloc[j] = df_team_copy[year]['Valor_contrato'].iloc[j]*(1 + change_inflation_percentage[2011 + year]*0.01)

Veamos la nueva columna

In [20]:
df_team_copy[1].tail()

Unnamed: 0,Equipo,Cantidad_agentes_libres,Valor_contrato,Acronimo,Victorias,Juegos totales,Promedio_victorias,Dummy,X,Contrato_ajustado_2021
21,Kansas City Royals,2,5000000,KC,72,162,0.444444,1,-0.581656,5901000.0
22,San Francisco Giants,3,4800000,SF,94,162,0.580247,0,0.712844,5664960.0
23,Atlanta Braves,1,1000000,ATL,94,162,0.580247,0,0.747549,1180200.0
24,Chicago White Sox,1,900000,CHW,85,162,0.524691,0,0.686982,1062180.0
25,Houston Astros,1,750000,HOU,55,162,0.339506,1,-0.418415,885150.0


Por último, creemos la siguiente variable:

$$
Y = \sqrt{\omega_{t+1}} - \sqrt{\omega_{t}}
$$

In [21]:
for year in range(period-1):
    # Nombres de los equipos en la base de datos
    team_year_name_t = list(df_team_copy[year]['Equipo'])
    team_year_name_t_1 = list(df_team_copy[year+1]['Equipo'])
    # Columna de la nueva variable
    df_team_copy[year]['Y'] = -1
    
    for i in range(length_names):        
        for j in range(len(team_year_name_t)):
            for k in range(len(team_year_name_t_1)):
                if team_name[i] == team_year_name_t[j] and team_name[i] == team_year_name_t_1[k]:
                
                    # Variables auxiliares
                    df_team_copy[year]['Y'].iloc[j] = np.sqrt(df_team_copy[year+1]['Contrato_ajustado_2021'].iloc[k])
                    - np.sqrt(df_team_copy[year]['Contrato_ajustado_2021'].iloc[j])
    
    df_regression_year[year] = df_team_copy[year].iloc[:,[8,10]]

In [22]:
df_regression_year[1]

Unnamed: 0,X,Y
0,0.704166,13362.305190
1,0.716941,11196.601270
2,-0.592555,2987.902274
3,0.702782,4360.999885
4,-0.629194,13246.460659
...,...,...
21,-0.581656,5509.947368
22,0.712844,9691.666523
23,0.747549,9479.303772
24,0.686982,4149.139670


Ahora, uniremos las bases de datos desde el 2012 hasta el 2020

In [23]:
df_regression_all_years = df_regression_year[1]
for i in range(2,period-1):
    df_regression_all_years = df_regression_all_years.append(df_regression_year[i])

Ahora, notemos cuántas filas tiene este dataframe que se usará para la regresión

In [24]:
df_regression_all_years.shape

(255, 2)

#### Rescalado

Debido a la enorme diferencia de magnitud en las unidades de $X$ y $Y$ se usarán tres formas distintas de reescalado:
- Normalización:
$$
Y_{norm} = \frac{Y - Y_{min}}{Y_{max} - Y_{min}}
$$

- Estandarización:
$$
Y_{stdr} = \frac{Y - \bar{Y}}{\sigma (Y)}
$$

- Propia:
$$
Y_{propia} = \frac{Y}{Y_{max}}
$$

Crearemos tres bases de datos para cada reescalado:

In [25]:
# Normalization
df_normalization = df_regression_all_years.copy()
Y_norm_max = df_normalization['Y'].max()
Y_norm_min = df_normalization['Y'].min()
df_normalization['Y_norm'] = (df_normalization['Y'] - Y_norm_min)/(Y_norm_max - Y_norm_min)
df_normalization.head()

Unnamed: 0,X,Y,Y_norm
0,0.704166,13362.30519,0.646559
1,0.716941,11196.60127,0.541775
2,-0.592555,2987.902274,0.144612
3,0.702782,4360.999885,0.211047
4,-0.629194,13246.460659,0.640954


In [26]:
# Estandarización
df_standarization = df_regression_all_years.copy()
Y_mean = df_standarization['Y'].mean()
Y_stdr = df_standarization['Y'].std()
df_standarization['Y_stdr'] = (df_standarization['Y'] - Y_mean)/Y_stdr
df_standarization.head()

Unnamed: 0,X,Y,Y_stdr
0,0.704166,13362.30519,1.546932
1,0.716941,11196.60127,1.03643
2,-0.592555,2987.902274,-0.898532
3,0.702782,4360.999885,-0.574864
4,-0.629194,13246.460659,1.519625


In [27]:
# Propia
df_propia = df_regression_all_years.copy()
Y_own_max = df_propia['Y'].max()
df_propia['Y_propia'] = df_propia['Y']/Y_own_max 
df_propia.head()

Unnamed: 0,X,Y,Y_propia
0,0.704166,13362.30519,0.646542
1,0.716941,11196.60127,0.541753
2,-0.592555,2987.902274,0.144571
3,0.702782,4360.999885,0.211009
4,-0.629194,13246.460659,0.640936


### Variables exógenas (control)

Nos restringiremos a la base de datos que contenga solo información de los agentes libres

In [28]:
# Directorios
free_agent_batters = 'Data/New_Data/Hitters/Free_Agent/free_agents_batters_'
free_agent_pitchers = 'Data/New_Data/Pitchers/Free_Agent/free_agents_pitchers_'
# Bases de datos
batters = [None]*period
pitchers = [None]*period
control = [None]*period
standaryzed_control = [None]*period
# Copias
batters_copy = [None]*period
pitchers_copy = [None]*period

In [29]:
for i in range(0,period):
    batters[i] = pd.read_csv(free_agent_batters + str(2011 + i) + csv)
    pitchers[i] = pd.read_csv(free_agent_pitchers + str(2011 + i) + csv)
    
    batters_copy[i] = batters[i].copy()
    pitchers_copy[i] = pitchers[i].copy()

Para crear la base de datos de las variables de control, solo nos quedaremos con el equipo como varibale de clasificación. Veamos con qué columnas nos quedamos

In [30]:
batters_copy[5].head()

Unnamed: 0,Jugador,Valor_contrato,Valor_promedio_contrato,Posicion,Juegos,Porcetnaje_juegos,At-bats,Bateos,Home-runs,RBI,Porcentaje_bateo,OPS,Equipo,Sueldo
0,David Price,217000000,31000000,SP,35,0.216,10,0,0,0,0.0,0.091,BOS,30000000
1,Zack Greinke,206500000,34416667,SP,27,0.167,52,11,0,3,0.212,0.476,ARI,34000000
2,Jason Heyward,184000000,23000000,RF,142,0.877,530,122,7,49,0.23,0.631,CHC,21666666
3,Chris Davis,161000000,23000000,1B,157,0.969,566,125,38,84,0.221,0.792,BAL,21065362
4,Justin Upton,132750000,22125000,LF,153,0.95,570,140,31,87,0.246,0.775,DET,22125000


In [31]:
batters_copy[5].iloc[:,[i for i in range(5,13)]]

Unnamed: 0,Porcetnaje_juegos,At-bats,Bateos,Home-runs,RBI,Porcentaje_bateo,OPS,Equipo
0,0.216,10,0,0,0,0.000,0.091,BOS
1,0.167,52,11,0,3,0.212,0.476,ARI
2,0.877,530,122,7,49,0.230,0.631,CHC
3,0.969,566,125,38,84,0.221,0.792,BAL
4,0.950,570,140,31,87,0.246,0.775,DET
...,...,...,...,...,...,...,...,...
103,0.148,37,2,0,2,0.054,0.179,SD
104,0.401,185,41,6,26,0.222,0.632,NYM
105,0.105,31,6,0,3,0.194,0.470,CHC
106,0.148,1,0,0,0,0.000,0.000,CHW


In [32]:
pitchers_copy[5].head()

Unnamed: 0,Jugador,Valor_contrato,Valor_promedio_contrato,Posicion,Juegos,Juegos_iniciados,Inning_pitched,Bateos_pitcher,Carreras,Carreras_ganadas,Walks,Strike-outs,Wins,Losses,Saves,WHIP,ERA,Equipo,Sueldo
0,David Price,217000000,31000000,SP,35,35,230.0,227,106,102,50,228,17,9,0,1.2,3.99,BOS,30000000
1,Zack Greinke,206500000,34416667,SP,27,26,158.7,161,80,77,41,134,13,7,0,1.27,4.37,ARI,34000000
2,Johnny Cueto,130000000,21666667,SP,32,32,219.7,195,71,68,45,198,18,5,0,1.09,2.79,SF,17500000
3,Jordan Zimmermann,110000000,22000000,SP,19,18,105.3,118,63,57,26,66,9,7,0,1.37,4.87,DET,18000000
4,Jeff Samardzija,90000000,18000000,SP,34,32,203.3,190,88,86,54,167,12,11,0,1.2,3.81,SF,10800000


In [33]:
pitchers_copy[5].iloc[:,[i for i in range(4,18)]]

Unnamed: 0,Juegos,Juegos_iniciados,Inning_pitched,Bateos_pitcher,Carreras,Carreras_ganadas,Walks,Strike-outs,Wins,Losses,Saves,WHIP,ERA,Equipo
0,35,35,230.0,227,106,102,50,228,17,9,0,1.20,3.99,BOS
1,27,26,158.7,161,80,77,41,134,13,7,0,1.27,4.37,ARI
2,32,32,219.7,195,71,68,45,198,18,5,0,1.09,2.79,SF
3,19,18,105.3,118,63,57,26,66,9,7,0,1.37,4.87,DET
4,34,32,203.3,190,88,86,54,167,12,11,0,1.20,3.81,SF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,57,0,46.7,40,20,19,22,41,1,6,1,1.33,3.66,BOS
71,24,23,129.3,131,74,69,52,100,5,12,0,1.41,4.80,SD
72,44,0,35.0,35,17,17,14,40,2,2,0,1.40,4.37,NYM
73,24,23,135.0,132,61,55,35,95,5,8,0,1.24,3.67,CHW


Los agruparemos por equipo bajo el promedio de las estaísticas

In [34]:
for i in range(0,period):
    batters_cut = batters_copy[i].iloc[:,[i for i in range(5,13)]]
    pitchers_cut = pitchers_copy[i].iloc[:,[i for i in range(4,18)]]
    
    pitchers_cut.iloc[:,0:-2] = pitchers_cut.iloc[:,0:-2].astype(float)
    
    batters_group = batters_cut.groupby(by=["Equipo"]).mean()
    pitchers_group = pitchers_cut.groupby(by=["Equipo"]).mean()
    
    control[i] = pd.merge(batters_group, pitchers_group, on = 'Equipo')

In [35]:
control[4]

Unnamed: 0_level_0,Porcetnaje_juegos,At-bats,Bateos,Home-runs,RBI,Porcentaje_bateo,OPS,Juegos,Juegos_iniciados,Inning_pitched,Bateos_pitcher,Carreras,Carreras_ganadas,Walks,Strike-outs,Wins,Losses,Saves,WHIP,ERA
Equipo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ATL,0.638000,340.000000,101.333333,4.000000,34.000000,0.199333,0.506667,27.750000,1.750000,30.100000,35.250000,19.750000,18.000000,13.750000,26.500000,1.000000,2.500000,6.000000,1.5425,5.390000
BAL,0.377000,170.000000,42.000000,6.000000,20.000000,0.247000,0.738000,36.000000,0.000000,41.300000,44.000000,19.000000,19.000000,17.000000,38.000000,4.000000,2.000000,0.000000,1.4800,4.140000
BOS,0.713000,435.500000,108.000000,14.500000,50.500000,0.248000,0.690500,37.666667,2.000000,53.100000,47.333333,22.333333,21.333333,18.666667,45.000000,1.666667,2.000000,0.333333,1.1300,3.233333
CHC,0.258667,73.444444,15.000000,0.777778,5.555556,0.155000,0.402111,49.285714,22.857143,73.128571,68.571429,32.571429,30.285714,17.285714,67.285714,4.857143,3.571429,0.857143,0.8800,2.888571
CHW,0.712750,363.250000,86.750000,9.750000,40.500000,0.228750,0.666000,61.400000,23.000000,40.000000,33.200000,15.600000,13.200000,13.400000,41.600000,2.400000,2.400000,7.000000,0.9520,2.284000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SF,0.251000,24.666667,4.000000,0.666667,1.666667,0.111000,0.320667,32.250000,10.250000,76.675000,72.750000,35.750000,33.750000,23.500000,65.500000,4.250000,5.500000,0.500000,1.0500,4.035000
STL,0.540000,195.000000,44.000000,6.500000,24.000000,0.115000,0.356500,34.500000,0.000000,47.350000,42.000000,15.500000,15.000000,18.000000,40.000000,2.500000,2.000000,1.000000,1.3100,2.810000
TEX,0.220333,67.000000,15.333333,1.333333,6.666667,0.167333,0.460000,27.000000,16.500000,112.000000,116.000000,61.000000,57.000000,24.500000,80.500000,10.000000,5.000000,0.500000,1.3450,4.190000
TOR,0.724333,303.000000,73.333333,14.666667,54.000000,0.246333,0.749333,57.000000,0.000000,55.000000,46.000000,15.000000,12.000000,12.000000,61.000000,1.000000,3.000000,1.000000,1.0500,1.960000


Dividamos todas las columnas entre el máximo de la columna para obtener datos estandarizados con respecto a la unidad

In [36]:
for year in range(0,period):
    # Nombres de los equipos en la base de datos
    column_name = control[year].columns
    standaryzed_control[year] = control[year][column_name]
    
    for i in column_name:        
        standaryzed_control[year][column_name] = control[year][column_name]/control[year][column_name].max()

In [37]:
standaryzed_control[2]

Unnamed: 0_level_0,Porcetnaje_juegos,At-bats,Bateos,Home-runs,RBI,Porcentaje_bateo,OPS,Juegos,Juegos_iniciados,Inning_pitched,Bateos_pitcher,Carreras,Carreras_ganadas,Walks,Strike-outs,Wins,Losses,Saves,WHIP,ERA
Equipo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ARI,0.581510,0.505859,0.470982,0.315000,0.464286,0.451431,0.561686,0.289474,0.666667,0.637695,0.682203,0.717172,0.715789,0.280000,0.506667,0.277778,0.846154,0.000000,0.835052,0.722488
BAL,0.522701,0.495660,0.425595,0.303333,0.261905,0.286980,0.375405,0.519737,0.454545,0.539443,0.425847,0.510101,0.484211,0.466667,0.620000,0.416667,0.538462,0.238095,0.736082,0.523126
BOS,0.872467,0.891741,0.826531,1.000000,1.000000,0.513292,0.730692,0.960526,0.000000,0.350968,0.139831,0.101010,0.094737,0.120000,0.673333,0.222222,0.076923,1.000000,0.346392,0.173844
CHC,0.513997,0.404080,0.351190,0.478333,0.366071,0.360600,0.472018,0.355263,0.494949,0.502283,0.456215,0.585859,0.564912,0.457778,0.542222,0.277778,0.666667,0.000000,0.731959,0.671983
CHW,0.087509,0.046875,0.008929,0.000000,0.035714,0.114519,0.204594,1.000000,0.000000,0.286726,0.271186,0.232323,0.221053,0.306667,0.306667,0.111111,0.307692,0.000000,0.884536,0.497608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SEA,0.621030,0.472656,0.404018,0.542500,0.352679,0.261247,0.334833,0.434211,0.984848,0.951110,0.870763,0.939394,0.905263,0.686667,0.973333,0.694444,0.846154,0.000000,0.807216,0.631579
SF,0.862385,1.000000,1.000000,0.245000,0.544643,0.592025,0.698556,0.513158,0.000000,0.159188,0.114407,0.141414,0.147368,0.226667,0.140000,0.055556,0.384615,0.000000,0.810309,0.596491
TB,0.890967,0.815755,0.752232,0.700000,0.736607,0.406442,0.522851,0.780702,0.252525,0.460242,0.384181,0.481481,0.466667,0.422222,0.560000,0.203704,0.589744,0.031746,0.758763,0.605529
TEX,0.756057,0.711806,0.654762,0.863333,0.672619,0.542604,0.764386,0.572368,0.000000,0.171705,0.114407,0.126263,0.126316,0.226667,0.253333,0.138889,0.115385,0.000000,0.770103,0.507974


## Regresiones

### Lineal -artículo-

Se calcularán por OLS para los distintos tipos de reescalado

#### Normalización

In [38]:
y_norm = df_normalization['Y_norm'].tolist()
x_norm = df_normalization['X'].tolist()

x_norm = sm.add_constant(x_norm)

linear_model_norm = sm.OLS(y_norm, x_norm).fit()

#### Estandarización

In [39]:
y_stdr = df_standarization['Y_stdr'].tolist()
x_stdr = df_standarization['X'].tolist()

x_stdr = sm.add_constant(x_stdr)

linear_model_stdr = sm.OLS(y_stdr, x_stdr).fit()

In [40]:
linear_model_stdr.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,3.002
Date:,"Tue, 05 Jul 2022",Prob (F-statistic):,0.0844
Time:,17:10:48,Log-Likelihood:,-359.82
No. Observations:,255,AIC:,723.6
Df Residuals:,253,BIC:,730.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0095,0.063,-0.152,0.879,-0.133,0.114
x1,0.1670,0.096,1.733,0.084,-0.023,0.357

0,1,2,3
Omnibus:,33.323,Durbin-Watson:,1.831
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42.166
Skew:,0.96,Prob(JB):,6.98e-10
Kurtosis:,3.529,Cond. No.,1.55


#### Propia

In [41]:
y_propia = df_propia['Y_propia'].tolist()
x_propia = df_propia['X'].tolist()

x_propia = sm.add_constant(x_propia)

linear_model_propia = sm.OLS(y_propia, x_propia).fit()

In [42]:
linear_model_propia.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,3.002
Date:,"Tue, 05 Jul 2022",Prob (F-statistic):,0.0844
Time:,17:10:48,Log-Likelihood:,43.955
No. Observations:,255,AIC:,-83.91
Df Residuals:,253,BIC:,-76.83
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3271,0.013,25.444,0.000,0.302,0.352
x1,0.0343,0.020,1.733,0.084,-0.005,0.073

0,1,2,3
Omnibus:,33.323,Durbin-Watson:,1.831
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42.166
Skew:,0.96,Prob(JB):,6.98e-10
Kurtosis:,3.529,Cond. No.,1.55


#### Sin reescalar

In [43]:
y_no_scale = df_regression_all_years['Y'].tolist()
x_no_scale = df_regression_all_years['X'].tolist()

x_no_scale = sm.add_constant(x_no_scale)

linear_model_noscale = sm.OLS(y_no_scale, x_no_scale).fit()

In [44]:
linear_model_noscale.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,3.002
Date:,"Tue, 05 Jul 2022",Prob (F-statistic):,0.0844
Time:,17:10:48,Log-Likelihood:,-2489.8
No. Observations:,255,AIC:,4984.0
Df Residuals:,253,BIC:,4991.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6759.2834,265.651,25.444,0.000,6236.114,7282.453
x1,708.4054,408.869,1.733,0.084,-96.815,1513.626

0,1,2,3
Omnibus:,33.323,Durbin-Watson:,1.831
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42.166
Skew:,0.96,Prob(JB):,6.98e-10
Kurtosis:,3.529,Cond. No.,1.55


### Naives Bayes

Se hará análogo al caso OLS

#### Normalización

In [45]:
y_norm = df_normalization[['Y_norm']].values
x_norm = df_normalization[['X']].values

y_norm = y_norm.astype('int')

X_train, X_test, y_train, y_test = train_test_split(x_norm, y_norm, test_size = 0.2, random_state = 0)

classifier_nv_norm = GaussianNB()
classifier_nv_norm.fit(X_train, y_train)

naive_bayes_model_norm = classifier_nv_norm.predict(X_test)

y_pred = classifier_nv_norm.predict(X_test)

confussion_matrix_nv_norm = confusion_matrix(y_test, y_pred)

accuracy_scores_train_nv_norm = classifier_nv_norm.score(X_train, y_train)
accuracy_scores_test_nv_norm = classifier_nv_norm.score(X_test, y_test)
print(accuracy_scores_train_nv_norm)
print(accuracy_scores_test_nv_norm)

1.0
1.0


In [46]:
confussion_matrix_nv_norm

array([[51]])

#### Estandarización

In [47]:
y_stdr = df_standarization[['Y_stdr']].values
x_stdr = df_standarization[['X']].values

y_stdr = y_stdr.astype('int')

X_train, X_test, y_train, y_test = train_test_split(x_stdr, y_stdr, test_size = 0.25, random_state = 0)

classifier_nv_stdr = GaussianNB()
classifier_nv_stdr.fit(X_train, y_train)

naive_bayes_stdr = classifier_nv_stdr.predict(X_test)

y_pred = classifier_nv_stdr.predict(X_test)

confussion_matrix_nv_stdr = confusion_matrix(y_test, y_pred)

accuracy_scores_train_nv_stdr = classifier_nv_stdr.score(X_train, y_train)
accuracy_scores_test_nv_stdr = classifier_nv_stdr.score(X_test, y_test)
print(accuracy_scores_train_nv_stdr)
print(accuracy_scores_test_nv_stdr)

0.6910994764397905
0.796875


In [48]:
confussion_matrix_nv_stdr

array([[ 0,  8,  0,  0],
       [ 0, 51,  0,  0],
       [ 0,  3,  0,  0],
       [ 0,  2,  0,  0]])

#### Propia

In [49]:
y_propia = df_propia[['Y_propia']].values
x_propia = df_propia[['X']].values

y_propia = y_propia.astype('int')

X_train, X_test, y_train, y_test = train_test_split(x_propia, y_propia, test_size = 0.25, random_state = 0)

classifier_nv_propia = GaussianNB()
classifier_nv_propia.fit(X_train, y_train)

naive_bayes_propia = classifier_nv_propia.predict(X_test)

y_pred = classifier_nv_propia.predict(X_test)

confussion_matrix_nv_propia = confusion_matrix(y_test, y_pred)

accuracy_scores_train_nv_norm = classifier_nv_propia.score(X_train, y_train)
accuracy_scores_test_nv_norm = classifier_nv_propia.score(X_test, y_test)
print(accuracy_scores_train_nv_stdr)
print(accuracy_scores_test_nv_stdr)

0.6910994764397905
0.796875


In [50]:
confussion_matrix_nv_propia

array([[64]])

## Variables instrumentales

Para determinar qué variables usar como control, obtendremos la matriz de correlación. Uniremos todas las bases de datos correspondientes a las estadísticas deportivas halladas en la base de datos de las variables de control.