In [1]:
import os.path
import pandas as pd

import numpy as np
from numpy import * 

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

### 1.Preparacion de los datos

#### Lee el dataset que generamos anteriormente 

In [2]:
#Lee el excel generado en el otro documento .ipynb
path = "../DecatlonEstadistics/resources/data.xlsx"
norm_path = os.path.normpath(path) 
ddbb = pd.read_excel(norm_path)

del ddbb['1500m NF']

ddbb['Country']=ddbb['Country'].str.lower()

ddbb.columns = ['Position', 'Athlete', 'Age', 'Country', 'Total Points', 'Year', 'Competition', '100m', 
                 '100m Points', 'Lj', 'Lj Points', 'Sp', 'Sp Points', 'Hj', 'Hj Points', '400m', '400m Points', 
                 '110m H', '110m H Points', 'Dt', 'Dt Points', 'Pv', 'Pv Points', 'Jt', 'Jt Points', 
                 '1500m Points', '1500m']

ddbbData = ddbb[['100m','100m Points','Lj','Lj Points','Sp','Sp Points','Hj','Hj Points','400m Points','400m',
                 '110m H','110m H Points','Dt','Dt Points','Pv','Pv Points','Jt','Jt Points','1500m Points','1500m']]


#### Normaliza los datos del dataset entre [0,1]

In [3]:
#Normaliza los datos
normalizer = preprocessing.MinMaxScaler()
ddbbNormalized = normalizer.fit_transform(ddbbData)

#Introduce los datos normalizados en un nuevo en un dataframe
ddbbDataNorm = pd.DataFrame(ddbbNormalized)
ddbbDataNorm.columns  = ['100m','100m Points','Lj','Lj Points','Sp','Sp Points','Hj','Hj Points','400m Points','400m','110m H',
                '110m H Points','Dt','Dt Points','Pv','Pv Points','Jt','Jt Points','1500m Points','1500m']

#### Algoritmos y metricas que se van a estudiar

In [4]:
metricas = {
    'Accuracy': metrics.accuracy_score,
    'MAE':  metrics.mean_absolute_error,
    'RMSE': metrics.mean_squared_error,
    #Actualmente esta funcion solo esta incluida en versiones no estables de la libreria ScikitLearn, por eso se crea esta funcion
    'MAPE': lambda y, y_pred:
          np.mean(np.abs((y - y_pred) / y)) * 100,
}

In [5]:
#Diccionario con los algoritmos que se van a estudiar
algoritmos = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    #'REGLI': linear_model.LinearRegression(),
    'REGLO': linear_model.LogisticRegression(random_state=42),
    'GNB': GaussianNB(),
    #'KMEANS': KMeans(n_clusters=1 , random_state= 5),
    'PERCEPTRON': Perceptron(tol=1e-5, random_state=1),
    'MLP': MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1),
    #'SVM': svm.SVC(kernel='linear'),
    'DESGRA': SGDClassifier(loss="hinge", penalty="l2"),
    'ARBOL': DecisionTreeClassifier(criterion="entropy", max_depth=5),
    'BOSQUE': RandomForestClassifier(n_estimators=100)
}

In [6]:
algoritmosClust = {
    
}

In [7]:
algoritmosIA = {
    
}

### 4.Hold Out

#### Metricas para los diferentes algoritmos -> Separacion de datos HoldOut

Funcion que genera las etiquetas predecidas para los diferentes algoritmos

In [8]:
#Funcion que genera predicciones para casos de test en función de la separación hecha por holdOut
def generaModelosHO(nombre,X_train, X_test, y_train, y_test):
    
    #Estudia el algoritmo pasado
    algoritmo = algoritmos[nombre]

    #Entrena el modelo
    algoritmo.fit(X_train, y_train)

    #Predice para los datos de test
    y_pred = algoritmo.predict(X_test)
    
    #Devuelve el las predicciones para los casos de test
    return(y_pred)

Funcion que devuelve para un algoritmo (Se le pasa las etiquetas reales y las generadas por el algoritmo) dado todas sus metricas.

Se usa tanto para Hold Out como Cross Validation

In [9]:
#Funcion para la evaluacion de las diferentes metricas
def evaluaMetricas(y_test, y_pred):
    resultado = {}
    for nombre, funcion in metricas.items():
        resultado[nombre] = funcion(y_test, y_pred)
    return resultado

Funcion que devuelve el resultado de las metricas para los algoritmos definidos en la parte superior

In [10]:
def mainHoldOut(ddbbData):
    y_pred = {}
    X_train, X_test, y_train, y_test = train_test_split(ddbbData, ddbb['Position'], test_size=0.3, random_state=42)

    for nombre, funcion in algoritmos.items():
        y_pred[nombre] = generaModelosHO(nombre,X_train, X_test, y_train, y_test)
        
    resultado = {}
    for nombre, funcionA in algoritmos.items():
        resultado[nombre] = evaluaMetricas(y_test, y_pred[nombre])
    
    #Tabla de metricas para los algoritmos estudiados
    return resultado

### 5.Cross Validation

#### Metricas para los diferentes algoritmos -> Separacion de datos CrossValidation

In [11]:
def generaModelosCV(funcion, ddbbData):
    y_pred = cross_val_predict(funcion, ddbbData, ddbb['Position'], cv=KFold(n_splits=10,random_state=42,shuffle=True))
    
    return y_pred

In [12]:
def mainCrossVal(ddbbData):
    y_pred = {}

    for nombre, funcion in algoritmos.items():
        y_pred[nombre] = generaModelosCV(funcion, ddbbData)
        
    resultado = {}
    for nombre, funcionA in algoritmos.items():
        resultado[nombre] = evaluaMetricas(ddbb['Position'], y_pred[nombre])
    
    #Tabla de metricas para los algoritmos estudiados
    return resultado

### 6.Comparativas de las metricas

#### Resultados de las metricas para HoldOut y CrossValidation de cada prueba

In [13]:
#PRUEBA -> 100m Lisos
ddbbData = ddbbDataNorm[['100m','100m Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> 100m lisos')
tabla



METRICAS HOLD OUT-> 100m lisos


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.058824,0.047059,0.066667,0.027451,0.07451,0.043137,0.054902,0.066667
MAE,7.788235,7.34902,7.45098,7.890196,6.207843,7.070588,6.552941,7.015686
RMSE,101.058824,84.556863,88.196078,88.752941,61.156863,78.694118,70.623529,85.886275
MAPE,86.358991,74.873128,82.780491,207.012584,69.490304,84.703543,77.088757,114.83585


In [14]:
print()
print()
print('METRICAS CROSS VALIDATION-> 100m lisos')
tabla1



METRICAS CROSS VALIDATION-> 100m lisos


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.061176,0.052941,0.044706,0.050588,0.063529,0.037647,0.055294,0.06
MAE,7.488235,7.623529,8.328235,7.232941,6.145882,8.458824,6.698824,6.705882
RMSE,93.455294,93.148235,110.864706,82.618824,60.814118,118.543529,72.729412,75.24
MAPE,78.154379,74.269367,91.62134,122.353372,74.896163,142.698887,86.977506,99.284833


In [15]:
#PRUEBA -> Salto de longitud
ddbbData = ddbbDataNorm[['Lj','Lj Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> Salto de longitud')
tabla



METRICAS HOLD OUT-> Salto de longitud


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.039216,0.027451,0.062745,0.039216,0.047059,0.043137,0.05098,0.031373
MAE,7.329412,6.745098,7.74902,8.243137,6.223529,7.976471,6.67451,7.670588
RMSE,94.654902,69.639216,100.988235,104.698039,60.443137,92.235294,73.670588,90.572549
MAPE,67.466048,95.264441,96.821514,226.763157,77.942505,212.487947,100.364,119.200661


In [16]:
print()
print()
print('METRICAS CROSS VALIDATION-> Salto de longitud')
tabla1



METRICAS CROSS VALIDATION-> Salto de longitud


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.042353,0.021176,0.058824,0.029412,0.048235,0.029412,0.049412,0.047059
MAE,7.612941,7.808235,7.274118,9.664706,6.176471,7.631765,6.356471,7.255294
RMSE,94.9,100.452941,88.824706,137.707059,58.588235,90.587059,65.205882,84.450588
MAPE,75.874096,86.008651,101.164064,214.767774,94.218413,144.884134,83.481721,111.842709


In [17]:
#PRUEBA -> Lanzamiento de peso
ddbbData = ddbbDataNorm[['Sp','Sp Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> Lanzamiento de peso')
tabla



METRICAS HOLD OUT-> Lanzamiento de peso


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.027451,0.023529,0.066667,0.054902,0.054902,0.023529,0.047059,0.023529
MAE,8.376471,6.541176,8.447059,7.435294,6.388235,6.254902,6.52549,7.254902
RMSE,108.243137,66.298039,119.152941,79.333333,65.133333,58.270588,67.984314,81.803922
MAPE,81.22571,83.096114,101.298076,193.675202,70.512584,111.551361,85.060231,120.063641


In [18]:
print()
print()
print('METRICAS CROSS VALIDATION-> Lanzamiento de peso')
tabla1



METRICAS CROSS VALIDATION-> Lanzamiento de peso


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.032941,0.042353,0.055294,0.042353,0.052941,0.027059,0.048235,0.028235
MAE,8.209412,7.507059,7.911765,8.092941,6.774118,8.177647,7.018824,7.858824
RMSE,105.287059,91.172941,103.843529,98.278824,72.515294,110.452941,78.609412,96.811765
MAPE,89.401403,69.489312,100.060001,166.723665,89.580605,139.146857,95.54433,139.286266


In [19]:
#PRUEBA -> Salto de altura
ddbbData = ddbbDataNorm[['Hj','Hj Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> Salto de altura')
tabla



METRICAS HOLD OUT-> Salto de altura


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.043137,0.035294,0.058824,0.035294,0.047059,0.027451,0.05098,0.031373
MAE,7.980392,6.705882,8.952941,7.945098,6.74902,7.486275,6.486275,7.164706
RMSE,100.929412,69.364706,133.368627,89.952941,71.768627,79.243137,66.933333,78.796078
MAPE,84.434032,91.598992,119.788708,208.038339,83.616614,158.248671,88.866157,129.170147


In [20]:
print()
print()
print('METRICAS CROSS VALIDATION-> Salto de altura')
tabla1



METRICAS CROSS VALIDATION-> Salto de altura


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.050588,0.023529,0.043529,0.032941,0.030588,0.035294,0.050588,0.031765
MAE,8.242353,7.528235,9.116471,9.269412,6.574118,9.488235,7.327059,7.187059
RMSE,110.771765,91.323529,130.194118,135.681176,71.08,143.492941,88.837647,82.201176
MAPE,88.207028,82.021577,121.436286,193.645145,92.656996,183.158153,98.988892,116.988611


In [21]:
#PRUEBA -> 400m lisos
ddbbData = ddbbDataNorm[['400m','400m Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> 400m lisos')
tabla



METRICAS HOLD OUT-> 400m lisos


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.062745,0.039216,0.054902,0.054902,0.054902,0.058824,0.054902,0.062745
MAE,8.32549,7.011765,8.847059,7.423529,6.537255,6.466667,6.905882,7.501961
RMSE,115.486275,78.509804,125.07451,79.133333,69.196078,69.509804,76.2,89.164706
MAPE,89.510345,79.901875,111.227651,193.626183,84.029977,75.68659,99.380141,119.174583


In [22]:
print()
print()
print('METRICAS CROSS VALIDATION-> 400m lisos')
tabla1



METRICAS CROSS VALIDATION-> 400m lisos


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.058824,0.037647,0.045882,0.035294,0.042353,0.029412,0.049412,0.064706
MAE,7.948235,8.051765,8.827059,9.4,6.824706,8.912941,7.222353,7.761176
RMSE,104.637647,103.562353,124.991765,143.157647,74.478824,127.127059,84.438824,93.702353
MAPE,82.957247,77.129255,104.341579,182.579293,78.042791,155.094643,99.181411,131.102164


In [23]:
#PRUEBA -> 110m Vayas
ddbbData = ddbbDataNorm[['110m H','110m H Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> 110m Vayas')
tabla



METRICAS HOLD OUT-> 110m Vayas


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.035294,0.054902,0.054902,0.05098,0.039216,0.035294,0.035294,0.043137
MAE,8.078431,7.333333,8.792157,7.454902,6.52549,6.690196,6.694118,6.658824
RMSE,104.619608,85.058824,117.372549,79.384314,62.27451,65.662745,74.419608,71.011765
MAPE,80.179049,79.171811,137.142894,193.898474,101.476583,113.452814,90.974987,112.061725


In [24]:
print()
print()
print('METRICAS CROSS VALIDATION-> 110m Vayas')
tabla1



METRICAS CROSS VALIDATION-> 110m Vayas


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.052941,0.036471,0.042353,0.035294,0.043529,0.04,0.036471,0.054118
MAE,7.78,8.228235,9.642353,10.44,6.916471,7.277647,7.638824,7.023529
RMSE,99.864706,107.112941,142.875294,160.929412,76.161176,83.082353,95.095294,80.284706
MAPE,81.968251,80.316425,117.806204,197.081059,83.830852,112.0652,94.296301,106.666393


In [25]:
#PRUEBA -> Lanzamiento de Disco
ddbbData = ddbbDataNorm[['Dt','Dt Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> Lanzamiento de Disco')
tabla



METRICAS HOLD OUT-> Lanzamiento de Disco


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.043137,0.039216,0.043137,0.054902,0.058824,0.047059,0.039216,0.062745
MAE,8.188235,6.607843,7.611765,7.447059,6.411765,8.352941,6.509804,7.235294
RMSE,108.760784,68.694118,94.301961,79.713725,67.996078,112.8,68.713725,86.513725
MAPE,82.187038,82.915146,85.876683,194.565725,78.700147,73.23342,85.053909,121.819234


In [26]:
print()
print()
print('METRICAS CROSS VALIDATION-> Lanzamiento de Disco')
tabla1



METRICAS CROSS VALIDATION-> Lanzamiento de Disco


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.050588,0.028235,0.043529,0.035294,0.058824,0.037647,0.058824,0.031765
MAE,7.997647,7.641176,8.164706,9.128235,6.482353,7.789412,6.543529,7.577647
RMSE,102.950588,95.22,108.152941,133.363529,70.442353,94.045882,71.343529,91.761176
MAPE,81.659991,75.268792,106.006068,165.534704,84.562062,131.348105,88.992427,131.388356


In [27]:
#PRUEBA -> Salto de Pertiga
ddbbData = ddbbDataNorm[['Pv','Pv Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> Salto de Pertiga')
tabla



METRICAS HOLD OUT-> Salto de Pertiga


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.031373,0.023529,0.035294,0.039216,0.023529,0.043137,0.023529,0.019608
MAE,7.588235,6.756863,8.678431,7.92549,6.666667,7.25098,6.752941,7.003922
RMSE,90.537255,70.74902,119.156863,89.619608,69.168627,84.968627,66.980392,72.227451
MAPE,88.649169,93.976024,116.416455,207.935275,94.458136,85.254794,132.749659,131.41554


In [28]:
print()
print()
print('METRICAS CROSS VALIDATION-> Salto de Pertiga')
tabla1



METRICAS CROSS VALIDATION-> Salto de Pertiga


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.045882,0.037647,0.048235,0.042353,0.029412,0.041176,0.057647,0.058824
MAE,8.156471,8.010588,9.202353,9.12,7.738824,7.561176,7.418824,7.325882
RMSE,105.535294,101.805882,130.698824,124.169412,95.301176,87.229412,87.491765,83.937647
MAPE,88.578987,85.473378,109.239143,183.469189,91.913335,129.988301,85.294791,109.21938


In [29]:
#PRUEBA -> Lanzamiento de Javalina
ddbbData = ddbbDataNorm[['Jt','Jt Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> Lanzamiento de Javalina')
tabla



METRICAS HOLD OUT-> Lanzamiento de Javalina


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.043137,0.054902,0.043137,0.039216,0.031373,0.031373,0.070588,0.054902
MAE,8.717647,7.168627,7.603922,7.890196,6.74902,6.286275,6.737255,7.239216
RMSE,118.278431,82.266667,97.933333,89.34902,73.839216,56.764706,73.686275,86.8
MAPE,86.342421,67.550027,105.834641,207.761056,83.955489,130.786814,82.572242,113.527331


In [30]:
print()
print()
print('METRICAS CROSS VALIDATION-> Lanzamiento de Javalina')
tabla1



METRICAS CROSS VALIDATION-> Lanzamiento de Javalina


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.042353,0.036471,0.045882,0.041176,0.054118,0.04,0.041176,0.052941
MAE,8.216471,7.607059,8.223529,8.436471,6.683529,7.601176,6.972941,7.761176
RMSE,105.529412,92.014118,112.988235,107.092941,71.34,89.255294,77.457647,96.612941
MAPE,96.9045,73.866443,124.86411,167.158409,88.331324,146.234675,99.391106,130.754524


In [31]:
#PRUEBA -> 1500m lisos
ddbbData = ddbbDataNorm[['1500m','1500m Points']]

ho = mainHoldOut(ddbbData)
cv = mainCrossVal(ddbbData)

tabla=pd.DataFrame(ho)
tabla1=pd.DataFrame(cv)

print()
print()
print('METRICAS HOLD OUT-> 1500m Lisos')
tabla



METRICAS HOLD OUT-> 1500m Lisos


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.066667,0.039216,0.047059,0.039216,0.05098,0.047059,0.070588,0.035294
MAE,8.545098,6.996078,6.901961,7.890196,7.301961,10.384314,7.819608,8.109804
RMSE,115.12549,77.176471,77.764706,89.34902,87.294118,162.792157,102.682353,102.784314
MAPE,90.485839,91.013762,89.641904,207.761056,95.826325,75.057982,93.56959,151.793954


In [32]:
print()
print()
print('METRICAS CROSS VALIDATION-> 1500m Lisos')
tabla1



METRICAS CROSS VALIDATION-> 1500m Lisos


Unnamed: 0,KNN,REGLO,GNB,PERCEPTRON,MLP,DESGRA,ARBOL,BOSQUE
Accuracy,0.06,0.028235,0.044706,0.054118,0.030588,0.030588,0.044706,0.042353
MAE,8.307059,7.74,7.703529,8.614118,7.475294,8.642353,8.382353,8.037647
RMSE,112.041176,97.544706,97.588235,117.884706,89.08,116.294118,108.697647,102.12
MAPE,95.97519,86.065333,111.611528,116.633505,96.690097,170.050591,109.095913,151.748062
