# Parte 3: Estudio de diferentes algoritmos de machine learning

En este notebook vamos a aplicar diferentes algoritmos para estudiar su efectividad para este problema e intentaremos predicciones para los Juegos Olímpicos de 2021.

Los algoritmos que se aplican son: 
* Matriz de confusión
* Datos de entrenamiento y datos de test
* Knn - Vecinos cercanos
* Regresión Lineal
* Regresión Logística
* Naive Bayes
* Gaussian Naive Bayes
* K-means
* Perceptron
* MLP - Multi layer perceptron
* Decision Tree
* SVM - Supervised Vector Machine
* Descenso del gradiente
* Random Forest
* ¿¿RED NEURONAL??

In [1]:
import pandas as pd

import numpy as np
from numpy import * 

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Lee el excel generado en el otro documento .ipynb
ddbb=pd.read_excel("/Github/DecatlonEstadistics/resources/data.xlsx")
del ddbb['1500m NF']

ddbb['Country']=ddbb['Country'].str.lower()

ddbb.columns = ['Position', 'Athlete', 'Age', 'Country', 'Total Points', 'Year', 'Competition', '100m', 
                 '100m Points', 'Lj', 'Lj Points', 'Sp', 'Sp Points', 'Hj', 'Hj Points', '400m', '400m Points', 
                 '110m H', '110m H Points', 'Dt', 'Dt Points', 'Pv', 'Pv Points', 'Jt', 'Jt Points', 
                 '1500m Points', '1500m']

## 0. Matriz de confusión

In [3]:
def metricasAlgorithms(y_test, y_pred):
    metricas = []
    
    metricas.append(accuracy_score(y_test, y_pred))
    metricas.append(precision_score(y_test, y_pred, average='micro'))
    metricas.append(recall_score(y_test, y_pred, average='micro'))
    metricas.append(f1_score(y_test, y_pred, average='micro'))

    metricas.append(precision_score(y_test, y_pred, average='macro'))
    metricas.append(recall_score(y_test, y_pred, average='macro'))
    metricas.append(f1_score(y_test, y_pred, average='macro'))

    metricas.append(precision_score(y_test, y_pred, average='weighted'))
    metricas.append(recall_score(y_test, y_pred, average='weighted'))
    metricas.append(f1_score(y_test, y_pred, average='weighted'))
    
    return metricas

def metricasAlgorithmsRegl(X_test, y_test, regL):
    metricas = []
    
    metricas.append(regL.score(X_test,y_test))
    metricas.append(0)
    metricas.append(0)
    metricas.append(0)
    metricas.append(0)
    metricas.append(0)
    metricas.append(0)
    metricas.append(0)
    metricas.append(0)
    metricas.append(0)
    
    return metricas

## 1. Datos de entrenamiento y datos de test.

Los datos se entrenan en función a todas las pruebas resultantes y se clasifican en función a la posición obtenida.
El 70% de los datos se utilizan para entrenamiento.
El 30% restante se utiliza para tests.

In [4]:
ddbbData = ddbb[['100m','100m Points','Lj','Lj Points','Sp','Sp Points','Hj','Hj Points','400m Points','400m',
                 '110m H','110m H Points','Dt','Dt Points','Pv','Pv Points','Jt','Jt Points','1500m Points','1500m']]

X_train, X_test, y_train, y_test = train_test_split(ddbbData, ddbb['Position'], test_size=0.3)

## 2. Vecinos Cercanos

In [5]:
def knnFunction(X_train, X_test, y_train, y_test):
    
    knn = KNeighborsClassifier(n_neighbors=5)

    #Entrena el modelo
    knn.fit(X_train, y_train)

    #Predice para los datos de test
    y_pred = knn.predict(X_test)
    
    vector = metricasAlgorithms(y_test, y_pred)
    
    return(vector)

## 3. Regresión Lineal

In [6]:
def RegLinealFunction(X_train, X_test, y_train, y_test):
    
    regL = linear_model.LinearRegression()
    
    regL.fit(X_train, y_train)
    
    y_pred = regL.predict(X_test)
    
    return metricasAlgorithmsRegl(X_test, y_test, regL)

## 4. Regresión Logística

In [7]:
def RegLogisFunction(X_train, X_test, y_train, y_test):
    
    logreg = linear_model.LogisticRegression(random_state=0)

    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)
    
    return metricasAlgorithms(y_test, y_pred)

## 5. Naive Bayes

In [8]:
def NaiveBayesFunction(X_train, X_test, y_train, y_test):
    
    gnb = GaussianNB()

    gnb.fit(X_train, y_train)

    y_pred = gnb.predict(X_test)

    metricasAlgorithms(y_test, y_pred)
    
    return metricasAlgorithms(y_test, y_pred)

##  6. K-Means

In [9]:
kmeans = KMeans(n_clusters=2)

kmeans = KMeans(n_clusters=5).fit(ddbbData)
centroids = kmeans.cluster_centers_
#print(centroids)

## 7. Perceptron

In [10]:
def PerceptronFunction(X_train, X_test, y_train, y_test):
    
    percep = Perceptron(tol=1e-5, random_state=1)

    percep.fit(X_train, y_train)

    y_pred = percep.predict(X_test)

    metricasAlgorithms(y_test, y_pred)
    
    return metricasAlgorithms(y_test, y_pred)

## 8. MLP- Perceptron Multi Capa

In [11]:
def MLPFunction(X_train, X_test, y_train, y_test):

    mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)

    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    metricasAlgorithms(y_test, y_pred)
    
    return metricasAlgorithms(y_test, y_pred)

## 9. SVM - Supervised Vector Machine

In [12]:
def SVMFunction(X_train, X_test, y_train, y_test):

    clf = svm.SVC(kernel='linear')

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    metricasAlgorithms(y_test, y_pred)

    return metricasAlgorithms(y_test, y_pred)

## 10. Descenso del gradiente

In [13]:
def DescensoGradFunction(X_train, X_test, y_train, y_test):

    dGrad = SGDClassifier(loss="hinge", penalty="l2")

    dGrad.fit(X_train, y_train)

    y_pred = dGrad.predict(X_test)

    metricasAlgorithms(y_test, y_pred)

    return metricasAlgorithms(y_test, y_pred)

## 11. Arbol de decision

In [14]:
def ArbolDecFunction(X_train, X_test, y_train, y_test):

    clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)

    clf = clf.fit(X_train,y_train)

    y_pred = clf.predict(X_test)

    metricasAlgorithms(y_test, y_pred)
    
    return metricasAlgorithms(y_test, y_pred)

## 12. Random Forest

In [15]:
def BosqueAlFunction(X_train, X_test, y_train, y_test):

    rForest=RandomForestClassifier(n_estimators=100)

    rForest.fit(X_train,y_train)

    y_pred=rForest.predict(X_test)

    metricasAlgorithms(y_test, y_pred)

    return metricasAlgorithms(y_test, y_pred)

In [16]:
knn = knnFunction(X_train, X_test, y_train, y_test)
RegLin = RegLinealFunction(X_train, X_test, y_train, y_test)
RegLog = RegLogisFunction(X_train, X_test, y_train, y_test)
NBayes = NaiveBayesFunction(X_train, X_test, y_train, y_test)
Percep = PerceptronFunction(X_train, X_test, y_train, y_test)
DGrad = DescensoGradFunction(X_train, X_test, y_train, y_test)
ADecis = ArbolDecFunction(X_train, X_test, y_train, y_test)
BAleatorio = BosqueAlFunction(X_train, X_test, y_train, y_test)
dataFrameDatos = pd.DataFrame({
                                'Metricas': ['Accuracy Micro','Precision Micro','Recall Micro','F1 Micro',
                                             'Precision Macro','Recall Macro','F1 Macro','Precision Weigthed',
                                             'Recall Weigthed','F1 Weigthed'],
                                'knn': knn,
                                'Regresion lineal': RegLin,
                                'Regresion logistica': RegLog,
                                'Naive bayes': NBayes,
                                'Perceptron': Percep,
                                'MLP': MLPFunction(X_train, X_test, y_train, y_test),
                                'SVM': SVMFunction(X_train, X_test, y_train, y_test),
                                'Descenso gradiente': DGrad,
                                'Arbol de decision': ADecis,
                                'Bosque aleatorios': BAleatorio 
                            })


In [17]:
dataFrameDatos

Unnamed: 0,Metricas,knn,Regresion lineal,Regresion logistica,Naive bayes,Perceptron,MLP,Descenso gradiente,Arbol de decision,Bosque aleatorios
0,Accuracy Micro,0.109804,0.466907,0.05098,0.066667,0.039216,0.035294,0.047059,0.054902,0.086275
1,Precision Micro,0.109804,0.0,0.05098,0.066667,0.039216,0.035294,0.047059,0.054902,0.086275
2,Recall Micro,0.109804,0.0,0.05098,0.066667,0.039216,0.035294,0.047059,0.054902,0.086275
3,F1 Micro,0.109804,0.0,0.05098,0.066667,0.039216,0.035294,0.047059,0.054902,0.086275
4,Precision Macro,0.06063,0.0,0.027995,0.034283,0.004627,0.001109,0.002672,0.018128,0.055505
5,Recall Macro,0.069008,0.0,0.041682,0.05251,0.028711,0.030303,0.033884,0.03367,0.051743
6,F1 Macro,0.056454,0.0,0.031735,0.034901,0.007848,0.002139,0.004935,0.022109,0.051881
7,Precision Weigthed,0.090623,0.0,0.036692,0.047994,0.006168,0.001291,0.003675,0.03119,0.095793
8,Recall Weigthed,0.109804,0.0,0.05098,0.066667,0.039216,0.035294,0.047059,0.054902,0.086275
9,F1 Weigthed,0.08798,0.0,0.040273,0.047633,0.01051,0.002491,0.006795,0.037868,0.088217
