Aldao Amoedo, Héctor

Cabaleiro Pintos, Laura

Cotardo Valcárcel, Donato José

Romero Conde, José

---

##   _Librerias_

In [None]:
import Pkg
Pkg.add("CSV")
Pkg.add("DataFrames")
Pkg.add("Statistics")
Pkg.add("StatsBase")
Pkg.add("Random")
Pkg.add("ScikitLearn")
Pkg.add("Plots")
Pkg.add("MLBase")
Pkg.add("DecisionTree")
Pkg.add("Suppressor")
Pkg.add("PyCall")
Pkg.add("HypothesisTests")
Pkg.add("PythonCall")

In [None]:
using CSV
using DataFrames
using Plots
using Statistics
using Random
using ScikitLearn
using ScikitLearn.Pipelines: Pipeline, named_steps, FeatureUnion
using ScikitLearn.GridSearch: GridSearchCV 
using StatsBase: mode
using PyCall
using Suppressor
using Conda
using HypothesisTests
# using PythonCall

@sk_import decomposition: (PCA, FastICA)
@sk_import discriminant_analysis: LinearDiscriminantAnalysis
@sk_import ensemble: (AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier)
@sk_import feature_selection: (SelectKBest, f_classif, mutual_info_classif, RFE)
@sk_import impute: SimpleImputer
@sk_import linear_model: LogisticRegression
@sk_import manifold: (LocallyLinearEmbedding, Isomap)
@sk_import neighbors: KNeighborsClassifier
@sk_import neural_network: MLPClassifier
@sk_import preprocessing: MinMaxScaler
@sk_import svm: SVC

@pyimport sklearn.ensemble as ensemble
pyimport_conda("xgboost", "xgboost")
@pyimport xgboost as xgb
Conda.add("lightgbm")
lightgbm = pyimport("lightgbm")
Conda.add("catboost")
catboost = pyimport("catboost")

## Preparación de los datos

El punto 1, está completado, aunque creo que el 1.3, donde preparamos los datos transformandolos y rellanando los nulos está mal.

### 1. Cargar los datos y descripción

In [2]:
df = CSV.read("Datos_Practica_Evaluacion_1.csv", DataFrame)

num_instancias, num_variables = size(df)
num_individuos = length(unique(df[:, 1]))
num_clases_salida = length(unique(df[:, end]))

println("Número de variables: $num_variables")
println("Número de instancias: $num_instancias")
println("Número de individuos: $num_individuos")
println("Número de clases de salida: $num_clases_salida")

Número de variables: 563
Número de instancias: 10299
Número de individuos: 30
Número de clases de salida: 6


Los datos dados ya están cargados, y como podemos observar tienen:
* 563 Variables
* 10299 Instancias
* 30 Individuos
* 6 Clases de salida

### 2. Calcular porcentaje de nulos

In [3]:
num_nulos_totales = 0

for col ∈ names(df)
    num_nulos = count(ismissing, df[:, col])
    num_nulos_totales += num_nulos
    porcentaje_nulos = (num_nulos / num_instancias) * 100
end

porcentaje_nulos_totales = (num_nulos_totales / (num_instancias * num_variables)) * 100
println("Porcentaje total de nulos en el conjunto: $porcentaje_nulos_totales%")  

Porcentaje total de nulos en el conjunto: 0.004656507546905259%


### 3. Preparar los datos para las técnicas de clasificación

Para rellenar valores faltantes tenemos que hacernos una idea de que tipo de datos encontraremos.

In [4]:
for col ∈ names(df)
    if eltype(df[!, col]) == Union{Missing, Float64}
        df[ismissing.(df[!, col]), col] .= mean(skipmissing(df[!, col]))
        df[!, col] = Float64.(df[!, col]) 
    end
end

println("Valores nulos rellenados.")

Valores nulos rellenados.


In [5]:
Set([eltype(df[!,col]) for col in names(df)])  # ya no hay missing

Set{DataType} with 3 elements:
  String31
  Float64
  Int64

### 4. Segmentar el 10% de los datos usando HoldOut

In [6]:
Random.seed!(172)

holdout_individuos = shuffle(unique(df[:, :subject]))[1:Int(round(0.1 * length(unique(df[:, :subject]))))]  
holdout_df = filter(fila -> fila.subject in holdout_individuos, df)
train_df = filter(fila -> !(fila.subject in holdout_individuos), df)

println("Individuos en el holdout: ", holdout_individuos)
println("Tamaño del conjunto de entrenamiento: $(size(train_df)[1])")
println("Tamaño del conjunto de holdout: $(size(holdout_df)[1])")

Individuos en el holdout: [4, 12, 29]
Tamaño del conjunto de entrenamiento: 9318
Tamaño del conjunto de holdout: 981


### 5. Fold y escalado

In [7]:
# separacion X Y
train = Array(train_df)
test = Array(holdout_df)

X_train = train[:,1:end-1]
Y_train = train[:,end]
X_test = test[:,1:end-1]
Y_test = test[:,end];

# k fold
especificacionCV = ScikitLearn.CrossValidation.KFold(size(train)[1], n_folds=5)
#folds = [(train[indicesTrain,:], train[indicesTest,:]) for (indicesTrain, indicesTest) ∈ especificacionCV]

# escalado
X_train = fit_transform!(MinMaxScaler(), X_train);
X_test = fit_transform!(MinMaxScaler(), X_test);
#folds = [((fit_transform!(MinMaxScaler(), train[:,1:end-1]),Vector(train[:,end])), (fit_transform!(MinMaxScaler(), test[:,1:end-1]),Vector(test[:,end]))) for (train, test) ∈ folds ];

# de esta forma:
# folds tiene los 5 folds (escalado cada uno independientemente)
# folds[1] es un fold
# folds[1][1] es el entrenamiento de ese fold
# folds[1][1][1] es el X del entrenamiento de ese fold
print()

---

# Modelos Basicos

In [10]:
function Mejores_parametros(df)
    clasificadores = unique(df[!,:clasificador])
    println("MEJORES RESULTADOS POR CLASIFICADOR:")
    for nombreClasificador in clasificadores
        df_clasificador = filter(fila -> fila.clasificador == nombreClasificador, df)
        indice = argmax(df_clasificador[!, Symbol("Accuracy")])
        mejor_fila = df_clasificador[indice, :]
        println("Clasificador: ", mejor_fila.clasificador)
        println("Precisión: ", mejor_fila.Accuracy)
        println("Parámetros: ", mejor_fila.parametros)
    end
end

Mejores_parametros (generic function with 1 method)

---

In [9]:
function plot_transformed_data(name::String, reductor, X, y)

    X_reducida = fit_transform!(reducer, X, y)
    plot = scatter(X_reduced[:, 1], X_reduced[:, 2], group=y, legend=:topright, title=name,
                xlabel="Componente 1", ylabel="Componente 2", markersize=5)
    return plot
end

plot_transformed_data (generic function with 1 method)

In [11]:
resultadosModelosBasicos = DataFrame(filtrado = String[], reduccion = String[], clasificador = String[], Accuracy = Float64[], parametros = String[])
plot_distribucion = @layout [a b c; d e]

filtrado = Dict(
   #"nada" => "passthrough",
   "anova" => SelectKBest(score_func=f_classif),
   "mi" => SelectKBest(score_func=mutual_info_classif),
   "rfe" => RFE(LogisticRegression(max_iter=10),step=0.5)
 )

reduccion = Dict(
   #"nada" => "passthrough",
   "pca" => PCA(),
   "lda" => LinearDiscriminantAnalysis(),
   "ica" => FastICA(),
   #"isomap" => Isomap(n_neighbors=25),
   #"lle" => LocallyLinearEmbedding(),
 )

clasificacion = Dict(
    "mlp" => [MLPClassifier(max_iter=10), Dict(:classifier__hidden_layer_sizes => [[50], [100], [100, 50]])],
    "knn" => [KNeighborsClassifier(), Dict(:classifier__n_neighbors =>[1, 10, 20])],
    "svm" => [SVC(), Dict(:classifier__C =>[0.1, 1, 10])]
)

for (nombreFiltro, filtro) in filtrado
    for (nombreReduccion, reduccion) in reduccion
        for (nombreClasificador, valor) in clasificacion

            clasificador = valor[1]
            parametros = valor[2]
            modelo = Pipeline([
                ("filtro", filtro),
                ("reduccion", reduccion),
                ("classifier", clasificador) 
            ])

            busqueda = GridSearchCV(modelo, parametros, cv=especificacionCV)
            fit!(busqueda, X_train, Y_train)
            mejorModelo = busqueda.best_estimator_
            mejoresParametros = busqueda.best_params_
            accuracy = busqueda.best_score_
            Y_pred = predict(mejorModelo, X_test)
            accuracy = sum(Y_pred .== Y_test) / length(Y_test)
            push!(resultadosModelosBasicos, (nombreFiltro, nombreReduccion, nombreClasificador, accuracy, string(mejoresParametros)))
            println("Filtrado: $nombreFiltro,\nReducción: $nombreReduccion,\nClasificador: $nombreClasificador,\nparámetros: $(mejoresParametros),\nPrecisión: $accuracy\n")
        end
    end
end

Mejores_parametros(resultadosModelosBasicos)


Filtrado: anova,
Reducción: lda,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 20),
Precisión: 0.7512487512487512

Filtrado: anova,
Reducción: lda,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100]),
Precisión: 0.7282717282717283





Filtrado: anova,
Reducción: lda,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 1.0),
Precisión: 0.7602397602397603

Filtrado: anova,
Reducción: ica,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 20),
Precisión: 0.7092907092907093





Filtrado: anova,
Reducción: ica,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100, 50]),
Precisión: 0.7972027972027972





Filtrado: anova,
Reducción: ica,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 1.0),
Precisión: 0.7702297702297702





Filtrado: anova,
Reducción: pca,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 20),
Precisión: 0.7132867132867133

Filtrado: anova,
Reducción: pca,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100, 50]),
Precisión: 0.7592407592407593





Filtrado: anova,
Reducción: pca,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 10.0),
Precisión: 0.7832167832167832



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Filtrado: rfe,
Reducción: lda,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 10),
Precisión: 0.8291708291708292


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Filtrado: rfe,
Reducción: lda,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100]),
Precisión: 0.7972027972027972


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Filtrado: rfe,
Reducción: lda,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 10.0),
Precisión: 0.7862137862137862

Filtrado: rfe,
Reducción: ica,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 1),
Precisión: 0.15184815184815184


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Filtrado: rfe,
Reducción: ica,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100]),
Precisión: 0.4915084915084915


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Filtrado: rfe,
Reducción: ica,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 10.0),
Precisión: 0.13786213786213786



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Filtrado: rfe,
Reducción: pca,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 20),
Precisión: 0.8171828171828172



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Filtrado: rfe,
Reducción: pca,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100, 50]),
Precisión: 0.8981018981018981


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Filtrado: rfe,
Reducción: pca,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 10.0),
Precisión: 0.8901098901098901

Filtrado: mi,
Reducción: lda,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 20),
Precisión: 0.6253746253746254

Filtrado: mi,
Reducción: lda,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100, 50]),
Precisión: 0.6493506493506493





Filtrado: mi,
Reducción: lda,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 10.0),
Precisión: 0.5364635364635365

Filtrado: mi,
Reducción: ica,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 10),
Precisión: 0.5494505494505495





Filtrado: mi,
Reducción: ica,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100, 50]),
Precisión: 0.6233766233766234





Filtrado: mi,
Reducción: ica,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 10.0),
Precisión: 0.5014985014985015





Filtrado: mi,
Reducción: pca,
Clasificador: knn,
parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 20),
Precisión: 0.6163836163836164

Filtrado: mi,
Reducción: pca,
Clasificador: mlp,
parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100, 50]),
Precisión: 0.6233766233766234





Filtrado: mi,
Reducción: pca,
Clasificador: svm,
parámetros: Dict{Symbol, Any}(:classifier__C => 10.0),
Precisión: 0.5234765234765235

MEJORES RESULTADOS POR CLASIFICADOR:
Clasificador: knn
Precisión: 0.8291708291708292
Parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 10)
Clasificador: mlp
Precisión: 0.8981018981018981
Parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100, 50])
Clasificador: svm
Precisión: 0.8901098901098901
Parámetros: Dict{Symbol, Any}(:classifier__C => 10.0)


**MEJORES RESULTADOS POR CLASIFICADOR:**

**Clasificador: knn**
- Precisión: 0.8291708291708292
- Parámetros: Dict{Symbol, Any}(:classifier__n_neighbors => 10)

**Clasificador: mlp**
- Precisión: 0.8981018981018981
- Parámetros: Dict{Symbol, Any}(:classifier__hidden_layer_sizes => [100, 50])

**Clasificador: svm**
- Precisión: 0.8901098901098901
- Parámetros: Dict{Symbol, Any}(:classifier__C => 10.0)

---

# Ensembles

---

10. Adicionalmente, con los datos sólo con el tratamiento de Filtrado ANOVA, recrear
las siguientes técnicas
 - BaggingClassifier con clasificador base KNN con número de vecinos 5 y
número de estimadores 10 y 50
 -  AdaBoosting con estimadores SVM con kernel lineal siendo el número de
estimadores 5.
 -  GBM (GradientBoostingClasifier), con 50 estimadores y un learning_rate de
0.2

In [11]:
resultadosEjercicio10 = DataFrame(clasificador = String[], Accuracy = Float64[], parametros = String[])

clasificadoresEjercicio10 = Dict(
    "bagging10" => BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=5), n_estimators=10),
    "bagging50" => BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=5), n_estimators=50),
    "adaboosting" => AdaBoostClassifier(estimator=SVC(kernel="linear"), algorithm="SAMME", n_estimators=5),
    "gbm" => GradientBoostingClassifier(learning_rate=0.2, n_estimators=50)
)

for (nombreClasificador, clasificador) in clasificadoresEjercicio10
    modelo = Pipeline([
        ("filtro", SelectKBest(score_func=f_classif)),
        ("classifier", clasificador)
    ])

    fit!(modelo, X_train, Y_train)
    Y_pred = predict(modelo, X_test)
    accuracy = sum(Y_pred .== Y_test) / length(Y_test)
    push!(resultadosEjercicio10, (nombreClasificador, accuracy, string(clasificador.get_params())))
    println("Clasificador: $nombreClasificador Precisión: $accuracy")
end

Mejores_parametros(resultadosEjercicio10)


Clasificador: bagging50 Precisión: 0.7102897102897103
Clasificador: bagging10 Precisión: 0.6983016983016983
Clasificador: adaboosting Precisión: 0.16983016983016982
Clasificador: gbm Precisión: 0.7582417582417582
MEJORES RESULTADOS POR CLASIFICADOR:
Clasificador: bagging50
Precisión: 0.7102897102897103
Parámetros: Dict{Any, Any}("estimator__weights" => "uniform", "max_features" => 1.0, "estimator__metric_params" => nothing, "estimator" => PyObject KNeighborsClassifier(), "verbose" => 0, "estimator__leaf_size" => 30, "estimator__n_jobs" => nothing, "estimator__n_neighbors" => 5, "oob_score" => false, "max_samples" => 1.0, "estimator__algorithm" => "auto", "estimator__p" => 2, "n_jobs" => nothing, "warm_start" => false, "random_state" => nothing, "bootstrap" => true, "n_estimators" => 50, "bootstrap_features" => false, "estimator__metric" => "minkowski")
Clasificador: bagging10
Precisión: 0.6983016983016983
Parámetros: Dict{Any, Any}("estimator__weights" => "uniform", "max_features" => 1

In [12]:
Mejores_parametros(resultadosEjercicio10)

MEJORES RESULTADOS POR CLASIFICADOR:
Clasificador: bagging50
Precisión: 0.7102897102897103
Parámetros: Dict{Any, Any}("estimator__weights" => "uniform", "max_features" => 1.0, "estimator__metric_params" => nothing, "estimator" => PyObject KNeighborsClassifier(), "verbose" => 0, "estimator__leaf_size" => 30, "estimator__n_jobs" => nothing, "estimator__n_neighbors" => 5, "oob_score" => false, "max_samples" => 1.0, "estimator__algorithm" => "auto", "estimator__p" => 2, "n_jobs" => nothing, "warm_start" => false, "random_state" => nothing, "bootstrap" => true, "n_estimators" => 50, "bootstrap_features" => false, "estimator__metric" => "minkowski")
Clasificador: bagging10
Precisión: 0.6983016983016983
Parámetros: Dict{Any, Any}("estimator__weights" => "uniform", "max_features" => 1.0, "estimator__metric_params" => nothing, "estimator" => PyObject KNeighborsClassifier(), "verbose" => 0, "estimator__leaf_size" => 30, "estimator__n_jobs" => nothing, "estimator__n_neighbors" => 5, "oob_score" =

---

11. Entrenar con el conjunto completo de entrenamiento (todo lo que componía el 5-
fold cross-validation) y testear son el 10% reservado
 - Coger las 5 mejores combinaciones de los modelos anteriores de
clasificación, (1 KNN, 1 SVM, 1 MLP, 1 Bagging y 1 AdaBoosting)
 - Crear un Random Forest con valor para los estimadores del 500 y
profundidad máxima de 10
 - Crear un Hard Voting con las mejores combinaciones del KNN, SVM y MLP
(uno para cada una de las técnicas)
 - Crear un Soft Voting con las mejores combinaciones del KNN, SVM y MLP
(uno para cada una de las técnicas) para los pesos coger el porcentaje de
acierto en test de cada una de las combinaciones en el 5-fold cross-
valiadation
 - Crear un Ensemble Stacking con MLP como clasificador final, así mismo,
use como base las mejores combinaciones del SVM, KNN y MLP
 - Crear un XGBoost con los valores por defecto
 - Crear un LightGBM, con los valores por defecto
 - Crear un Catboost, con los valores por defecto

In [None]:
# using DecisionTree, ScikitLearn, XGBoost, LightGBM, CatBoost
# using DataFrames, Statistics
# using ScikitLearn.CrossValidation: KFold
# using ScikitLearn: fit!, predict 

In [8]:
# Ejercicio 11

# Los mejores modelos obtenidos previamente
mejor_knn = KNeighborsClassifier(n_neighbors=10) 
mejor_svm = SVC(C=10.0)
mejor_mlp = MLPClassifier(hidden_layer_sizes=[100, 50])
mejor_bagging = BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=5), n_estimators=50)
mejor_adaboost = AdaBoostClassifier(estimator=SVC(kernel="linear", probability=true), n_estimators=5)


# Entrenar los mejores modelos en todo el conjunto de entrenamiento
modelos = Dict(
    "KNN" => mejor_knn,
    "SVM" => mejor_svm,
    "MLP" => mejor_mlp,
    "Bagging" => mejor_bagging,
    "AdaBoost" => mejor_adaboost
)

accuracys_modelos = Dict()

for (nombre, modelo) in modelos
    fit!(modelo, X_train, Y_train)
    predicciones = predict(modelo, X_test)
    accuracy = sum(predicciones .== Y_test) / length(Y_test)
    accuracys_modelos[nombre] = accuracy
    println("Modelo: $nombre, Accuracy en test: $accuracy")
end

println("Accuracys de los modelos en test: ", accuracys_modelos)


Modelo: SVM, Accuracy en test: 0.9215086646279307
Modelo: Bagging, Accuracy en test: 0.8470948012232415
Modelo: MLP, Accuracy en test: 0.9245667686034659
Modelo: AdaBoost, Accuracy en test: 0.4352701325178389
Modelo: KNN, Accuracy en test: 0.854230377166157
Accuracys de los modelos en test: Dict{Any, Any}("SVM" => 0.9215086646279307, "Bagging" => 0.8470948012232415, "MLP" => 0.9245667686034659, "AdaBoost" => 0.4352701325178389, "KNN" => 0.854230377166157)


In [None]:
# Crear y entrenar el modelo Random Forest
rf = ensemble.RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
fit!(rf, X_train, Y_train)
rf_pred = predict(rf, X_test)
rf_accuracy = sum(rf_pred .== Y_test) / length(Y_test)
println("Random Forest Accuracy: $rf_accuracy")

Random Forest Accuracy: 0.8063200815494393


In [10]:
# using Conda
# Conda.add("scikit-learn")
# using PyCall

In [18]:
# @pyimport sklearn.ensemble as ensemble
# VotingClassifier = ensemble.VotingClassifier

In [12]:
# Hard voting
voting_models = Dict(
    "KNN" => mejor_knn,
    "SVM" => mejor_svm,
    "MLP" => mejor_mlp,
    "Bagging" => mejor_bagging,
    "AdaBoost" => mejor_adaboost
) 


function hard_voting(models, X_test)
    predicciones = [predict(model, X_test) for model in values(models)]
    combined = [mode([pred[i] for pred in predicciones]) for i in 1:size(X_test, 1)]
    return combined
end

hard_voting_pred = hard_voting(voting_models, X_test)
hard_voting_accuracy = sum(hard_voting_pred .== Y_test) / length(Y_test)
println("Hard Voting Accuracy: $hard_voting_accuracy")


Hard Voting Accuracy: 0.8827726809378186


In [13]:
function soft_voting(models, X_test)
    # Predicciones de cada modelo
    predicciones = [predict(model, X_test) for model in values(models)]
    
    # Lista para contar los votos de cada instancia
    votos = [Dict{String, Int}() for _ in 1:size(X_test, 1)]
    
    # Contar las predicciones de cada modelo
    for pred in predicciones
        for i in 1:size(X_test, 1)
            clase = pred[i]
            if haskey(votos[i], clase)
                votos[i][clase] += 1
            else
                votos[i][clase] = 1
            end
        end
    end
    
    # Seleccionar la clase con mayor número de votos para cada instancia
    combined = [argmax(v) for v in votos]
    
    return combined
end

soft_voting_pred = soft_voting(voting_models, X_test)
soft_voting_accuracy = sum(soft_voting_pred .== Y_test) / length(Y_test)
println("Soft Voting Accuracy: $soft_voting_accuracy")

Soft Voting Accuracy: 0.8848114169215087


In [19]:
StackingClassifier = ensemble.StackingClassifier


# Crear el clasificador de Ensemble Stacking
base_estimators = [
    ("svm", mejor_svm),
    ("knn", mejor_knn),
    ("mlp", mejor_mlp)
]

final_estimator = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)

stacking_classifier = StackingClassifier(
    estimators=base_estimators,
    final_estimator=final_estimator,
    cv=5  # Validación cruzada
)

# Entrenar el modelo
fit!(stacking_classifier, X_train, Y_train)

# Realizar predicciones
stacking_classifier_predictions = predict(stacking_classifier, X_test)

# Evaluar la precisión
stacking_classifier_accuracy =  sum(stacking_classifier_predictions .== Y_test) / length(Y_test)
println("Stacking Classifier Accuracy: $stacking_classifier_accuracy")

Stacking Classifier Accuracy: 0.9133537206931702


In [15]:
# Mapeo de etiquetas categóricas a numéricas
label_mapping = Dict(
    "LAYING" => 0,
    "SITTING" => 1,
    "STANDING" => 2,
    "WALKING" => 3,
    "WALKING_DOWNSTAIRS" => 4,
    "WALKING_UPSTAIRS" => 5
)

# Convertir las etiquetas de entrenamiento y prueba
Y_train_num = [label_mapping[label] for label in Y_train]
Y_test_num = [label_mapping[label] for label in Y_test];

In [16]:
#XGBoost

# Crear y entrenar el modelo XGBoost
xgb_model = xgb.XGBClassifier()
fit!(xgb_model, X_train, Y_train_num)

# Realizar predicciones
xgb_pred = predict(xgb_model, X_test)

# Calcular precisión
xgb_accuracy = sum(xgb_pred .== Y_test_num) / length(Y_test_num)
println("XGBoost Accuracy: $xgb_accuracy")

XGBoost Accuracy: 0.8644240570846076


In [17]:
# LightGBM

# using PyCall
lightgbm = pyimport("lightgbm")

lightgbm_model = lightgbm.LGBMClassifier()

# Declarar la variable antes del bloque @suppress para poder printearla después
lightgbm_accuracy = 0.0

@suppress begin
    # Entrenamiento del modelo
    fit!(lightgbm_model, X_train, Y_train_num)
    lightgbm_pred = predict(lightgbm_model, X_test)

    # Cálculo de la precisión
    lightgbm_accuracy = sum(lightgbm_pred .== Y_test_num) / length(Y_test_num)
end

# Imprimir solo el accuracy
println("LightGBM Accuracy: $lightgbm_accuracy")

LightGBM Accuracy: 0.8287461773700305


In [21]:
#CatBoost

# Importar numpy
np = pyimport("numpy")

# Convertir X_train y Y_train_num a np.ndarray
X_train_np = np.array(X_train)
Y_train_np = np.array(Y_train_num)

# Convertir X_test y Y_test_num a np.ndarray
X_test_np = np.array(X_test)
Y_test_np = np.array(Y_test_num)

# Crear y entrenar el modelo de CatBoost
catboost = pyimport("catboost")
catboost_model = catboost.CatBoostClassifier(verbose=0)
catboost_model.fit(X_train_np, Y_train_np)

# Predecir y calcular el accuracy
catboost_pred = catboost_model.predict(X_test_np)
catboost_accuracy = sum(catboost_pred .== Y_test_np) / length(Y_test_np)
println("CatBoost Accuracy: $catboost_accuracy")

CatBoost Accuracy: 0.9541284403669725


## Conclusiones

12. Imprima la importancia de las variables seleccionadas por el Random Forest y por
el XGBoost ordenadas por importancia.

In [28]:
# Obtener las importancias de las variables seleccionadas por RandomForest
importances = rf[:feature_importances_]

# Crear un DataFrame con las importancias
variables = names(train_df)[1:end-1]  # Nombres de las características
importance_df = DataFrame(variable=variables, importance=importances)

# Ordenar por importancia
sort!(importance_df, :importance, rev=true)

# Mostrar las importancias
println(importance_df)

[1m562×2 DataFrame[0m
[1m Row [0m│[1m variable                          [0m[1m importance  [0m
     │[90m String                            [0m[90m Float64     [0m
─────┼────────────────────────────────────────────────
   1 │ tGravityAcc-energy()-X             0.0357965
   2 │ tGravityAcc-min()-Y                0.0345304
   3 │ angle(X,gravityMean)               0.0319969
   4 │ tGravityAcc-max()-Y                0.0287416
   5 │ tGravityAcc-min()-X                0.028299
   6 │ tGravityAcc-max()-X                0.0273522
   7 │ angle(Y,gravityMean)               0.0253365
   8 │ tGravityAcc-mean()-Y               0.0240191
   9 │ tGravityAcc-mean()-X               0.0212595
  10 │ tBodyAccJerkMag-energy()           0.0149949
  11 │ tGravityAcc-energy()-Y             0.0147378
  12 │ fBodyAccJerk-bandsEnergy()-1,16    0.0126455
  13 │ fBodyAccMag-mad()                  0.0121971
  14 │ fBodyAccMag-std()                  0.0110797
  15 │ fBodyAcc-bandsEnergy()-1,24        

In [29]:
# Obtener las importancias de las variables seleccionadas por XGBoost
importancias = xgb_model.feature_importances_
features_df = DataFrame(variable=names(train_df)[1:end-1], importance=importancias)

# Ordenar por importancia
sort!(features_df, :importance, rev=true)

# Mostrar las características más importantes
println(features_df)


[1m562×2 DataFrame[0m
[1m Row [0m│[1m variable                          [0m[1m importance  [0m
     │[90m String                            [0m[90m Float32     [0m
─────┼────────────────────────────────────────────────
   1 │ fBodyAccJerk-std()-Y               0.154659
   2 │ fBodyAcc-skewness()-X              0.0731409
   3 │ tGravityAcc-min()-X                0.0639002
   4 │ fBodyAcc-bandsEnergy()-1,8.2       0.0529609
   5 │ fBodyAccJerk-bandsEnergy()-1,24.1  0.0497774
   6 │ fBodyAccMag-mad()                  0.0457732
   7 │ tBodyAcc-std()-Y                   0.034913
   8 │ fBodyGyro-std()-X                  0.0314306
   9 │ tGravityAcc-arCoeff()-Z,1          0.0252138
  10 │ fBodyGyro-mad()-X                  0.0221549
  11 │ fBodyAccJerk-mad()-Z               0.0214349
  12 │ fBodyAccMag-std()                  0.0205599
  13 │ fBodyAcc-energy()-X                0.0201363
  14 │ angle(Y,gravityMean)               0.0146016
  15 │ tGravityAcc-max()-X                0

13. Realice un contraste de hipótesis para determinar cuál de los modelos y extraiga
las conclusiones. 

In [24]:
# Precisión de los modelos (sustituye por los valores reales obtenidos)
valores = Dict(
    "RF" => rf_accuracy,
    "KNN" => accuracys_modelos["KNN"],
    "SVM" => accuracys_modelos["SVM"],
    "MLP" => accuracys_modelos["MLP"],
    "Bagging" => accuracys_modelos["Bagging"],
    "AdaBoost" => accuracys_modelos["AdaBoost"],
    "HardVoting" => hard_voting_accuracy,
    "SoftVoting" => soft_voting_accuracy,
    "XGBoost" => xgb_accuracy,
    "LightGBM" => lightgbm_accuracy,
    "CatBoost" => catboost_accuracy
)

# Extraer los nombres y valores
nombres_modelos = collect(keys(valores))
valores_accuracy = collect(values(valores))

# Identificar el mejor modelo
nombre_mejor_modelo = nombres_modelos[argmax(valores_accuracy)]
mejor_accuracy = maximum(valores_accuracy)

println("El mejor modelo es: $nombre_mejor_modelo con accuracy = $mejor_accuracy")

El mejor modelo es: CatBoost con accuracy = 0.9541284403669725


In [27]:
# Comparar el mejor modelo contra los demás usando Wilcoxon Signed-Rank Test
for (model, accuracy) in zip(nombres_modelos, valores_accuracy)
    if model != nombre_mejor_modelo
        println("xxxxxxxx")
        println("\nComparando $nombre_mejor_modelo contra $model:")

        # Crear los vectores de diferencias
        diferencias = [mejor_accuracy - accuracy for _ in 1:length(Y_test)]

        # Aplicar el test de Wilcoxon
        test_result = SignedRankTest(diferencias)

        # Obtener el valor p utilizando la función pvalue
        p_valor = pvalue(test_result)

        # Imprimir los resultados del test
        println("Wilcoxon Statistic: $test_result")
        println("p-value: $p_valor")

        if p_valor < 0.05
            println("Diferencia significativa encontrada (p < 0.05).")
        else
            println("No hay diferencia significativa (p >= 0.05).")
        end
    end
end


Comparando CatBoost contra SVM:
Wilcoxon Statistic: Approximate Wilcoxon signed rank test
-------------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    point estimate:          0.0326198
    95% confidence interval: (0.03262, 0.03262)

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           <1e-99

Details:
    number of observations:      981
    Wilcoxon rank-sum statistic: 481671.0
    rank sums:                   [481671.0, 0.0]
    adjustment for ties:         9.44075e8
    normal approximation (μ, σ): (2.40836e5, 7689.29)

p-value: 2.427202495544136e-215
Diferencia significativa encontrada (p < 0.05).

Comparando CatBoost contra HardVoting:
Wilcoxon Statistic: Approximate Wilcoxon signed rank test
-------------------------------------
Population details:
    parameter of interest:   Location parameter (pseudomedian)
    value under h_0:         0
    p