In [None]:
using CSV
using DataFrames
using Random

include("../src/data.jl");
Random.seed!(104);

### Creación del dataset

In [None]:
# Obtain the working dataset. When the cached CSV already exists it is loaded
# through the project helper; otherwise the dataset is built from the raw CSV
# folder and cached so later runs can skip the build step.
# NOTE(review): presumably `loadDataset("dataset", "../resources")` reads
# "../resources/dataset.csv" -- confirm in src/data.jl.
if isfile("../resources/dataset.csv")
    dataset = loadDataset("dataset", "../resources")
else
    dataset_folder = "../resources/csv_files"
    dataset = CreateDataset(dataset_folder)
    CSV.write("../resources/dataset.csv", dataset)
end;

### Summary

In [None]:
# Basic dimensions of the dataset: rows (instances) and columns.
n_ins = size(dataset, 1);
n_vars = size(dataset, 2);
# The first column is used as the individual identifier.
n_inv = length(unique(dataset[!, 1]));
# Print each summary line. Two columns are excluded from the variable count;
# this cell treats the first column as the individual id and the last one as
# the class label.
for (label, value) in (
    "Número de variables: " => n_vars - 2,
    "Número de instancias: " => n_ins,
    "Número de individuos: " => n_inv,
    "Número de clases: " => length(unique(dataset[!, n_vars])),
)
    println(label, value)
end;

### Gestión de ausentes

In [None]:
# Per-column report of missing values.
df_cols = eachcol(dataset);
# For every column, the row positions whose value is `missing`.
missing_indx = findall.(ismissing, df_cols);
# Boolean mask over columns: true where the column has at least one missing value.
uncomplete_cols = .!isempty.(missing_indx);
# Print the mask on its own line so it does not run into the first report line.
println(uncomplete_cols);
# Percentage of missing entries in each column.
missing_ratios = (length.(missing_indx) ./ n_ins) * 100;
# Index the vector of names directly; selecting the columns first would copy
# their data only to read the names.
println.("col: ", names(dataset)[uncomplete_cols], "| ratio de nulos: ", missing_ratios[uncomplete_cols], "%");
# Every column has `n_ins` rows, so the mean of the per-column percentages is
# the percentage of missing cells over the whole table.
println("Ratio sobre el total: ", mean(missing_ratios),"%");

In [None]:
# Mean imputation: in every column that has missing entries, replace them with
# the mean of that column's observed values.
# NOTE(review): this relies on `df_cols[uncomplete_cols]` yielding the dataset's
# own column vectors (not copies) so that `replace!` updates `dataset` -- confirm
# against the DataFrames version in use.
# NOTE(review): the mean is computed over the full dataset, before the hold-out
# split performed later in this notebook, so test rows contribute to the fill value.
for column in df_cols[uncomplete_cols]
    fill_value = mean(skipmissing(column))
    replace!(column, missing => fill_value)
end;
println("Valores ausentes sustituidos con la media de cada columna");
# Sanity check: total number of missing entries still present across all columns.
print("Número de ausentes: ", sum(count.(ismissing, df_cols)));

### HoldOut

In [None]:
# Subject-wise hold-out: 10% of the individuals (not of the rows) go to test, so
# no individual contributes rows to both partitions.
train_index, test_index = holdOut(n_inv, 0.10);
# `holdOut` only receives the number of individuals, so its result is treated as
# positions into the sorted list of subject ids rather than as the ids
# themselves. When the ids are exactly 1:n_inv both readings coincide; otherwise
# comparing positions against ids would silently drop rows.
# NOTE(review): assumes `holdOut` returns index collections within 1:n_inv --
# confirm in src/data.jl.
subject_ids = sort(unique(dataset[!, "subject"]));
test_subjects = subject_ids[test_index];
train_subjects = subject_ids[train_index];
# Keep the rows whose subject belongs to each partition.
test_dataset = dataset[in.(dataset[!, "subject"], Ref(test_subjects)), :];
train_dataset = dataset[in.(dataset[!, "subject"], Ref(train_subjects)), :];
println("Individuos en el dataset de test: ", test_subjects);
println("Individuos en el dataset de train: ", train_subjects);
println("Dimensiones de test: ", size(test_dataset), " Dimensiones de train: ", size(train_dataset));

### OneHotEncoding

In [None]:
# Separate each partition into encoded targets (from the "Activity" column) and
# inputs (every remaining column; "subject" is still present at this point).
# NOTE(review): train and test are encoded by two independent calls. If
# `OneHotEncoding` derives the class list from the data it receives, the column
# order could differ between the two partitions -- confirm in src/data.jl.
train_targets = OneHotEncoding(train_dataset.Activity);
train_inputs = select(train_dataset, Not("Activity"); copycols=false);
test_targets = OneHotEncoding(test_dataset.Activity);
test_inputs = select(test_dataset, Not("Activity"); copycols=false);
# Print the target and input dimensions of each partition on one line.
for (targets, inputs) in ((train_targets, train_inputs), (test_targets, test_inputs))
    println(size(targets), size(inputs))
end;

### CrossValidation

In [None]:
# Assign every training row to one of 5 folds, based on its subject.
cross_val_index = crossvalidation(train_inputs[!, "subject"], 5);
# For each fold: (number of rows in the fold, distinct subjects found in it).
cv_info = map(1:5) do fold
    rows_in_fold = findall(==(fold), cross_val_index)
    (length(rows_in_fold), unique(train_inputs[rows_in_fold, "subject"]))
end;
println("Fold | Size  | Individuals");
println("-----+-------+---------------------------");
for (fold, (n_rows, subjects)) in enumerate(cv_info)
    print(fold, "\t", n_rows, "\t", join(subjects, ", "), "\n")
end;
# The subject column was only needed for splitting; drop it from the inputs.
train_inputs = select(train_inputs, Not("subject"); copycols=false)
test_inputs = select(test_inputs, Not("subject"); copycols=false)

### Normalización

In [None]:
# Min-max normalisation. The per-column minimum and maximum are taken from the
# training inputs only and then applied to both partitions.
train_cols = eachcol(train_inputs);
norm_values = minimum.(train_cols), maximum.(train_cols);
col_min = norm_values[1];
col_range = norm_values[2] .- norm_values[1];
# A column that is constant in train has zero range; dividing by it would give
# NaN/Inf. Use a unit scale for those columns so their train values become 0.
col_scale = ifelse.(iszero.(col_range), one.(col_range), col_range);
# The statistics are transposed so they line up with the columns when broadcast.
train_inputs .-= col_min';
train_inputs ./= col_scale';
test_inputs .-= col_min';
test_inputs ./= col_scale'

In [37]:
# Persist the prepared partitions and fold assignment once; skipped when the
# training file already exists.
if !isfile("variables/train_dataset.jld2")
    # `mkpath` does not fail when the directory is already there (e.g. left over
    # from an interrupted run), unlike `mkdir`.
    mkpath("variables")
    @save "variables/train_dataset.jld2" train_inputs train_targets
    @save "variables/test_dataset.jld2" test_inputs test_targets
    # Extension corrected from the misspelled ".jdl2".
    # NOTE(review): update any loader that still refers to the old file name.
    @save "variables/cross_val_index.jld2" cross_val_index
end;
