# Optimal KNN Imputation

## Read train-test files

In [None]:
using DataFrames
using CSV
using DataFrames
using StatsBase

# Load the training CSV file into a data frame
train = CSV.read("../data/train.csv", DataFrame)

# Preview the first rows of the training data. Only `train` is loaded in this
# notebook: previewing `test` and `data_dict` here referenced variables that
# are never defined and raised an UndefVarError.
first(train, 5)

## Preprocess data

In [7]:
# Names of the columns that are handled as categorical features
cat_c = [
    "Basic_Demos-Enroll_Season",
    "CGAS-Season",
    "Physical-Season",
    "Fitness_Endurance-Season",
    "FGC-Season",
    "BIA-Season",
    "PAQ_A-Season",
    "PAQ_C-Season",
    "SDS-Season",
    "PreInt_EduHx-Season",
    "Basic_Demos-Sex",
    "FGC-FGC_CU_Zone",
    "FGC-FGC_GSND_Zone",
    "FGC-FGC_GSD_Zone",
    "FGC-FGC_PU_Zone",
    "FGC-FGC_SRL_Zone",
    "FGC-FGC_SRR_Zone",
    "FGC-FGC_TL_Zone",
    "BIA-BIA_Activity_Level_num",
    "BIA-BIA_Frame_num",
]

# Identifier column; kept out of both the numeric and the categorical group
id_col = ["id"]

# Every column whose name starts with "PCIAT-PCIAT", plus "sii" and "PCIAT-Season"
pciat = [name for name in names(train) if startswith(name, "PCIAT-PCIAT")]
push!(pciat, "sii", "PCIAT-Season")

# Whatever is not categorical, PCIAT-related or the id is treated as numeric
columns_float = setdiff(names(train), union(cat_c, pciat, id_col))

# Take independent copies of the categorical and the numeric part of `train`
train_filtered_cat = train[:, cat_c]
train_filtered_float = train[:, columns_float]

# Coerce every numeric column to Float64. Each value is round-tripped through
# `string` so that numbers and text cells are handled alike; anything that does
# not parse as a Float64 (including `missing`, which prints as "missing")
# becomes `missing`. `tryparse` reports failure by returning `nothing`, so no
# exception is raised or silently swallowed along the way.
for col in names(train_filtered_float)
    train_filtered_float[!, col] = map(train_filtered_float[!, col]) do x
        parsed = tryparse(Float64, string(x))
        return parsed === nothing ? missing : parsed
    end
end

## Impute and save

In [10]:
"""
    calculate_mode(column)

Return the mode of `column`, computed over its non-missing entries only.
"""
calculate_mode(column) = mode(skipmissing(column))

# Replace the missing entries of each categorical column with that column's mode
for name in names(train_filtered_cat)
    values = train_filtered_cat[!, name]
    fill_value = calculate_mode(values)
    # `coalesce` keeps non-missing entries and substitutes the mode otherwise
    train_filtered_cat[!, name] = map(v -> coalesce(v, fill_value), values)
end

In [None]:
# Create the imputation learner using the "opt_knn" method; the fixed
# random_seed is passed so repeated runs use the same seed.
# NOTE(review): `IAI` is not brought into scope by any `using`/`import` in this
# file — presumably it is provided by the Julia environment; confirm.
lnr = IAI.OptKNNImputationLearner(method = "opt_knn", random_seed=12)

In [None]:
# Fit the learner in place on the numeric columns only (categorical, PCIAT and
# id columns are not part of `train_filtered_float`)
IAI.fit!(lnr, train_filtered_float)

In [None]:
# Apply the fitted learner to the same numeric columns it was fitted on;
# presumably this returns the data with missing values filled in — confirm
# against the IAI documentation
completed_data = IAI.transform(lnr, train_filtered_float)

In [None]:
# Extend `cat_c` in place with the PCIAT and id column names so it lists every
# column that was kept out of the numeric imputation.
# NOTE: `append!` mutates `cat_c`; re-running this cell appends the same names
# again and leaves duplicates in the vector.
append!(cat_c, pciat, id_col)

In [None]:
# Combine the imputed numeric columns, the mode-imputed categorical columns and
# the columns that were left out of imputation. The categorical columns are
# taken from `train_filtered_cat`: reading them from `train` again would
# silently discard the mode imputation performed earlier.
# The remaining columns are derived from `names(train)` so the result does not
# depend on how often `cat_c` has been extended.
untouched_cols = setdiff(names(train), names(completed_data), names(train_filtered_cat))
train_final = hcat(completed_data, train_filtered_cat, train[:, untouched_cols])
# Reorder columns to match the original dataset order
train_final = train_final[:, names(train)]

In [None]:

# Save the final DataFrame to a CSV file
# NOTE(review): the path is relative to the working directory and the
# "../training_sets" folder presumably has to exist beforehand — confirm.
CSV.write("../training_sets/imputed_train_optimal_knn.csv", train_final)