In [10]:
using DataFrames
using CSV
using DataFrames

# Load the CSV files into data frames.
# The paths are relative, so they resolve against the current working directory.
train = CSV.read("../../../../data/train.csv", DataFrame)
test = CSV.read("../../../../data/test.csv", DataFrame)
data_dict = CSV.read("../../../../data/data_dictionary.csv", DataFrame)

# Preview the first rows of each table. Only the value of a cell's last
# expression is rendered automatically, so the earlier previews have to be
# passed to `display` explicitly; otherwise they are silently discarded.
display(first(train, 5))
display(first(test, 5))
first(data_dict, 5)

Row,Instrument,Field,Description,Type,Values,Value Labels
Unnamed: 0_level_1,String,String,String,String15,String31?,String?
1,Identifier,id,Participant's ID,str,missing,missing
2,Demographics,Basic_Demos-Enroll_Season,Season of enrollment,str,"Spring, Summer, Fall, Winter",missing
3,Demographics,Basic_Demos-Age,Age of participant,float,missing,missing
4,Demographics,Basic_Demos-Sex,Sex of participant,categorical int,01,"0=Male, 1=Female"
5,Children's Global Assessment Scale,CGAS-Season,Season of participation,str,"Spring, Summer, Fall, Winter",missing


In [11]:
# Columns kept out of the imputation input: the per-instrument season labels
# and the participant id.
cat_c = [
    "Basic_Demos-Enroll_Season", "CGAS-Season", "Physical-Season", "Fitness_Endurance-Season",
    "FGC-Season", "BIA-Season", "PAQ_A-Season", "PAQ_C-Season", "SDS-Season",
    "PreInt_EduHx-Season", "id"
]

# Columns whose name starts with "PCIAT-PCIAT", plus "sii" and "PCIAT-Season";
# these are excluded from the imputation input as well.
pciat = filter(name -> startswith(name, "PCIAT-PCIAT"), names(train))
append!(pciat, ["sii", "PCIAT-Season"])

# Every remaining column is treated as numeric.
columns_float = setdiff(names(train), cat_c ∪ pciat)

# Convert a single cell to Float64. Anything whose string form does not parse
# as a number (including `missing`, which prints as "missing") maps to
# `missing`. `tryparse` returns `nothing` on failure, so no exception handling
# is needed for the expected "not a number" case.
to_float_or_missing(x) = something(tryparse(Float64, string(x)), missing)

# `train[:, cols]` copies the selected columns, so `train` itself is untouched.
train_filtered = train[:, columns_float]
for col in columns_float
    train_filtered[!, col] = map(to_float_or_missing, train_filtered[!, col])
end

In [3]:
# Create an unfitted OptKNN imputation learner with a fixed random seed
# (presumably for reproducible results).
# NOTE(review): the learner echoed in this cell's output lists only
# `random_seed` — confirm that the `method` keyword has an effect here.
lnr = IAI.OptKNNImputationLearner(method = "opt_knn", random_seed=12)

Unfitted OptKNNImputationLearner:
  random_seed: 12

In [4]:
# Fit the imputation learner on the numeric-only training columns.
IAI.fit!(lnr, train_filtered)

[33m[1m└ [22m[39mbed1c049b33cf139e0b18189fbbed293e73573685741fed0de8b4df77326d308


Fitted OptKNNImputationLearner

In [5]:
# Run the fitted learner over the same frame it was fitted on and keep the
# returned table as the imputed numeric data.
completed_data = IAI.transform(lnr, train_filtered)

Row,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday
Unnamed: 0_level_1,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?
1,5.0,0.0,51.0,16.8773,46.0,50.8,23.4477,64.684,80.0897,106.879,4.36942,5.79101,40.6556,0.0,0.0,13.9842,1.12079,13.5432,1.0,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,2.59517,2.84191,42.5516,59.8223,3.0
2,9.0,0.0,56.2841,14.0356,48.0,46.0,22.0,75.0,70.0,122.0,4.79128,7.185,25.985,3.0,0.0,14.453,1.74325,15.3152,1.80235,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.65245,2.34,46.0,64.0,0.0
3,10.0,1.0,71.0,16.6487,56.5,75.6,22.9491,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.56896,4.35725,17.9353,1052.82,1703.8,16.3745,54.4027,14.0955,3.83976,15.5481,1.3841,24.951,13.0773,50.0455,23.5917,41.3255,2.0932,2.17,38.0,54.0,2.0
4,9.0,0.0,71.0,18.2923,56.0,81.6,25.2585,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,21.2748,1.97918,22.3619,1.99982,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,2.13632,2.451,31.0,45.0,0.0
5,18.0,1.0,73.9669,23.438,62.0099,128.876,31.0014,72.2672,74.9439,121.168,3.4803,5.76835,42.8226,9.61471,0.184677,16.4854,1.26857,17.9177,1.99886,0.784267,0.103883,10.5618,0.831506,13.0991,0.305549,11.4456,0.919319,2.72381,3.91678,23.8634,1218.79,1912.31,21.0992,72.0804,14.2595,9.60396,46.868,1.8118,29.5601,21.4212,68.1637,40.617,50.6593,1.04,2.90203,31.8138,45.628,1.97489
6,13.0,1.0,50.0,22.28,59.5,112.2,26.1566,60.0,73.0,102.0,3.69355,5.47561,18.7148,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1.80151,4.11,40.0,56.0,0.0
7,10.0,0.0,59.8766,19.6608,55.0,84.6,28.911,123.0,83.0,163.0,4.83017,7.15447,24.3872,9.0,1.0,21.5107,1.79969,22.7108,1.9118,2.0,0.0,11.0,1.0,11.0,1.0,11.0,1.0,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,21.353,2.0,30.8936,16.0259,59.4643,26.1957,47.2211,2.13685,3.67,27.0,40.0,3.0
8,10.0,1.0,61.426,16.8613,59.25,84.2,27.0,71.0,90.0,116.0,3.87483,5.09496,29.5222,0.0,0.0,12.6,2.0,11.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,16.2474,2.0,28.5367,17.476,63.8954,28.768,50.4767,2.06883,1.27,40.9721,57.641,2.0
9,15.0,0.0,64.0644,17.9962,57.1113,85.6728,27.1565,73.5224,79.5182,120.296,2.99511,3.6296,38.1049,5.54743,0.0,20.8597,1.18584,22.1611,1.09235,2.31969,0.0,9.0446,1.0,6.21865,0.0,10.7207,1.0,2.01958,4.12068,18.8231,1187.7,1856.93,17.4061,68.7687,14.5916,4.23146,20.1211,1.85639,32.8862,18.4764,64.6481,31.4849,50.2923,3.10469,2.72699,37.7016,53.4562,2.0
10,19.0,1.0,64.2119,32.3067,63.9324,188.217,41.9932,79.9894,77.4237,133.089,4.43091,6.45305,30.169,9.47749,0.392879,20.3966,1.8711,21.5852,1.95536,3.84716,0.261291,9.22173,0.653574,9.57955,0.56948,9.44452,0.819346,2.07675,4.59472,31.5405,1474.09,2224.81,34.7532,99.2732,17.3405,14.2,80.8996,3.0,38.6657,25.8544,94.6786,52.4025,73.4189,1.58479,2.11712,52.2626,72.6418,3.0


In [6]:
# Extend the held-out column list with the PCIAT columns, so that every column
# skipped by the imputation can be re-attached to the imputed data afterwards.
# `union` builds a new vector instead of mutating `cat_c` in place, which makes
# this cell safe to re-run: the PCIAT names are never appended a second time
# (duplicate names would break the later `train[:, cat_c]` selection).
cat_c = union(cat_c, pciat)

34-element Vector{String}:
 "Basic_Demos-Enroll_Season"
 "CGAS-Season"
 "Physical-Season"
 "Fitness_Endurance-Season"
 "FGC-Season"
 "BIA-Season"
 "PAQ_A-Season"
 "PAQ_C-Season"
 "SDS-Season"
 "PreInt_EduHx-Season"
 "id"
 "PCIAT-PCIAT_01"
 "PCIAT-PCIAT_02"
 ⋮
 "PCIAT-PCIAT_12"
 "PCIAT-PCIAT_13"
 "PCIAT-PCIAT_14"
 "PCIAT-PCIAT_15"
 "PCIAT-PCIAT_16"
 "PCIAT-PCIAT_17"
 "PCIAT-PCIAT_18"
 "PCIAT-PCIAT_19"
 "PCIAT-PCIAT_20"
 "PCIAT-PCIAT_Total"
 "sii"
 "PCIAT-Season"

In [7]:
# Sanity check: how many columns are being held out of the imputation.
size(cat_c)

(34,)

In [8]:
# Re-attach the columns that were held out of the imputation to the imputed
# numeric columns. `hcat` pairs rows purely by position.
held_out = train[:, cat_c]
combined = hcat(completed_data, held_out)

# Put the columns back into the order used by the original training table.
train_final = select(combined, names(train))

Row,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
Unnamed: 0_level_1,String15,String7,Float64?,Float64?,String7?,Float64?,String7?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,String7?,Float64?,Float64?,Float64?,String7?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,String7?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,String7?,Float64?,String7?,Float64?,String7?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,String7?,Float64?,Float64?,String7?,Float64?,Int64?
1,00008ff9,Fall,5.0,0.0,Winter,51.0,Fall,16.8773,46.0,50.8,23.4477,64.684,80.0897,106.879,missing,4.36942,5.79101,40.6556,Fall,0.0,0.0,13.9842,1.12079,13.5432,1.0,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,missing,2.59517,missing,2.84191,Fall,5,4,4,0,4,0,0,4,0,0,4,0,4,4,4,4,4,4,2,4,55,missing,42.5516,59.8223,Fall,3.0,2
2,000fd460,Summer,9.0,0.0,missing,56.2841,Fall,14.0356,48.0,46.0,22.0,75.0,70.0,122.0,missing,4.79128,7.185,25.985,Fall,3.0,0.0,14.453,1.74325,15.3152,1.80235,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,missing,1.65245,Fall,2.34,Fall,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fall,46.0,64.0,Summer,0.0,0
3,00105258,Summer,10.0,1.0,Fall,71.0,Fall,16.6487,56.5,75.6,22.9491,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,missing,2.56896,4.35725,17.9353,1052.82,1703.8,16.3745,54.4027,14.0955,3.83976,15.5481,1.3841,24.951,13.0773,50.0455,23.5917,41.3255,missing,2.0932,Summer,2.17,Fall,5,2,2,1,2,1,1,2,1,1,1,0,1,1,1,0,2,2,1,1,28,Fall,38.0,54.0,Summer,2.0,0
4,00115b9f,Winter,9.0,0.0,Fall,71.0,Summer,18.2923,56.0,81.6,25.2585,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,21.2748,1.97918,22.3619,1.99982,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,missing,2.13632,Winter,2.451,Summer,4,2,4,0,5,1,0,3,2,2,3,0,3,0,0,3,4,3,4,1,44,Summer,31.0,45.0,Winter,0.0,1
5,0016bb22,Spring,18.0,1.0,Summer,73.9669,missing,23.438,62.0099,128.876,31.0014,72.2672,74.9439,121.168,missing,3.4803,5.76835,42.8226,missing,9.61471,0.184677,16.4854,1.26857,17.9177,1.99886,0.784267,0.103883,10.5618,0.831506,13.0991,0.305549,11.4456,0.919319,missing,2.72381,3.91678,23.8634,1218.79,1912.31,21.0992,72.0804,14.2595,9.60396,46.868,1.8118,29.5601,21.4212,68.1637,40.617,50.6593,Summer,1.04,missing,2.90203,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,31.8138,45.628,missing,1.97489,missing
6,001f3379,Spring,13.0,1.0,Winter,50.0,Summer,22.28,59.5,112.2,26.1566,60.0,73.0,102.0,missing,3.69355,5.47561,18.7148,Summer,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,Summer,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,missing,1.80151,Spring,4.11,Summer,3,3,3,0,2,1,0,2,2,1,0,1,3,3,2,1,3,1,2,1,34,Summer,40.0,56.0,Spring,0.0,1
7,0038ba98,Fall,10.0,0.0,missing,59.8766,Fall,19.6608,55.0,84.6,28.911,123.0,83.0,163.0,missing,4.83017,7.15447,24.3872,Fall,9.0,1.0,21.5107,1.79969,22.7108,1.9118,2.0,0.0,11.0,1.0,11.0,1.0,11.0,1.0,Fall,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,21.353,2.0,30.8936,16.0259,59.4643,26.1957,47.2211,missing,2.13685,Winter,3.67,Winter,1,4,1,0,2,1,0,1,0,0,0,0,0,0,0,4,1,4,1,0,20,Winter,27.0,40.0,Fall,3.0,0
8,0068a485,Fall,10.0,1.0,missing,61.426,Fall,16.8613,59.25,84.2,27.0,71.0,90.0,116.0,missing,3.87483,5.09496,29.5222,Fall,0.0,0.0,12.6,2.0,11.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,Fall,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,16.2474,2.0,28.5367,17.476,63.8954,28.768,50.4767,missing,2.06883,Fall,1.27,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,40.9721,57.641,Fall,2.0,missing
9,0069fbed,Summer,15.0,0.0,missing,64.0644,Spring,17.9962,57.1113,85.6728,27.1565,73.5224,79.5182,120.296,missing,2.99511,3.6296,38.1049,Spring,5.54743,0.0,20.8597,1.18584,22.1611,1.09235,2.31969,0.0,9.0446,1.0,6.21865,0.0,10.7207,1.0,missing,2.01958,4.12068,18.8231,1187.7,1856.93,17.4061,68.7687,14.5916,4.23146,20.1211,1.85639,32.8862,18.4764,64.6481,31.4849,50.2923,missing,3.10469,missing,2.72699,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,37.7016,53.4562,Summer,2.0,missing
10,0083e397,Summer,19.0,1.0,Summer,64.2119,missing,32.3067,63.9324,188.217,41.9932,79.9894,77.4237,133.089,missing,4.43091,6.45305,30.169,missing,9.47749,0.392879,20.3966,1.8711,21.5852,1.95536,3.84716,0.261291,9.22173,0.653574,9.57955,0.56948,9.44452,0.819346,missing,2.07675,4.59472,31.5405,1474.09,2224.81,34.7532,99.2732,17.3405,14.2,80.8996,3.0,38.6657,25.8544,94.6786,52.4025,73.4189,missing,1.58479,missing,2.11712,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,52.2626,72.6418,missing,3.0,missing


In [9]:

# Persist the reassembled training table as a CSV file (relative path, so it
# lands in the current working directory).
train_final |> CSV.write("optimal_imputed_train.csv")

"optimal_imputed_train.csv"