In [1]:
import pandas as pd
import numpy as np 
from fancyimpute import SoftImpute
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv("all_participants_ts_extract_train.csv")
test = pd.read_csv("all_participants_ts_extract_test.csv")

In [3]:
covariates = pd.read_csv("../data/train.csv")

In [4]:
train["id"] = train["kid_id"]
train.drop("kid_id", axis=1, inplace=True)

In [60]:
imputed_columns = train.columns.drop(["id", "index"]).to_list()

In [61]:
joined_df = pd.merge(train, covariates, on="id")

In [62]:
shuffle_df = joined_df.sample(frac=1)

In [63]:
cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 
        'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']
pciat = covariates.columns[covariates.columns.str.startswith('PCIAT-PCIAT')].tolist() + ['sii', "PCIAT-Season"]

joined_df_clean = pd.concat([joined_df, pd.get_dummies(joined_df[cat_c]).astype(int)], axis=1)
to_drop = ["id"] + cat_c + ["index"]
joined_df_clean = joined_df_clean.drop(to_drop, axis=1)
joined_df_clean = joined_df_clean.dropna(subset=pciat)
joined_df_clean = joined_df_clean.drop(pciat, axis=1)

In [64]:
soft_impute = SoftImpute(verbose=True)
scaler = StandardScaler()

shuffle_df_scaled = scaler.fit_transform(joined_df_clean)

val_data = shuffle_df_scaled[:200].copy()
shuffle_df_scaled[:200, :len(imputed_columns)-1] = np.nan

imputed_data = soft_impute.fit_transform(shuffle_df_scaled)

pred_val = imputed_data[:200]

[SoftImpute] Max Singular Value of X_init = 99.344095
[SoftImpute] Iter 1: observed MAE=0.040265 rank=117
[SoftImpute] Iter 2: observed MAE=0.040424 rank=117
[SoftImpute] Iter 3: observed MAE=0.040493 rank=116
[SoftImpute] Iter 4: observed MAE=0.040538 rank=116
[SoftImpute] Iter 5: observed MAE=0.040576 rank=116
[SoftImpute] Iter 6: observed MAE=0.040607 rank=116
[SoftImpute] Iter 7: observed MAE=0.040633 rank=116
[SoftImpute] Iter 8: observed MAE=0.040656 rank=116
[SoftImpute] Iter 9: observed MAE=0.040676 rank=116
[SoftImpute] Iter 10: observed MAE=0.040694 rank=116
[SoftImpute] Iter 11: observed MAE=0.040710 rank=116
[SoftImpute] Iter 12: observed MAE=0.040723 rank=116
[SoftImpute] Iter 13: observed MAE=0.040735 rank=116
[SoftImpute] Iter 14: observed MAE=0.040746 rank=116
[SoftImpute] Iter 15: observed MAE=0.040755 rank=116
[SoftImpute] Iter 16: observed MAE=0.040763 rank=116
[SoftImpute] Iter 17: observed MAE=0.040770 rank=116
[SoftImpute] Iter 18: observed MAE=0.040777 rank=116
[

In [65]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

results = []
for i, feature in enumerate(imputed_columns):
    print(f"Feature: {feature}")
    print(f"RMSE: {mean_squared_error(val_data[:, i], pred_val[:, i], squared=False)}")
    print(f"R2: {r2_score(val_data[:, i], pred_val[:, i])}")
    print(f"MAPE: {mean_absolute_percentage_error(val_data[:, i], pred_val[:, i])}")
    print("")
    results.append(pd.DataFrame({ "RMSE": mean_squared_error(val_data[:, i], pred_val[:, i]),
                              "R2": r2_score(val_data[:, i], pred_val[:, i]),
                              "MAPE": mean_absolute_percentage_error(val_data[:, i], pred_val[:, i])},
                                index=[feature]
                              )
                    )
    
results_df = pd.concat(results)

Feature: enmo__sum_values
RMSE: 1.0369886415253375
R2: -0.018429021062556172
MAPE: 1.1870669717698015

Feature: enmo__median
RMSE: 0.41873892804757207
R2: -0.1897806832198219
MAPE: 1.2261997425266202

Feature: enmo__mean
RMSE: 1.009169387451092
R2: -0.0016603429727433117
MAPE: 1.0245988731365367

Feature: enmo__length
RMSE: 0.7766782446373631
R2: 0.12280951950575048
MAPE: 1.1260150294286835

Feature: enmo__standard_deviation
RMSE: 1.0093339422195595
R2: -0.007381160254596564
MAPE: 1.0232578733124453

Feature: enmo__variance
RMSE: 1.0786399466517997
R2: 0.01594975743990512
MAPE: 1.0410115704522405

Feature: enmo__root_mean_square
RMSE: 1.0095887390642981
R2: -0.00727264868580324
MAPE: 1.0211166136725653

Feature: enmo__maximum
RMSE: 0.9832024378204703
R2: -0.021127323002815057
MAPE: 1.0572635147461917

Feature: enmo__absolute_maximum
RMSE: 0.9832024378204702
R2: -0.021127323002814835
MAPE: 1.0572635147461915

Feature: enmo__minimum
RMSE: 0.11806139055545037
R2: -2.894911516487445e+32
MA



In [66]:
results_df

Unnamed: 0,RMSE,R2,MAPE
enmo__sum_values,1.075345,-0.01842902,1.187067
enmo__median,0.175342,-0.1897807,1.2262
enmo__mean,1.018423,-0.001660343,1.024599
enmo__length,0.603229,0.1228095,1.126015
enmo__standard_deviation,1.018755,-0.00738116,1.023258
enmo__variance,1.163464,0.01594976,1.041012
enmo__root_mean_square,1.019269,-0.007272649,1.021117
enmo__maximum,0.966687,-0.02112732,1.057264
enmo__absolute_maximum,0.966687,-0.02112732,1.057264
enmo__minimum,0.013938,-2.894912e+32,2.710005


In [68]:
print("Top 5 features with lowest RMSE")

display(results_df.sort_values("RMSE").head(5))

print("Top 5 features with highest R2")

display(results_df.sort_values("R2", ascending=False).head(5))

print("Top 5 features with lowest MAPE")

display(results_df.sort_values("MAPE").head(5))

Top 5 features with lowest RMSE


Unnamed: 0,RMSE,R2,MAPE
avg_movement_night,0.0,1.0,0.0
enmo__minimum,0.013938,-2.894912e+32,2.710005
enmo__median,0.175342,-0.1897807,1.2262
relative_date_PCIAT__variance,0.195729,-0.04892825,1.104596
relative_date_PCIAT__standard_deviation,0.358618,-0.01012863,1.393073


Top 5 features with highest R2


Unnamed: 0,RMSE,R2,MAPE
avg_movement_night,0.0,1.0,0.0
avg_movement_day,0.59701,0.357946,1.792402
light__length,0.603229,0.12281,1.126015
weekday__length,0.603229,0.12281,1.126015
relative_date_PCIAT__length,0.603229,0.12281,1.126015


Top 5 features with lowest MAPE


Unnamed: 0,RMSE,R2,MAPE
avg_movement_night,0.0,1.0,0.0
light__variance,0.824699,0.112223,0.962345
hour__maximum,0.990269,0.017018,0.982532
hour__absolute_maximum,0.991388,0.016439,0.982727
weekday__standard_deviation,0.512959,-0.050041,0.994766
