In [62]:
# standard libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

# Data imputation
from sklearn.impute import SimpleImputer, KNNImputer

In [63]:
# Univariate imputers: each column is filled from its own statistic (or a constant)
imp_mean, imp_median = (SimpleImputer(strategy=name) for name in ("mean", "median"))
imp_0 = SimpleImputer(fill_value=0, strategy="constant")

# KNN imputers: one per neighbourhood size, all weighting neighbours by
# inverse distance (weights="distance")
imp_knn5, imp_knn4, imp_knn3 = (
    KNNImputer(n_neighbors=k, weights="distance") for k in (5, 4, 3)
)


In [64]:
# Load the country-level dataset (test-score averages, GDP and education
# indicators) and preview its first five rows
df = pd.read_csv(filepath_or_buffer='df_0723.csv')
df.head(n=5)

Unnamed: 0,COUNTRY,ID,READING_AVG,MATH_AVG,SCIENCE_AVG,GDP_2018,P_Enrollment_Totals,LS_completion,LS_completion_male,LS_completion_female,...,LS_national_assessment,P_annual_exp,LS_annual_exp,expect_degree,test_anxiety,study_habits,expect_good_grades,home_comp_internet,home_desk_quiet,home_schoolbooks
0,United States,USA,505.0,478,502,62605.592,24957955.89,98.75227,98.901003,98.606098,...,1.0,11727.02375,12693.44975,76.0,67.7,86.9,94.3,85.2,73.3,74.1
1,Russia,USSR-RUS,479.0,488,478,29266.855,6927980.0,99.41,99.63,99.17,...,,,,16.9,51.1,91.8,80.9,94.4,88.2,94.5
2,Belarus,USSR-BLR,474.0,472,471,20003.029,427752.0,99.93871,99.85384,100.0,...,0.0,,,,,,,,,
3,Israel,ISR,470.0,463,462,37972.002,936214.0,98.07,99.36,96.93,...,,7971.310744,,57.0,44.5,76.9,96.0,91.1,90.2,83.1
4,Ukraine,USSR-UKR,466.0,453,469,9283.433,1676550.0,99.45947,99.51151,99.41758,...,1.0,2612.067455,2511.505638,,,,,,,


In [65]:
# Group the columns of df by the role they play
id_columns = ['COUNTRY', 'ID']
outcome_columns = ['READING_AVG', 'MATH_AVG', 'SCIENCE_AVG']
control_columns = ['GDP_2018']

# Drivers are whatever is left once ids, outcomes and controls are removed;
# np.setdiff1d returns the remaining names sorted and de-duplicated.
driver_columns = np.setdiff1d(
    df.columns.tolist(),
    [*id_columns, *outcome_columns, *control_columns],
).tolist()

# Imputation

Possible Imputation Guidelines:

        1-10 missing  --> median --- won't change variance in data due to low missing vals
        11-22 missing --> knn (n = 5, weights = 'distance') --- ~50 vals available to impute from
        23-30 missing --> knn (n = 4, weights = 'distance') --- ~40 vals available to impute from
        31+ missing   --> knn (n = 3, weights = 'distance') --- ~30 vals available to impute from

In [88]:
# Impute into a separate deep copy so the raw frame `df` stays available
# for the before/after comparison
imp_df = df.copy(deep=True)

In [89]:
# Columns grouped by the imputation method suggested in the guidelines above.
# NOTE(review): P_Enrollment_Totals is in none of these lists and still shows
# 67 non-null values after imputation -- confirm that leaving it unimputed is intended.

# Few missing values -> median
median_imp = [
    'READING_AVG',
    'GDP_2018',
]

# KNN with 5 neighbours
knn5_imp = [
    'LS_completion',
    'LS_completion_male',
    'LS_completion_female',
    'LS_ta_presence',
    'P_ta_presence',
    'LS_guard_presence',
    'P_sped_ta_presence',
    'LS_sped_ta_presence',
    'salary_per_instruction_hr',
]

# KNN with 4 neighbours
knn4_imp = [
    'P_national_assessment',
    'LS_national_assessment',
    'P_annual_exp',
    'LS_annual_exp',
]

# KNN with 3 neighbours
knn3_imp = [
    'expect_degree',
    'test_anxiety',
    'study_habits',
    'expect_good_grades',
    'home_comp_internet',
    'home_desk_quiet',
    'home_schoolbooks',
    'teacher_sent _societal_contribution',  # the space is part of the column name in the data
    'teacher_need_multicultural_pd',
    'teacher_PD_12M_Value',
    'principal_PD_12M_Value',
]

In [68]:
# Median imputation for the low-missingness columns, written into imp_df

# Fit on the raw values; each column's gaps are filled with that column's median.
med_imp = imp_median.fit_transform(df[median_imp])
# Build the frame on imp_df's own index: assigning a DataFrame aligns rows on
# index labels, so a fresh 0..n-1 index would only line up by coincidence.
imp_med = pd.DataFrame(med_imp, columns=median_imp, index=imp_df.index)
imp_df[median_imp] = imp_med

I noticed that the KNN imputer only uses the columns passed to `fit_transform()` when measuring the distance between rows to find neighbors. This is why each "formula" list contains the previous formula's columns (already imputed) plus the new columns that are the focus of that imputation step.

In [69]:
# Formula 1: outcomes + control + the KNN-5 target columns, imputed with k = 5
# NOTE(review): KNNImputer's default distance is computed on the raw values, so
# large-magnitude columns such as GDP_2018 likely dominate neighbour selection --
# consider standardising the columns before imputing.
formula1 = outcome_columns + control_columns + knn5_imp
knn_imp5 = imp_knn5.fit_transform(imp_df[formula1])
# Build the frame on imp_df's own index: assigning a DataFrame aligns rows on
# index labels, so a fresh 0..n-1 index would only line up by coincidence.
imp_5 = pd.DataFrame(knn_imp5, columns=formula1, index=imp_df.index)
imp_df[formula1] = imp_5

In [70]:
# Formula 2: formula 1 columns (already imputed) + the KNN-4 target columns,
# imputed with k = 4
formula2 = outcome_columns + control_columns + knn5_imp + knn4_imp
knn_imp4 = imp_knn4.fit_transform(imp_df[formula2])
# Build the frame on imp_df's own index: assigning a DataFrame aligns rows on
# index labels, so a fresh 0..n-1 index would only line up by coincidence.
imp_4 = pd.DataFrame(knn_imp4, columns=formula2, index=imp_df.index)
imp_df[formula2] = imp_4

In [71]:
# Formula 3: formula 2 columns (already imputed) + the KNN-3 target columns,
# imputed with k = 3
formula3 = outcome_columns + control_columns + knn5_imp + knn4_imp + knn3_imp
knn_imp3 = imp_knn3.fit_transform(imp_df[formula3])
# Build the frame on imp_df's own index: assigning a DataFrame aligns rows on
# index labels, so a fresh 0..n-1 index would only line up by coincidence.
imp_3 = pd.DataFrame(knn_imp3, columns=formula3, index=imp_df.index)
imp_df[formula3] = imp_3

# Comparing Previous Averages to Imputed Averages

In [87]:
# Summary statistics of the raw data (before imputation); quartiles are the
# describe() defaults, spelled out
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,READING_AVG,MATH_AVG,SCIENCE_AVG,GDP_2018,P_Enrollment_Totals,LS_completion,LS_completion_male,LS_completion_female,teacher_sent _societal_contribution,teacher_need_multicultural_pd,...,LS_national_assessment,P_annual_exp,LS_annual_exp,expect_degree,test_anxiety,study_habits,expect_good_grades,home_comp_internet,home_desk_quiet,home_schoolbooks
count,73.0,74.0,74.0,73.0,67.0,57.0,57.0,57.0,39.0,39.0,...,50.0,51.0,48.0,39.0,40.0,40.0,40.0,43.0,43.0,43.0
mean,452.027397,457.459459,456.945946,38152.755137,2672815.0,92.020157,93.126493,90.971811,89.958974,16.892308,...,0.6,6845.181356,7570.797835,45.164103,57.025,81.46,83.975,84.593023,82.916279,83.839535
std,53.168749,57.086138,52.042894,25845.824321,5406551.0,14.755536,14.106214,15.570077,7.326725,10.504971,...,0.494872,3698.441062,4631.277701,17.054101,11.666625,6.98404,10.250572,18.015343,12.500322,7.125445
min,340.0,325.0,336.0,7304.503,26532.0,0.0,0.0,0.0,65.6,3.6,...,0.0,847.356358,847.356358,16.9,33.5,59.8,60.6,19.3,43.2,60.2
25%,412.0,414.75,417.5,19043.317,200158.5,91.193333,92.366667,90.016667,86.15,10.3,...,0.0,3988.708022,3437.641245,33.75,47.725,78.125,77.05,86.2,80.3,80.85
50%,466.0,463.5,466.0,31938.663,514664.0,97.116667,97.816667,96.48,92.3,14.3,...,1.0,6877.31329,6853.625612,42.8,55.8,82.3,84.3,92.0,88.2,85.1
75%,495.0,499.75,495.75,52120.535,2795060.0,98.876667,99.283333,98.98,95.7,20.8,...,1.0,8603.787121,10591.719512,57.7,64.05,86.95,92.9,95.3,91.65,88.3
max,555.0,591.0,590.0,130475.069,29425750.0,100.0,100.0,100.0,98.8,45.9,...,1.0,20892.09391,21123.6185,76.3,81.2,91.8,97.7,98.2,93.2,95.3


In [75]:
# Summary statistics after imputation, for comparison with the raw summary;
# quartiles are the describe() defaults, spelled out
imp_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,READING_AVG,MATH_AVG,SCIENCE_AVG,GDP_2018,P_Enrollment_Totals,LS_completion,LS_completion_male,LS_completion_female,teacher_sent _societal_contribution,teacher_need_multicultural_pd,...,LS_national_assessment,P_annual_exp,LS_annual_exp,expect_degree,test_anxiety,study_habits,expect_good_grades,home_comp_internet,home_desk_quiet,home_schoolbooks
count,74.0,74.0,74.0,74.0,67.0,74.0,74.0,74.0,74.0,74.0,...,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0
mean,452.216216,457.459459,456.945946,38068.780919,2672815.0,92.732094,93.837486,91.691703,91.903585,20.210328,...,0.62719,6390.10545,7705.6869,48.239311,60.339155,82.916624,86.754258,76.628234,77.583955,82.405378
std,52.8283,57.086138,52.042894,25678.350464,5406551.0,13.245104,12.558288,14.097315,6.230617,12.35039,...,0.425206,3832.44778,4914.158757,15.140701,12.348996,6.020553,9.432825,23.274991,15.653052,7.434984
min,340.0,325.0,336.0,7304.503,26532.0,0.0,0.0,0.0,65.6,3.6,...,0.0,847.356358,847.356358,16.9,33.5,59.8,60.6,19.3,43.2,60.2
25%,412.25,414.75,417.5,19151.60875,200158.5,92.594701,92.865833,92.157681,89.055368,10.550521,...,0.0,3125.528937,3205.790739,38.209846,51.525,80.126189,80.039979,58.391431,65.177449,80.039245
50%,466.0,463.5,466.0,31938.663,514664.0,96.960162,97.763034,96.433333,92.867824,14.6,...,0.843054,6068.171296,6681.537449,47.643827,58.267824,84.188758,88.258049,90.530127,84.168453,84.493127
75%,494.5,499.75,495.75,51503.19525,2795060.0,98.72,99.080251,98.754399,97.105882,26.838911,...,1.0,8609.214456,12056.536514,63.048462,71.875,87.175,96.140825,94.35,90.473534,86.575
max,555.0,591.0,590.0,130475.069,29425750.0,100.0,100.0,100.0,98.8,45.9,...,1.0,20892.09391,21123.6185,76.3,81.2,91.8,97.7,98.2,93.2,95.3


In [85]:
# Write the imputed frame to disk without the row index
imp_df.to_csv(path_or_buf='imputed.csv', index=False)