In [1]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned train/test splits from the project data directory.
DATA_DIR = Path("adults_data")
df_train = pd.read_csv(DATA_DIR / "adults_cleaned_train_data.csv")
df_test = pd.read_csv(DATA_DIR / "adults_cleaned_test_data.csv")

In [3]:
def data_transformation_for_KNN(data_to_trans):
    """Turn a cleaned adults frame into a numeric matrix suitable for KNN imputation.

    The input frame is never modified. The ``income`` labels ``"<=50K"`` and
    ``">50K"`` are mapped to 0 and 1; any other value (for example an income
    that is already 0/1) is passed through unchanged. Every object, string or
    categorical column is then label-encoded. Missing entries in those columns
    are left as NaN so that an imputer can fill them in.

    Parameters
    ----------
    data_to_trans : pandas.DataFrame
        Frame containing an ``income`` column. The last column is treated as
        the target (assumed to be ``income`` - TODO confirm for other inputs).

    Returns
    -------
    X : pandas.DataFrame
        All columns except the last one, with categorical columns replaced by
        float label codes (NaN where the value was missing).
    Y : pandas.Series
        The last column of the transformed frame.
    encoder_dict : dict
        Maps each encoded column name to its fitted ``LabelEncoder``. Encoders
        are fitted on the observed (non-missing) values only.
    to_compare : pandas.DataFrame
        Copy of the frame after the income mapping but before label encoding.
    """
    data = data_to_trans.copy()

    # A pass-through callable gives the same result as ``replace`` with a dict,
    # but without the silent-downcasting FutureWarning and without changing
    # global pandas options from inside a helper.
    income_codes = {"<=50K": 0, ">50K": 1}
    data['income'] = data['income'].map(lambda label: income_codes.get(label, label))
    data = data.infer_objects()

    to_compare = data.copy()

    encoder_dict = {}
    for column in data.columns:
        dtype = data[column].dtype
        is_categorical = (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_categorical:
            continue

        # Encode only the observed values. Missing cells stay NaN, so this works
        # for any categorical column and for columns without missing values.
        observed = data[column].notna().to_numpy()
        encoder = LabelEncoder()
        codes = pd.Series(float("nan"), index=data.index, dtype="float64")
        codes.iloc[observed] = encoder.fit_transform(data[column].to_numpy()[observed])

        data[column] = codes
        encoder_dict[column] = encoder

    X = data.iloc[:, :-1]
    Y = data.iloc[:, -1]

    return X, Y, encoder_dict, to_compare

In [4]:
def KNN_imputation(data_to_impute):
    """Fill missing feature values with a 5-nearest-neighbour imputer.

    The frame is first converted to numeric form by
    ``data_transformation_for_KNN``. The feature matrix is then imputed with
    ``KNNImputer`` (distance-weighted, ``nan_euclidean`` metric), and the
    label-encoded columns are decoded back to their original categories.

    Parameters
    ----------
    data_to_impute : pandas.DataFrame
        Frame accepted by ``data_transformation_for_KNN``. It is not modified
        (the transformation helper works on its own copy).

    Returns
    -------
    X_transformed : pandas.DataFrame
        Imputed feature columns (target column excluded), carrying the same
        index as the input features.
    to_compare : pandas.DataFrame
        The pre-imputation reference frame returned by the transformation helper.
    """
    KNN_X, _, encoders, to_compare = data_transformation_for_KNN(data_to_impute)

    imputer = KNNImputer(n_neighbors=5, weights="distance", metric="nan_euclidean")
    X_transformed = pd.DataFrame(imputer.fit_transform(KNN_X),
                                 columns=KNN_X.columns,
                                 index=KNN_X.index)

    for column, encoder in encoders.items():
        if column not in X_transformed.columns:
            # An encoder for a non-feature column (e.g. the target) has nothing to decode here.
            continue
        # Imputed codes are weighted averages of neighbour codes and can be
        # fractional or off by floating-point error (2.9999999999999996).
        # Round to the nearest code instead of truncating toward zero.
        # NOTE(review): averaging label codes treats nominal categories as ordinal.
        codes = X_transformed[column].round().astype(int)
        X_transformed[column] = encoder.inverse_transform(codes)

    return X_transformed, to_compare
    

### Train imputation

In [5]:
# Impute the missing feature values of the training split.
dencoded, org = KNN_imputation(df_train)

# KNN_imputation returns features only; re-attach the (0/1-mapped) target
# from the reference frame before saving.
dencoded['income'] = org['income']
# `index` is a boolean flag: pass False explicitly instead of relying on None being falsy.
dencoded.to_csv(Path("adults_data","adults_imputed_train_data.csv"), index=False)

  data['income'] = data['income'].replace({"<=50K":0,


### Test imputation (imputed train rows are used as additional neighbours)

In [6]:
# Stack the already-imputed train rows on top of the test rows so the imputer
# can use them as neighbours when filling the gaps in the test split.
n_train_rows = len(dencoded)
total_df = pd.concat([dencoded, df_test], ignore_index=True)
dencoded_total, org_total = KNN_imputation(total_df)
dencoded_total['income'] = org_total['income']

# Keep only the test rows. Writing the whole stacked frame would put every
# training row into the file that is supposed to hold the test split.
dencoded_test = dencoded_total.iloc[n_train_rows:].reset_index(drop=True)
org_test = org_total.iloc[n_train_rows:].reset_index(drop=True)

dencoded_test.to_csv(Path("adults_data","adults_imputed_test_data.csv"), index=False)

In [7]:
# Per-column count of cells that differ between the imputed frame and its
# pre-imputation reference. NaN never compares equal, so every cell that was
# missing before imputation is counted as a difference.
changed_cells = dencoded_test != org_test
changed_cells.sum()

age                 0
workclass           0
education-num       0
marital-status      0
occupation          1
relationship        0
race                0
sex                 0
capital             0
hours-per-week      0
region            192
income              0
dtype: int64

In [8]:
# Per-column count of missing values in the raw test split, for comparison
# with the number of cells changed by imputation.
df_test.isna().sum()

age                 0
workclass           0
education-num       0
marital-status      0
occupation          1
relationship        0
race                0
sex                 0
capital             0
hours-per-week      0
region            192
income              0
dtype: int64