In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from IPython.display import clear_output

In [2]:
# Load the raw train/test CSVs and tag each row with its origin, so the two
# frames can be stacked for joint preprocessing and separated again later.
train = pd.read_csv('.././data/train.csv')
test = pd.read_csv('.././data/test.csv')

train['is_train'] = np.ones(len(train))
test['is_train'] = np.zeros(len(test))

In [3]:
# Report the dimensions of both frames (the is_train flag is already included).
print("Train shape: {}".format(train.shape),
      "Test shape: {}".format(test.shape),
      sep="\n")

Train shape: (10714, 29)
Test shape: (14498, 28)


In [4]:
# Preview the first rows of the training frame, including the is_train flag added above.
train.head()

Unnamed: 0,people_ID,Region,Gender,Designation,Name,Married,Children,Occupation,Mode_transport,cases/1M,...,d-dimer,Heart rate,HDL cholesterol,Charlson Index,Blood Glucose,Insurance,salary,FT/month,Infect_Prob,is_train
0,1,Bhubaneshwar,Female,Mrs,mansi,YES,1.0,Farmer,Public,2,...,233.0,82.0,58.0,27,7,3600000.0,1300000,2.0,49.13501,1.0
1,2,Bhubaneshwar,Female,Mrs,riya masi,YES,2.0,Farmer,Walk,2,...,328.0,89.0,68.0,5,6,1600000.0,400000,1.0,51.14788,1.0
2,3,Bhubaneshwar,Female,Mrs,sunita,NO,1.0,Cleaner,Public,2,...,213.0,77.0,43.0,40,6,3400000.0,900000,1.0,73.224,1.0
3,4,Bhubaneshwar,Female,Mrs,anjali @ babli,YES,1.0,Driver,Car,2,...,275.0,64.0,60.0,27,7,700000.0,2300000,1.0,48.779225,1.0
4,5,Bhubaneshwar,Female,Mrs,champa karketta,NO,2.0,Manufacturing,Car,2,...,331.0,71.0,64.0,32,7,3200000.0,1100000,1.0,87.8688,1.0


In [5]:
def fill_categorical_missing(df, cols):
    """Replace missing values in text-typed columns with the label 'Unknown'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    cols : iterable of str
        Candidate column names. Only columns whose dtype is object/string are
        filled; columns of any other dtype are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    for col in cols:
        dtype = df[col].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            # Assign the result back instead of calling fillna(inplace=True) on
            # df[col]: that form mutates a temporary Series, which emits a
            # warning on recent pandas and does nothing under copy-on-write.
            df[col] = df[col].fillna('Unknown')
    return df

# Columns treated as categorical throughout the notebook.
cat_features = [
    'Region', 'Gender', 'Designation', 'Married', 'Occupation',
    'Mode_transport', 'comorbidity', 'Pulmonary score', 'cardiological pressure',
]
# Fill missing categorical values in both frames, train first and then test.
train, test = [fill_categorical_missing(frame, cat_features) for frame in (train, test)]

In [6]:
def modify_cat_features(df_train, df_test, cols):
    """Label-encode categorical columns consistently across train and test.

    For every column a fresh ``LabelEncoder`` is fitted on the combined train
    and test values, so a given category receives the same integer code in
    both frames. Missing values are mapped to 'Unknown' before fitting *and*
    before transforming, so the function does not depend on the caller having
    filled them beforehand.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to encode. Both are modified in place and also returned.
    cols : iterable of str
        Names of the columns to encode; each must exist in both frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` — the same objects that were passed in.
    """
    for c in cols:
        train_values = df_train[c].fillna('Unknown')
        test_values = df_test[c].fillna('Unknown')
        encoder = LabelEncoder()
        # Only this column is needed to learn the vocabulary; there is no need
        # to concatenate (and column-sort) the two full frames.
        encoder.fit(pd.concat([train_values, test_values], axis=0))
        df_train[c] = encoder.transform(train_values)
        df_test[c] = encoder.transform(test_values)
    return df_train, df_test

# Label-encode every categorical column using codes shared by train and test.
train, test = modify_cat_features(train, test, cat_features)

In [7]:
# Preprocessing before iterative imputation
# Columns removed before building the imputation matrix: the row identifier,
# the free-text name, and Infect_Prob (presumably the prediction target, since
# train has one more column than test — confirm).
drop_cols = ['people_ID', 'Name', 'Infect_Prob']

def get_encoded_df(df, enc_cols):
    """One-hot encode ``enc_cols`` and return the widened frame.

    Each listed column is removed and replaced by indicator columns named
    ``<column>_<value>``, appended on the right in the order of ``enc_cols``.
    Prefixing with the source column name keeps the labels unique when several
    columns share the same category codes (e.g. integers produced by a label
    encoder), and avoids mixing integer and string column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    enc_cols : iterable of str
        Names of the columns to expand.

    Returns
    -------
    pandas.DataFrame
        New frame with the untouched columns first, followed by the indicator
        columns of each encoded column.
    """
    enc_cols = list(enc_cols)
    indicator_blocks = [
        pd.get_dummies(df[col], prefix=col, prefix_sep='_') for col in enc_cols
    ]
    # Concatenate once instead of growing the frame inside a loop.
    return pd.concat([df.drop(enc_cols, axis=1)] + indicator_blocks, axis=1)

# Stack train on top of test so both receive the same set of indicator columns,
# then one-hot encode the categorical features on the combined frame.
concat_df = pd.concat([train, test], sort=True, axis=0)
concat_onehot = get_encoded_df(concat_df.drop(columns=drop_cols), cat_features)

# Split the combined matrix back apart using the is_train flag.
train_onehot = concat_onehot.loc[concat_onehot['is_train'] == 1]
test_onehot = concat_onehot.loc[concat_onehot['is_train'] == 0]

In [8]:
# Iterative imputation
# The imputer is fitted on the training matrix only and then applied to the
# test matrix, so test rows do not influence the fitted imputation models.
# random_state is fixed so repeated runs use the same seed.
imputer = IterativeImputer(initial_strategy='median', verbose=1, random_state=42)
train_imputed = imputer.fit_transform(train_onehot)
test_imputed = imputer.transform(test_onehot)

[IterativeImputer] Completing matrix with shape (10714, 61)
[IterativeImputer] Change: 49836.168135083986, scaled tolerance: 5000.0 
[IterativeImputer] Change: 1.767730701696006, scaled tolerance: 5000.0 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (14498, 61)


In [9]:
# Wrap the imputer's array output in DataFrames, reusing the column labels of
# the matrices that were fed to the imputer.
train_imputed = pd.DataFrame(data=train_imputed, columns=list(train_onehot.columns))
test_imputed = pd.DataFrame(data=test_imputed, columns=list(test_onehot.columns))

In [12]:
# Overwrite these columns in the original frames with their imputed
# counterparts. Values are copied positionally (.values), so row order rather
# than index labels determines the alignment.
imp_cols = ['Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate',
            'HDL cholesterol', 'Insurance', 'FT/month']
for frame, imputed in ((train, train_imputed), (test, test_imputed)):
    for col in imp_cols:
        frame[col] = imputed[col].values

In [17]:
# Save the processed original frames without the is_train helper flag.
for frame, path in ((train, '.././mod_data/train.csv'), (test, '.././mod_data/test.csv')):
    frame.drop(columns='is_train').to_csv(path, index=False)

# Attach the row identifier to the fully imputed matrices before saving them.
train_imputed['people_ID'] = train['people_ID'].values
test_imputed['people_ID'] = test['people_ID'].values

for frame, path in ((train_imputed, '.././mod_data/train_imputed.csv'),
                    (test_imputed, '.././mod_data/test_imputed.csv')):
    frame.to_csv(path, index=False)