# Data imputation
[TOC]

In [1]:
import os
import pandas as pd
import numpy as np
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler

from data import feature, get_severity, z_score_normalize, z_score_denormalize, fill_dataset

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Read data

In [2]:
train_data_path = './trainSet/trainSet.txt'
test_data_path = './testSet/testSet.txt'

In [3]:
train_dataset = pd.read_csv(train_data_path, sep=',')
train_dataset['severity'] = train_dataset['Prognosis'].map(get_severity)
test_dataset = pd.read_csv(test_data_path, sep=',')

y = np.array(train_dataset['severity'])

## Imputation
### Impute testing dataset
- Filled NaN data using [method] based on concatenation of trainSet.txt & testSet.txt.
- There are columns filled with **NaN** so we need **concatenation** of trainSet.txt & testSet.txt, otherwise nothing meaningful imputation can be obtained.

In [4]:
concat_feature = pd.concat([test_dataset[feature], train_dataset[feature]])
test_dataset_len = len(test_dataset)
train_dataset_len = len(train_dataset)

In [5]:
X_incomplete = concat_feature
X_incomplete = np.array(X_incomplete)
X_incomplete = z_score_normalize(X_incomplete)

X_filled_knn_3 = KNN(k=3).fit_transform(X_incomplete)
X_filled_knn_6 = KNN(k=6).fit_transform(X_incomplete)
X_filled_knn_9 = KNN(k=9).fit_transform(X_incomplete)

# Not suitable for large scale matrix
# X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)
X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized)

test_dataset_filled_knn_3 = fill_dataset(test_dataset, feature, z_score_denormalize(X_filled_knn_3[:test_dataset_len]), './csv/filled_testSet/filled_knn_3_testSet_binary.txt')
test_dataset_filled_knn_6 = fill_dataset(test_dataset, feature, z_score_denormalize(X_filled_knn_6[:test_dataset_len]), './csv/filled_testSet/filled_knn_6_testSet_binary.txt')
test_dataset_filled_knn_9 = fill_dataset(test_dataset, feature, z_score_denormalize(X_filled_knn_9[:test_dataset_len]), './csv/filled_testSet/filled_knn_9_testSet_binary.txt')
test_dataset_filled_softimpute = fill_dataset(test_dataset, feature, z_score_denormalize(X_filled_softimpute[:test_dataset_len]), './csv/filled_testSet/filled_softimpute_testSet_binary.txt')

Imputing row 1/983 with 4 missing, elapsed time: 0.173
Imputing row 101/983 with 9 missing, elapsed time: 0.183
Imputing row 201/983 with 4 missing, elapsed time: 0.189
Imputing row 301/983 with 6 missing, elapsed time: 0.194
Imputing row 401/983 with 3 missing, elapsed time: 0.199
Imputing row 501/983 with 1 missing, elapsed time: 0.204
Imputing row 601/983 with 3 missing, elapsed time: 0.210
Imputing row 701/983 with 4 missing, elapsed time: 0.214
Imputing row 801/983 with 3 missing, elapsed time: 0.220
Imputing row 901/983 with 1 missing, elapsed time: 0.225
Imputing row 1/983 with 4 missing, elapsed time: 0.168
Imputing row 101/983 with 9 missing, elapsed time: 0.178
Imputing row 201/983 with 4 missing, elapsed time: 0.183
Imputing row 301/983 with 6 missing, elapsed time: 0.187
Imputing row 401/983 with 3 missing, elapsed time: 0.193
Imputing row 501/983 with 1 missing, elapsed time: 0.197
Imputing row 601/983 with 3 missing, elapsed time: 0.203
Imputing row 701/983 with 4 missing

[SoftImpute] Iter 89: observed MAE=0.023579 rank=15
[SoftImpute] Iter 90: observed MAE=0.023580 rank=15
[SoftImpute] Iter 91: observed MAE=0.023580 rank=15
[SoftImpute] Iter 92: observed MAE=0.023580 rank=15
[SoftImpute] Iter 93: observed MAE=0.023580 rank=15
[SoftImpute] Iter 94: observed MAE=0.023581 rank=15
[SoftImpute] Iter 95: observed MAE=0.023581 rank=15
[SoftImpute] Iter 96: observed MAE=0.023581 rank=15
[SoftImpute] Iter 97: observed MAE=0.023581 rank=15
[SoftImpute] Iter 98: observed MAE=0.023582 rank=15
[SoftImpute] Iter 99: observed MAE=0.023582 rank=15
[SoftImpute] Iter 100: observed MAE=0.023582 rank=15
[SoftImpute] Stopped after iteration 100 for lambda=0.825994


### Impute training dataset

In [6]:
X_incomplete = train_dataset[feature]
X_incomplete = np.array(X_incomplete)
X_incomplete = z_score_normalize(X_incomplete)
X_filled_knn_3 = KNN(k=3).fit_transform(X_incomplete)
X_filled_knn_6 = KNN(k=6).fit_transform(X_incomplete)
X_filled_knn_9 = KNN(k=9).fit_transform(X_incomplete)

# Not suitable for large scale matrix
# X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)
X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized)

fill_dataset(train_dataset, feature, z_score_denormalize(X_filled_knn_3), './csv/filled_trainSet/filled_knn_3_trainSet_binary.txt')
fill_dataset(train_dataset, feature, z_score_denormalize(X_filled_knn_6), './csv/filled_trainSet/filled_knn_6_trainSet_binary.txt')
fill_dataset(train_dataset, feature, z_score_denormalize(X_filled_knn_9), './csv/filled_trainSet/filled_knn_9_trainSet_binary.txt')
fill_dataset(train_dataset, feature, z_score_denormalize(X_filled_softimpute), './csv/filled_trainSet/filled_softimpute_trainSet_binary.txt')

Imputing row 1/863 with 4 missing, elapsed time: 0.127
Imputing row 101/863 with 1 missing, elapsed time: 0.131
Imputing row 201/863 with 2 missing, elapsed time: 0.136
Imputing row 301/863 with 3 missing, elapsed time: 0.142
Imputing row 401/863 with 5 missing, elapsed time: 0.147
Imputing row 501/863 with 3 missing, elapsed time: 0.152
Imputing row 601/863 with 4 missing, elapsed time: 0.156
Imputing row 701/863 with 2 missing, elapsed time: 0.162
Imputing row 801/863 with 2 missing, elapsed time: 0.166
Imputing row 1/863 with 4 missing, elapsed time: 0.123
Imputing row 101/863 with 1 missing, elapsed time: 0.127
Imputing row 201/863 with 2 missing, elapsed time: 0.132
Imputing row 301/863 with 3 missing, elapsed time: 0.137
Imputing row 401/863 with 5 missing, elapsed time: 0.142
Imputing row 501/863 with 3 missing, elapsed time: 0.147
Imputing row 601/863 with 4 missing, elapsed time: 0.151
Imputing row 701/863 with 2 missing, elapsed time: 0.156
Imputing row 801/863 with 2 missing

Unnamed: 0,PatientID,ImageFile,Hospital,Age,Sex,Temp_C,Cough,DifficultyInBreathing,WBC,CRP,...,LDH,Ddimer,Ox_percentage,PaO2,SaO2,pH,CardiovascularDisease,RespiratoryFailure,Prognosis,severity
0,P_131,P_131.png,D,33.195797,0,39.540229,1,0,6.093142,69.719726,...,496.069339,449.448694,93.985011,69.231114,93.476472,7.454321,0,0,MILD,0
1,P_132,P_132.png,D,56.532254,0,36.868540,0,0,15.473850,112.445336,...,377.282076,829.206335,93.716961,80.397610,100.903407,7.387528,0,0,MILD,0
2,P_195,P_195.png,D,78.127837,0,37.609290,1,0,5.578155,166.274022,...,317.456699,-3524.076584,91.636093,53.125551,92.741772,7.333317,1,0,SEVERE,1
3,P_193,P_193.png,D,79.495954,0,37.647242,1,0,6.152517,221.024658,...,487.694021,2065.672709,94.060177,52.317993,93.319404,7.423369,0,0,SEVERE,1
4,P_140,P_140.png,D,60.099710,1,36.706426,1,0,6.588651,10.085499,...,293.305424,-1711.130157,91.842556,81.080565,103.162470,7.444999,0,0,MILD,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,P_1_12,P_1_12.png,F,50.906657,0,37.631982,0,1,15.758303,11.591722,...,364.576616,10088.772403,92.775037,74.781216,91.190706,7.452693,0,0,SEVERE,1
859,P_1_8,P_1_8.png,F,56.559496,0,37.685975,1,0,4.728313,-4.753165,...,673.438149,2403.719977,92.427919,69.328531,92.344096,7.451632,0,0,SEVERE,1
860,P_1_10,P_1_10.png,F,38.587343,0,37.559318,0,1,9.784796,14.395177,...,413.931871,8779.018952,93.537639,74.145853,91.578637,7.449117,0,0,MILD,0
861,P_1_26,P_1_26.png,F,86.273524,1,37.796539,0,1,13.986624,-13.617817,...,345.121328,2402.728337,91.524438,73.120925,91.285873,7.325461,0,0,SEVERE,1
