## Missing Value Imputation

The missing value imputation module features a number of different methods. 

For time series, `Interpolation` can work fairly well.

For other numeric data, `KNNImputer` from sklearn can work well.

`SimpleImputer` will fill missing categorical values with the most frequent label, while `LogisticRegression` will attempt to model missing categorical values based on the complete rows in the dataset.



In [None]:
from dqp.data_loaders import load_fv, load_egm, load_santander_statuses
from dqp import MissingImputationModule
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import warnings
from sklearn.metrics import r2_score
warnings.filterwarnings("ignore")

In [None]:
# Query the module for the imputation methods it exposes. The bare
# expression on the last line makes the notebook render the returned value.
methods=MissingImputationModule.list_available_methods()
methods

## Numeric methods on FV dataset (has missing values)

In [None]:
# load_fv yields three values; only the first is used here, the other two
# are discarded into the throwaway names `_` and `__`.
data_ecocounter, _, __ = load_fv("./datasets/fv/")
# Display the data held in the loader object's `_df` attribute.
data_ecocounter._df


In [None]:

# Select the KNNImputer method and run it over the FV data loaded above.
config = dict(imputation_method='KNNImputer')
module = MissingImputationModule(**config)
result = module.process(data_ecocounter)

# Last expression of the cell: the notebook renders the processed data.
result._df

## EGM - no missing values so we create some

In [None]:
# Load the EGM dataset and keep an untouched copy to score imputations against.
data = load_egm("./datasets/egm/")
print(data._numeric_columns)
real_values = data._df.copy()

# Blank out roughly 10% of the rows in each column, drawn independently per
# column.
for col in ['illuminance', 'precipitation', 'irradiance', 'windspeedgust',
            'windspeedavg', 'humidity', 'temperature']:
    idx = np.random.random(len(data._df)) < 0.1
    # Assign through a single .loc[rows, column] call. The chained form
    # data._df[col][idx] = ... may write to a temporary object, and under
    # pandas Copy-on-Write it never reaches the frame; the warning about it
    # is hidden here because warnings are globally suppressed.
    data._df.loc[idx, col] = np.nan

# Snapshot of the masked frame so the evaluation step can find which cells
# were removed.
null_values = data._df.copy()
data._df

In [None]:
# Configuration: fill the gaps using the module's Interpolation method.
config = dict(imputation_method='Interpolation')
module = MissingImputationModule(**config)
result = module.process(data)

# Per-column flag: True where any null value is still present after imputation.
print(result._df.isnull().any())

# Last expression of the cell: the notebook renders the processed data.
result._df


In [None]:
# Evaluate: for every column that had values removed, compare the original
# values with the imputed ones at exactly the removed positions.
for col in ['illuminance', 'precipitation', 'irradiance', 'windspeedgust',
            'windspeedavg', 'humidity', 'temperature']:
    # Rows that were blanked out in this column.
    null_idx = null_values[col].isnull()

    actual = real_values[col][null_idx]
    imputed = result._df[col][null_idx]
    score = r2_score(actual, imputed)

    print(col, 'r2_score', score)
 


### Try again with `KNNImputer`

In [None]:
# Reload the EGM dataset so the KNN run starts from unmodified data, and
# keep an untouched copy to score imputations against.
data = load_egm("./datasets/egm/")
print(data._numeric_columns)
real_values = data._df.copy()

# Blank out roughly 10% of the rows in each column, drawn independently per
# column.
for col in ['illuminance', 'precipitation', 'irradiance', 'windspeedgust',
            'windspeedavg', 'humidity', 'temperature']:
    idx = np.random.random(len(data._df)) < 0.1
    # Assign through a single .loc[rows, column] call. The chained form
    # data._df[col][idx] = ... may write to a temporary object, and under
    # pandas Copy-on-Write it never reaches the frame; the warning about it
    # is hidden here because warnings are globally suppressed.
    data._df.loc[idx, col] = np.nan

# Snapshot of the masked frame so the evaluation step can find which cells
# were removed.
null_values = data._df.copy()
data._df

In [None]:
# Configuration: fill the gaps using the KNNImputer method this time.
config = dict(imputation_method='KNNImputer')
module = MissingImputationModule(**config)
result = module.process(data)

# Last expression of the cell: the notebook renders the processed data.
result._df

In [None]:
# Evaluate: for every column that had values removed, compare the original
# values with the imputed ones at exactly the removed positions.
for col in ['illuminance', 'precipitation', 'irradiance', 'windspeedgust',
            'windspeedavg', 'humidity', 'temperature']:
    # Rows that were blanked out in this column.
    null_idx = null_values[col].isnull()

    actual = real_values[col][null_idx]
    imputed = result._df[col][null_idx]
    score = r2_score(actual, imputed)

    print(col, 'r2_score', score)
 


## Categorical Imputation - Santander

In [None]:
# Load the Santander statuses example from its JSON-LD file.
data=load_santander_statuses("./datasets/uc/dataset_SDR_example.jsonld")
# Disabled: would re-cast the 'battery' column to the dtype the frame
# already reports for it.
# data._df['battery']=data._df['battery'].astype(data._df.dtypes['battery'])

In [None]:
# Blank out roughly 10% of the values in the 'id' column so there is
# something to impute. The loop is kept so more columns can be added to
# the list.
for col in ['id']:
    idx = np.random.random(len(data._df)) < 0.1
    # Assign through a single .loc[rows, column] call. The chained form
    # data._df[col][idx] = ... may write to a temporary object, and under
    # pandas Copy-on-Write it never reaches the frame; the warning about it
    # is hidden here because warnings are globally suppressed.
    data._df.loc[idx, col] = np.nan

data._df

In [None]:

# Configuration: fill the gaps using the SimpleImputer method.
config = dict(imputation_method='SimpleImputer')
module = MissingImputationModule(**config)
result = module.process(data)

# Per-column flag: True where any null value is still present after imputation.
print(result._df.isnull().any())

# Last expression of the cell: the notebook renders the processed data.
result._df