# Why should you care?

1. It **increases** baseline score of LGBM

I was surprised too. LGBM and other boosting methods do handle missing values efficiently (LGBM ignores them while searching for a split and then sends them to whichever side of the split reduces the loss the most). In my case it was KNN with an optimised k that beat the baseline.

2. To tackle class imbalance, apply neural networks, etc., one has to take care of missing values properly, hence we do need to impute efficiently.


Problems: computational time. Our data set is huge (500k rows), and for most of the methods below executing the script on the server is not feasible. Run them locally, and you will see a difference in CV between the baseline and some of the methods.

Things I tried:
1. deletion
2. mean 
3. MICE
4. knn (different optimal k)
5. Softimpute
6. Expectation maximization
7. moving window
8. Spline, barycentric etc imputation of df columns.




In [None]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
warnings.simplefilter('ignore')
sns.set()
%matplotlib inline
from fancyimpute import KNN
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer

In [None]:
# Competition CSVs, listed in the exact order in which the loader unpacks them:
# test identity, test transactions, train identity, train transactions, sample submission.
files = [
    '../input/test_identity.csv',
    '../input/test_transaction.csv',
    '../input/train_identity.csv',
    '../input/train_transaction.csv',
    '../input/sample_submission.csv',
]

In [None]:
%%time
def load_data(file):
    """Read a single CSV file and return it as a DataFrame."""
    frame = pd.read_csv(file)
    return frame


# Read the CSVs in parallel worker processes. `map` returns the results in the
# same order as `files`, which is what the unpacking below relies on.
with multiprocessing.Pool() as workers:
    loaded = workers.map(load_data, files)
test_id, test_tr, train_id, train_tr, sub = loaded

In [None]:
# Left-join the identity table onto the transaction table by TransactionID,
# so every transaction row is kept even when it has no identity record.
train = train_tr.merge(train_id, on='TransactionID', how='left')
test = test_tr.merge(test_id, on='TransactionID', how='left')

# The four source frames are no longer needed: drop the names and reclaim memory.
del train_tr, train_id, test_tr, test_id
gc.collect()

In [None]:
# Whitelist of raw columns kept for modelling. Every train column that is not
# listed here is dropped further down, except isFraud, TransactionID and
# TransactionDT, which are explicitly preserved.
# Note: 'id_34' and 'id_35' are NOT in this list, so the count-encoding loops
# that check `feature in useful_features` skip them.
useful_features = ['TransactionAmt', 'ProductCD',"card1", 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
                   'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                   'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3',
                   'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V17',
                   'V19', 'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 'V45', 'V46', 'V47', 'V48',
                   'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V71',
                   'V72', 'V73', 'V74', 'V75', 'V76', 'V78', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V91', 'V92',
                   'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V138', 'V139', 'V140',
                   'V143', 'V145', 'V146', 'V147', 'V149', 'V150', 'V151', 'V152', 'V154', 'V156', 'V158', 'V159', 'V160', 'V161',
                   'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V172', 'V173', 'V175', 'V176', 'V177',
                   'V178', 'V180', 'V182', 'V184', 'V187', 'V188', 'V189', 'V195', 'V197', 'V200', 'V201', 'V202', 'V203', 'V204',
                   'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V219', 'V220',
                   'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V231', 'V233', 'V234', 'V238', 'V239',
                   'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V249', 'V251', 'V253', 'V256', 'V257', 'V258', 'V259', 'V261',
                   'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276',
                   'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V292', 'V294', 'V303',
                   'V304', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V322', 'V323', 'V324', 'V326',
                   'V329', 'V331', 'V332', 'V333', 'V335', 'V336', 'V338', 'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                   'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                   'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

In [None]:
# Everything outside the whitelist is a drop candidate ...
kept_features = set(useful_features)
cols_to_drop = [name for name in train.columns if name not in kept_features]

# ... except the target, the row id and the timestamp, which are still needed
# later. `remove` raises ValueError if one of them is unexpectedly absent.
for required in ('isFraud', 'TransactionID', 'TransactionDT'):
    cols_to_drop.remove(required)

In [None]:
# Report how many columns are about to be removed, then remove them from both frames.
n_dropped = len(cols_to_drop)
print('{} features are going to be dropped for being useless'.format(n_dropped))

train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [None]:
def _amount_decimals(amount):
    """Return the fractional part of `amount`, scaled by 1000 and truncated to int."""
    whole_part = amount.astype(int)
    return ((amount - whole_part) * 1000).astype(int)


# New feature - decimal part of the transaction amount
train['TransactionAmt_decimal'] = _amount_decimals(train['TransactionAmt'])
test['TransactionAmt_decimal'] = _amount_decimals(test['TransactionAmt'])

# Count encoding for card1 feature.
# Explained in this kernel: https://www.kaggle.com/nroman/eda-for-cis-fraud-detection
# The frequency table is computed over train and test together. It is built
# once and reused for both frames instead of being recomputed for each of them.
card1_counts = pd.concat([train['card1'], test['card1']], ignore_index=True).value_counts(dropna=False)
train['card1_count_full'] = train['card1'].map(card1_counts)
test['card1_count_full'] = test['card1'].map(card1_counts)

# https://www.kaggle.com/fchmiel/day-and-time-powerful-predictive-feature
# Both helpers treat their input as a count of elapsed seconds.
def _day_of_week(seconds):
    """Map elapsed seconds to a day index in 0..6 (origin shifted by one day)."""
    return np.floor((seconds / (3600 * 24) - 1) % 7)


def _hour_of_day(seconds):
    """Map elapsed seconds to an hour index in 0..23."""
    return np.floor(seconds / 3600) % 24


train['Transaction_day_of_week'] = _day_of_week(train['TransactionDT'])
test['Transaction_day_of_week'] = _day_of_week(test['TransactionDT'])
train['Transaction_hour'] = _hour_of_day(train['TransactionDT'])
test['Transaction_hour'] = _hour_of_day(test['TransactionDT'])

# Some arbitrary features interaction: each name is "<left>__<right>"; the two
# source columns are joined as strings and the result is label-encoded with a
# single encoder fitted on train and test together.
interaction_names = [
    'id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2',
    'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1',
]
for feature in interaction_names:
    left, right = feature.split('__')
    train[feature] = train[left].astype(str) + '_' + train[right].astype(str)
    test[feature] = test[left].astype(str) + '_' + test[right].astype(str)

    # The combined columns already hold plain strings, so they can be fed to
    # the encoder directly.
    encoder = LabelEncoder()
    encoder.fit(train[feature].tolist() + test[feature].tolist())
    train[feature] = encoder.transform(train[feature].tolist())
    test[feature] = encoder.transform(test[feature].tolist())
    
for feature in ['id_34', 'id_36']:
    if feature in useful_features:
        # Count encoded for both train and test: the frequency table covers the
        # two frames together and is built once per feature, then reused.
        combined_counts = pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False)
        train[feature + '_count_full'] = train[feature].map(combined_counts)
        test[feature + '_count_full'] = test[feature].map(combined_counts)
        
for feature in ['id_01', 'id_31', 'id_33', 'id_35', 'id_36']:
    if feature not in useful_features:
        continue
    # Count encoded separately for train and test: each frame is mapped through
    # its own frequency table (missing values are counted as a category too).
    encoded_name = feature + '_count_dist'
    train[encoded_name] = train[feature].map(train[feature].value_counts(dropna=False))
    test[encoded_name] = test[feature].map(test[feature].value_counts(dropna=False))

In [None]:
# Label-encode every remaining object-typed column, fitting one encoder per
# column on the string values of train and test together.
for col in tqdm_notebook(train.columns):
    if train[col].dtype != 'object':
        continue
    train_values = train[col].astype(str).tolist()
    test_values = test[col].astype(str).tolist()
    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

In [None]:
# Order the rows by TransactionDT, then split off the target and drop the
# columns that are not model inputs.
train_sorted = train.sort_values('TransactionDT')
y = train_sorted['isFraud']
X = train_sorted.drop(columns=['isFraud', 'TransactionDT', 'TransactionID'])
del train_sorted

test = test.sort_values('TransactionDT').drop(columns=['TransactionDT', 'TransactionID'])

In [None]:
# params = {'num_leaves': 491,
#           'min_child_weight': 0.03454472573214212,
#           'feature_fraction': 0.3797454081646243,
#           'bagging_fraction': 0.4181193142567742,
#           'min_data_in_leaf': 106,
#           'objective': 'binary',
#           'max_depth': -1,
#           'learning_rate': 0.006883242363721497,
#           "boosting_type": "gbdt",
#           "bagging_seed": 11,
#           "metric": 'auc',
#           "verbosity": -1,
#           'reg_alpha': 0.3899927210061127,
#           'reg_lambda': 0.6485237330340494,
#           'random_state': 47
#          }

**1. Baseline**

In [None]:
# folds = TimeSeriesSplit(n_splits=10)

# aucs = list()
# feature_importances = pd.DataFrame()
# feature_importances['feature'] = X.columns

# training_start_time = time()
# for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
#     start_time = time()
#     print('Training on fold {}'.format(fold + 1))
    
#     trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
#     val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx])
#     clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
    
#     feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
#     aucs.append(clf.best_score['valid_1']['auc'])
    
#     print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
# print('-' * 30)
# print('Training has finished.')
# print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
# print('Mean AUC:', np.mean(aucs))
# print('-' * 30)
# baseline_err=np.mean(aucs)

**2. Deletion**

In [None]:
# Xdel = X.dropna()
# Ydel = y.loc[Xdel.index]  # y itself has no NaNs, so y.dropna() would keep every row and no longer line up with Xdel

In [None]:
# folds = TimeSeriesSplit(n_splits=10)

# aucs = list()
# feature_importances = pd.DataFrame()
# feature_importances['feature'] = X.columns

# training_start_time = time()
# for fold, (trn_idx, test_idx) in enumerate(folds.split(Xdel, Ydel)):
#     start_time = time()
#     print('Training on fold {}'.format(fold + 1))
    
#     trn_data = lgb.Dataset(Xdel.iloc[trn_idx], label=Ydel.iloc[trn_idx])
#     val_data = lgb.Dataset(Xdel.iloc[test_idx], label=Ydel.iloc[test_idx])
#     clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
    
#     feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
#     aucs.append(clf.best_score['valid_1']['auc'])
    
#     print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
# print('-' * 30)
# print('Training has finished.')
# print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
# print('Mean AUC:', np.mean(aucs))
# print('-' * 30)
# deletion_err=np.mean(aucs)

**3. Mean Substitution**

In [None]:
# imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# imp.fit(X)

# Xmean = imp.transform(X)
# Ymean = y


In [None]:
# folds = TimeSeriesSplit(n_splits=10)

# aucs = list()
# feature_importances = pd.DataFrame()
# feature_importances['feature'] = X.columns

# training_start_time = time()
# for fold, (trn_idx, test_idx) in enumerate(folds.split(Xmean, Ymean)):
#     start_time = time()
#     print('Training on fold {}'.format(fold + 1))
    
#     trn_data = lgb.Dataset(Xmean[trn_idx], label=Ymean.iloc[trn_idx])  # Xmean is a NumPy array, so index it positionally
#     val_data = lgb.Dataset(Xmean[test_idx], label=Ymean.iloc[test_idx])
#     clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
    
#     feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
#     aucs.append(clf.best_score['valid_1']['auc'])
    
#     print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
# print('-' * 30)
# print('Training has finished.')
# print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
# print('Mean AUC:', np.mean(aucs))
# print('-' * 30)
# mean_err=np.mean(aucs)

**4. MICE**

In [None]:
# !pip install impyute

In [None]:

#  from impyute.imputation import *

In [None]:
# X_mice=impyute.imputation.cs.mice(X)

**Alternative from fancy impute or sklearn *IterativeImputer*!**

In [None]:
# from fancyimpute import MICE

In [None]:
# Xmice = MICE(n_imputations=200, impute_type='col', verbose=False).complete(X.as_matrix())
# Ymice = y

In [None]:
# folds = TimeSeriesSplit(n_splits=10)

# aucs = list()
# feature_importances = pd.DataFrame()
# feature_importances['feature'] = X.columns

# training_start_time = time()
# for fold, (trn_idx, test_idx) in enumerate(folds.split(Xmice, Ymice)):
#     start_time = time()
#     print('Training on fold {}'.format(fold + 1))
    
#     trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
#     val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx])
#     clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
    
#     feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
#     aucs.append(clf.best_score['valid_1']['auc'])
    
#     print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
# print('-' * 30)
# print('Training has finished.')
# print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
# print('Mean AUC:', np.mean(aucs))
# print('-' * 30)
# mice_err=np.mean(aucs)

**6. KNN - first find the optimal k, then substitute**

In [None]:
# from fancyimpute import KNN


# def standardize(s):
#     return s.sub(s.min()).div((s.max() - s.min()))

# Xnorm = X.apply(standardize, axis=0)
# kvals = np.linspace(1, 100, 20, dtype='int64')

# knn_errs = []
# for k in kvals:
#     knn_err = []
#     Xknn = KNN(k=k, verbose=False).complete(Xnorm)
#     knn_err = cross_val_score(RandomForestClassifier(n_estimators=1000,
#                            max_depth=None,
#                            min_samples_split=10), Xknn, Y, cv=10, n_jobs=-1).mean()

#     knn_errs.append(knn_err)
#     print("[KNN] Estimated RF Test Error (n = {}, k = {}, 10-fold CV): {}".format(len(Xknn), k, np.mean(knn_err)))

In [None]:
# sns.set_style("darkgrid")
# _ = plt.plot(kvals, knn_errs)
# _ = plt.xlabel('K')
# _ = plt.ylabel('10-fold CV Error Rate')

# knn_err = max(knn_errs)
# k_opt = kvals[knn_errs.index(knn_err)]

# Xknn = KNN(k=k_opt, verbose=False).complete(Xnorm)
# Yknn = y

# print("[BEST KNN] Estimated RF Test Error (n = {}, k = {}, 10-fold CV): {}".format(len(Xknn), k_opt, np.mean(knn_err)))

check the score:

In [None]:
# folds = TimeSeriesSplit(n_splits=10)

# aucs = list()
# feature_importances = pd.DataFrame()
# feature_importances['feature'] = X.columns

# training_start_time = time()
# for fold, (trn_idx, test_idx) in enumerate(folds.split(Xknn, Yknn)):
#     start_time = time()
#     print('Training on fold {}'.format(fold + 1))
    
#     trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
#     val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx])
#     clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
    
#     feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
#     aucs.append(clf.best_score['valid_1']['auc'])
    
#     print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
# print('-' * 30)
# print('Training has finished.')
# print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
# print('Mean AUC:', np.mean(aucs))
# print('-' * 30)
# knn_err=np.mean(aucs)

**7. Expectation maximization**

In [None]:
# X_em=impyute.imputation.cs.em(X, loops=50)

In [None]:
# folds = TimeSeriesSplit(n_splits=10)

# aucs = list()
# feature_importances = pd.DataFrame()
# feature_importances['feature'] = X.columns

# training_start_time = time()
# for fold, (trn_idx, test_idx) in enumerate(folds.split(X_em, y)):
#     start_time = time()
#     print('Training on fold {}'.format(fold + 1))
    
#     trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
#     val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx])
#     clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
    
#     feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
#     aucs.append(clf.best_score['valid_1']['auc'])
    
#     print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
# print('-' * 30)
# print('Training has finished.')
# print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
# print('Mean AUC:', np.mean(aucs))
# print('-' * 30)
# em_err=np.mean(aucs)

**8. Moving window imputation**

In [None]:
# impyute.imputation.ts.moving_window(X, nindex=None, wsize=50000, errors='coerce', inplace=False)


**9. [DataFrame impute with different arguments](https://www.geeksforgeeks.org/python-pandas-dataframe-interpolate/): things that I saw work**

In [None]:

# X_barycentric=X.interpolate(method="barycentric", axis=0, inplace=False)

In [None]:
# X_linear=X.interpolate(method='linear', axis=0, limit=None, inplace=False, limit_direction="forward")


In [None]:
# X_interp_spline=X.interpolate(method='spline', axis=0, limit=None, inplace=False, limit_direction="forward")


Change the arguments yourself and test quickly; just rely on your CV scheme ;)

**10. SoftImpute** needs normalised data, just like KNN

In [None]:

# SoftImpute is not among the notebook's imports (only KNN is pulled from
# fancyimpute), so import it here to keep this cell runnable on its own.
from fancyimpute import SoftImpute

# Xnorm is otherwise only defined in a commented-out cell, so rebuild it here:
# min-max scale every column of X to [0, 1]. NaNs are skipped by min()/max()
# and stay NaN, which is what the imputer is supposed to fill in.
col_min = X.min()
col_range = (X.max() - col_min).replace(0, 1)  # constant columns: avoid dividing by zero
Xnorm = (X - col_min) / col_range

X_soft = SoftImpute().fit_transform(Xnorm)

NOTE

# NOTE:
I do not use this cv scheme locally!