In [25]:
import os
import gc
import numpy as np
import pandas as pd
import joblib
from datetime import datetime

from lightgbm import LGBMClassifier
import optuna
from prunedcv import PrunedCV

from codes.utils import import_data, cross_val_score_auc, reduce_mem_usage, fix_dtypes
from codes.fe_browser import latest
from codes.fe_emails import proton, mappings
from codes.fe_cards import stats
from codes.fe_date import dates
from codes.fe_relatives import divisions, divisions_float
from codes.fe_categorical import pairs, wtf, cat_limit, encode_cat
from codes.prepro import prepro
from codes.fe_users import users_stats

from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin

In [30]:
# Relative path (trailing slash included) of the directory holding the input data.
DATA_PATH = '../input/'

In [None]:
# Build the engineered feature matrices, or reload them from the joblib cache.
#
# Produces three notebook-level names: X_train, X_test (features) and y_train
# (the 'isFraud' column split off X_train).
#
# The cache is used only when *all three* pickle files exist; otherwise the
# features are rebuilt and the three files are (re)written.
if all(os.path.isfile(cache_file)
       for cache_file in ('features_train.pkl', 'features_test.pkl', 'y_train.pkl')):
    # NOTE(security): joblib.load unpickles the file, so only load caches that
    # were written by this notebook.
    X_train = joblib.load('features_train.pkl')
    X_test = joblib.load('features_test.pkl')
    y_train = joblib.load('y_train.pkl')

else:

    # The raw frames must be loaded before any feature step runs; without this
    # the branch fails with NameError on a fresh kernel.
    # assumes import_data(path) returns the (train, test) frames -- TODO confirm
    print('import_data')
    X_train, X_test = import_data(DATA_PATH)

    # Every step below is a helper from the project's `codes` package that takes
    # the (train, test) pair and returns the updated pair. The order is kept
    # exactly as written because later steps consume the output of earlier ones.
    print('fix_dtypes')
    X_train, X_test = fix_dtypes(X_train, X_test)
    print('users_stats')
    X_train, X_test = users_stats(X_train, X_test)
    print('latest')
    X_train, X_test = latest(X_train, X_test)
    print('proton')
    X_train, X_test = proton(X_train, X_test)
    print('nulls1')
    # Per-row count of missing values over the columns present at this point.
    X_train['nulls1'] = X_train.isna().sum(axis=1)
    X_test['nulls1'] = X_test.isna().sum(axis=1)
    print('mappings')
    X_train, X_test = mappings(X_train, X_test)
    print('stats')
    X_train, X_test = stats(X_train, X_test)
    print('divisions')
    X_train, X_test = divisions(X_train, X_test)
    print('dates')
    X_train, X_test = dates(X_train, X_test)
    print('pairs')
    X_train, X_test = pairs(X_train, X_test)
    print('encode_cat')
    X_train, X_test = encode_cat(X_train, X_test)
    print('wtf')
    X_train, X_test = wtf(X_train, X_test)
    print('y')
    # Split the target off; from here on X_train holds features only.
    y_train = X_train['isFraud'].copy()
    X_train = X_train.drop('isFraud', axis=1)
    print('divisions_float')
    X_train, X_test = divisions_float(X_train, X_test)
    print('prepro')
    X_train, X_test = prepro(X_train, X_test)
    print('reduce_mem_usage')
    # Memory reduction is currently disabled; only the progress label is printed.
    # X_train = reduce_mem_usage(X_train)
    # X_test = reduce_mem_usage(X_test)
    print('np.inf')
    # Replace +inf, -inf and missing values with the sentinel -999 in both frames.
    X_train[X_train == np.inf] = -999
    X_train[X_train == -np.inf] = -999
    X_train[X_train.isna()] = -999
    X_test[X_test == np.inf] = -999
    X_test[X_test == -np.inf] = -999
    X_test[X_test.isna()] = -999
    print('TransactionDT')
    # TransactionDT is removed only after all feature steps above have run.
    X_test.drop(['TransactionDT'], axis=1, inplace=True)
    X_train.drop(['TransactionDT'], axis=1, inplace=True)
    # Persist the results so the next run can take the cached branch.
    joblib.dump(X_train, 'features_train.pkl')
    joblib.dump(X_test, 'features_test.pkl')
    joblib.dump(y_train, 'y_train.pkl')

In [None]:
# Split the columns of X_train into numeric (num_cols) and categorical
# (cat_cols) groups, purely by column name.
#
# All lists are built in a deterministic order (the column order of X_train).
# Building them through set() would make the order depend on string hash
# randomisation, i.e. change from one kernel session to the next, and any
# order-sensitive step that consumes num_cols would stop being reproducible.
all_columns = list(X_train.columns)

# Name stems of the base numeric features: C1..C14, D1..D15, V1..V339 plus a
# few individually named columns.
columns_num_base = (
    ['TransactionAmt',
     'user_trx_cnt',
     'nulls1',
     'dist1',
     'dist2']
    + ['C{}'.format(i) for i in range(1, 15)]
    + ['D{}'.format(i) for i in range(1, 16)]
    + ['V' + str(i) for i in range(1, 340)]
)

# Additional name fragments that mark a column as numeric.
spec_nums = ['_count_',
             '_mean_',
             '_std_',
             'Transaction_day_of_week',
             'Transaction_hour_of_day']

# Columns whose name contains at least one of the special fragments
# (each column listed once, in X_train column order).
spec_nums_cols = [col for col in all_columns
                  if any(fragment in col for fragment in spec_nums)]

# A column is numeric when ANY stem/fragment is a substring of its name, so
# the stem 'C1' also matches every column whose name merely contains 'C1'.
_numeric_fragments = columns_num_base + spec_nums
_numeric_lookup = dict.fromkeys(
    [col for col in all_columns
     if any(fragment in col for fragment in _numeric_fragments)]
    # The base names are always part of num_cols, even when X_train has no
    # such column; dict.fromkeys drops repeats while keeping first-seen order.
    + columns_num_base
)

num_cols = list(_numeric_lookup)
# Everything that was not recognised as numeric is treated as categorical.
cat_cols = [col for col in all_columns if col not in _numeric_lookup]

In [None]:
# Keep only the numeric / categorical names that are actual columns of X_train
# (num_cols may also contain base names that are absent from the frame).
# Membership is tested against the column Index directly instead of rebuilding
# list(X_train.columns) for every single name.
num_cols_fin = [col for col in num_cols if col in X_train.columns]
cat_cols_fin = [col for col in cat_cols if col in X_train.columns]

In [None]:
# Drop numeric features that are almost perfectly rank-correlated with another
# numeric feature that appears earlier in the column order.
CORR = .99  # absolute Spearman correlation above which a column is dropped

# Restrict to columns still present so the cell can be re-run after a previous
# run has already dropped some of the names listed in num_cols_fin.
X_corr = X_train.loc[:, [col for col in num_cols_fin if col in X_train.columns]]
print('calculating corrs 1 for ', X_corr.shape[1])
corr_matrix = X_corr.corr(method='spearman').abs()
del X_corr

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every pair
# is looked at once; the rest becomes NaN and never compares greater than CORR.
# The builtin bool dtype is used because the np.bool alias was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped when it exceeds CORR against any column to its left.
to_drop = [column for column in upper.columns if (upper[column] > CORR).any()]
print('dropped because of high corr (', CORR, '): ', len(to_drop))
X_train.drop(to_drop, axis=1, inplace=True)
X_test.drop(to_drop, axis=1, inplace=True)

# Overwrite the feature pickles so later loads pick up the pruned matrices.
joblib.dump(X_train, 'features_train.pkl')
joblib.dump(X_test, 'features_test.pkl')