# 사전작업

## 모듈 임포트

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc

In [2]:
import warnings
# Silence warnings for the rest of the session.
# NOTE(review): no category is given, so every warning category is ignored,
# deprecation notices from pandas/NumPy/SciPy included.
warnings.filterwarnings('ignore')

In [3]:
# Let pandas display up to 400 columns before truncating a frame's repr.
pd.set_option('display.max_columns', 400)

In [4]:
# Directory prefix for the input CSV files; it is string-concatenated with the
# file names, so the trailing slash is required.
path = './data/'

In [5]:
# Load the raw historical transactions once at module level; the frame is
# passed to historical_transactions() further down.
history = pd.read_csv(path + 'historical_transactions.csv')

## Feature Engineering

### util functions

In [6]:
# reduce memory
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns to the narrowest dtype that holds their range.

    The frame is modified in place (columns are reassigned) and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are skipped.
    verbose : bool, default True
        When True, print the memory footprint after optimisation and the
        percentage saved. When False, nothing is printed.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns may become
        float16. float16 keeps only about three significant decimal digits,
        so large magnitudes (e.g. day counts in the tens of thousands) are
        rounded coarsely; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are strict: a column that touches a type's min/max goes
            # to the next wider type. If nothing fits the column is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of range for the narrower floats, or min/max is NaN
                # (every comparison above is then False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [7]:
from scipy import stats
def mode(x):
    """Return the most frequent value of ``x`` (smallest one on ties).

    ``scipy.stats.mode`` returns a length-1 array for 1-D input in older
    SciPy releases and a scalar in newer ones; ``np.atleast_1d`` makes the
    extraction work with both.

    NOTE(review): recent SciPy releases only accept numeric input, so
    non-numeric columns (e.g. datetimes) may need a pandas-based mode instead.
    """
    return np.atleast_1d(stats.mode(x)[0])[0]

In [8]:
def null_cnt(x):
    """Count the missing (NaN/None/NaT) entries of the pandas object ``x``."""
    missing_mask = x.isna()
    return missing_mask.sum()

In [9]:
def over_550(data):
    """Return how many entries of ``data`` are strictly greater than 550."""
    exceeds = data > 550
    return int(exceeds.sum())

In [10]:
def ref_date(data):
    """Return the first day of the purchase month shifted back by ``month_lag``.

    ``data`` must expose ``purchase_year``, ``purchase_month`` and
    ``month_lag`` as attributes (e.g. a DataFrame row). The month number
    ``purchase_month - month_lag`` may fall outside 1..12; it is folded back
    into a valid month and the overflow is carried into the year.
    """
    # Work with a zero-based month so divmod yields the year carry directly.
    year_shift, month_zero_based = divmod(data.purchase_month - data.month_lag - 1, 12)
    target_year = data.purchase_year + year_shift
    target_month = month_zero_based + 1
    return datetime.datetime(target_year, target_month, 1, 0, 0, 0)

### train test

In [11]:
# preprocessing train & test
def train_test(debug=False):
    """Load train.csv and test.csv, stack them and add first-active date features.

    Both files are read from the module-level ``path`` and indexed by
    ``card_id``. Train rows get an ``outliers`` flag (1 when ``target`` is
    below -30, else 0); test rows get ``target`` = NaN and, after stacking,
    ``outliers`` = NaN.

    Parameters
    ----------
    debug : bool, default False
        When True only the first 1000 rows of each file are used.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, with the extra columns
        ``first_active``, ``first_active_year``, ``first_active_month``
        (overwritten with the month number),
        ``first_active_elapsed_time_from_trade`` and ``first_active_total_day``.
    """
    # load csv
    train = pd.read_csv(path + 'train.csv', index_col=['card_id'])
    test = pd.read_csv(path + 'test.csv', index_col=['card_id'])

    # null processing: this one test card gets a hard-coded first_active_month.
    # The membership check prevents .loc from silently appending a new,
    # otherwise empty row if the card is absent.
    if 'C_ID_c27b4f80f7' in test.index:
        test.loc['C_ID_c27b4f80f7', 'first_active_month'] = '2017-03'

    if debug:
        # Copy the slices so the column assignments below do not write into views.
        train = train[:1000].copy()
        test = test[:1000].copy()

    # outlier
    train['outliers'] = (train['target'] < -30).astype(int)

    # set target as nan
    test['target'] = np.nan

    # merge: DataFrame.append was removed in pandas 2.0, pd.concat is the
    # replacement. sort=True keeps the alphabetical column order seen in the
    # notebook's recorded output.
    df = pd.concat([train, test], sort=True)

    del train, test
    gc.collect()

    # first_active_month
    first_active = pd.to_datetime(df['first_active_month'])
    trade_end = datetime.datetime(2018, 4, 30, 23, 59, 59)
    df['first_active'] = first_active
    df['first_active_year'] = first_active.dt.year
    df['first_active_month'] = first_active.dt.month
    df['first_active_elapsed_time_from_trade'] = (trade_end - first_active).dt.days
    # Days since the Unix epoch. Subtracting the epoch explicitly replaces
    # pd.to_timedelta(datetime column), which current pandas rejects.
    df['first_active_total_day'] = (
        (first_active - pd.Timestamp('1970-01-01')).dt.total_seconds() / (60 * 60 * 24)
    )

    # Experiments that were tried here and are currently disabled:
    #   - quarter / weekofyear / dayofweek of first_active, elapsed days from today
    #   - feature_1..3 multiplied and divided by first_active_elapsed_time_from_trade
    #   - mean outlier rate per value of feature_1..3, first_active_year,
    #     first_active_month and first_active_total_day ("<col>_outlier")
    #   - row-wise sum / mean / max / min / std of feature_1..3 and of their
    #     "_outlier" encodings

    return df

In [12]:
# Quick check on the 1000-row debug subset: show the first three rows.
train_test(debug=True).head(3)

Unnamed: 0_level_0,feature_1,feature_2,feature_3,first_active_month,outliers,target,first_active,first_active_year,first_active_elapsed_time_from_trade,first_active_total_day
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C_ID_92a2005557,5,2,1,6,0.0,-0.820283,2017-06-01,2017,333,17318.0
C_ID_3d0044924f,4,1,0,1,0.0,0.392913,2017-01-01,2017,484,17167.0
C_ID_d639edf6cd,2,2,0,8,0.0,0.688056,2016-08-01,2016,637,17014.0


In [13]:
# Build the full train+test frame and write it (index included) to CSV.
train_test().to_csv('./data_feature_engineering/train_test_v5.csv')

### historical_transactions

In [16]:
# preprocessing historical transactions
def historical_transactions(hist_df, debug=False, reference_date=None):
    """Aggregate raw historical transactions into one feature row per card_id.

    Parameters
    ----------
    hist_df : pandas.DataFrame
        Raw transactions. Columns read here: card_id, authorized_flag,
        category_1, category_2, category_3, city_id, installments,
        merchant_category_id, merchant_id, month_lag, purchase_amount,
        purchase_date, state_id, subsector_id.
        When ``debug`` is False the frame is modified in place (columns are
        overwritten/added and downcast) to avoid copying the full table.
    debug : bool, default False
        When True only a copy of the first 1000 rows is processed.
    reference_date : datetime-like, optional
        The "now" used for the ``*_from_today``, ``*_uptonow`` and
        ``*_uptomin`` features. Defaults to the current time, which makes
        those features depend on when the function is run; pass a fixed
        value for reproducible output.

    Returns
    -------
    pandas.DataFrame
        Indexed by card_id; aggregate columns are named
        ``hist_<column>_<statistic>``, plus four ``hist_purchase_date_*``
        columns derived from the min/max purchase date.
    """
    if debug:
        # Copy so that the edits below never write into the caller's frame.
        hist_df = hist_df[:1000].copy()

    # One "now" for every feature instead of several slightly different calls.
    today = datetime.datetime.today() if reference_date is None else pd.Timestamp(reference_date)
    trade_end = datetime.datetime(2018, 4, 30, 23, 59, 59)

    # fillna -- assign back instead of Series.fillna(..., inplace=True): the
    # chained in-place form does not update the frame under pandas Copy-on-Write.
    hist_df['category_2'] = hist_df['category_2'].fillna(1.0)
    hist_df['category_3'] = hist_df['category_3'].fillna('A')
    # -1 and 999 are treated as "missing" installment counts.
    hist_df['installments'] = hist_df['installments'].replace([-1, 999], np.nan)

    # trim: cap purchase_amount at 0.8
    hist_df['purchase_amount_trim'] = hist_df['purchase_amount'].clip(upper=0.8)

    # Y/N to 1/0
    hist_df['authorized_flag_label'] = hist_df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    hist_df['category_1_label'] = hist_df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    hist_df['category_3_label'] = hist_df['category_3'].map({'A': 0, 'B': 1, 'C': 2})

    # purchase date
    purchase_date = pd.to_datetime(hist_df['purchase_date'])
    hist_df['purchase_date'] = purchase_date
    hist_df['purchase_year'] = purchase_date.dt.year
    hist_df['purchase_month'] = purchase_date.dt.month
    hist_df['purchase_day'] = purchase_date.dt.day
    hist_df['purchase_hour'] = purchase_date.dt.hour
    hist_df['purchase_dayofweek'] = purchase_date.dt.dayofweek
    if hasattr(purchase_date.dt, 'isocalendar'):
        # .dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
        # replacement. Cast to a plain integer dtype so the column is still
        # downcast by reduce_mem_usage (assumes no missing purchase dates).
        hist_df['purchase_weekofyear'] = purchase_date.dt.isocalendar().week.astype('int64')
    else:
        hist_df['purchase_weekofyear'] = purchase_date.dt.weekofyear
    hist_df['purchase_weekend'] = (purchase_date.dt.weekday >= 5).astype(int)

    # additional features
    # NOTE(review): installments == 0 makes this +/-inf, which then propagates
    # into the price aggregates (visible as -inf in the notebook's sample output).
    hist_df['price'] = hist_df['purchase_amount'] / hist_df['installments']

    # Days remaining until each holiday, kept only when the purchase falls in
    # the 1..99 days before it; 0 otherwise.
    holidays = [
        ('Christmas_Day_2017', '2017-12-25'),
        ('Mothers_Day_2017', '2017-06-04'),
        ('fathers_day_2017', '2017-08-13'),
        ('Children_day_2017', '2017-10-12'),
        ('Valentine_Day_2017', '2017-06-12'),
        ('Black_Friday_2017', '2017-11-24'),
        ('Mothers_Day_2018', '2018-05-13'),
    ]
    for holiday_name, holiday_date in holidays:
        days_until = (pd.to_datetime(holiday_date) - purchase_date).dt.days
        hist_df[holiday_name] = days_until.where((days_until > 0) & (days_until < 100), 0)

    # new
    # Days since the Unix epoch (fractional). Explicit subtraction replaces
    # pd.to_timedelta(datetime column), which current pandas rejects.
    hist_df['purchase_date_total_day'] = (
        (purchase_date - pd.Timestamp('1970-01-01')).dt.total_seconds() / (60 * 60 * 24)
    )

    # First day of the month "purchase month minus month_lag". Vectorized form
    # of the row-wise ref_date() helper: count months on a zero-based scale,
    # shift by month_lag, then split back into year and month.
    month_index = hist_df['purchase_year'] * 12 + (hist_df['purchase_month'] - 1) - hist_df['month_lag']
    hist_df['ref_date'] = pd.to_datetime(pd.DataFrame({
        'year': month_index // 12,
        'month': month_index % 12 + 1,
        'day': 1,
    }))

    hist_df['month_diff_from_trade'] = (trade_end - purchase_date).dt.days // 30 + hist_df['month_lag']
    hist_df['month_diff_from_today'] = (today - purchase_date).dt.days // 30 + hist_df['month_lag']

    # reduce memory usage
    hist_df = reduce_mem_usage(hist_df)

    # Insertion order of this dict defines the order of the output columns.
    aggs = {
        'authorized_flag_label': [mode, 'sum', 'mean'],
        'card_id': ['size'],
        'city_id': [mode, 'nunique'],
        'category_1_label': [mode, 'sum', 'mean'],
        'installments': [mode, 'sum', 'mean', 'var', 'max', 'min', null_cnt],
        'category_3_label': [mode, 'mean'],
        'merchant_category_id': [mode, 'nunique'],
        'merchant_id': ['nunique'],
        'month_lag': [mode, 'sum', 'mean', 'var', 'max', 'min', 'skew'],
        'purchase_date': ['max', 'min'],
        'purchase_amount': ['sum', 'mean', 'var', 'max', 'min', 'skew', over_550],
        'category_2': [mode, 'mean'],
        'state_id': [mode, 'nunique'],
        'subsector_id': [mode, 'nunique'],
        'purchase_amount_trim': ['sum', 'mean', 'var', 'max', 'min', 'skew'],
        'purchase_year': [mode, 'nunique', 'mean', 'max', 'min'],
        'purchase_month': [mode, 'nunique', 'mean', 'max', 'min'],
        'purchase_day': [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew'],
        'purchase_hour': [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew'],
        'purchase_dayofweek': [mode, 'nunique', 'mean', 'max', 'min'],
        'purchase_weekofyear': [mode, 'nunique', 'mean', 'max', 'min'],
        'purchase_weekend': [mode, 'sum', 'mean'],
        'price': ['sum', 'mean', 'max', 'min', 'var'],
    }
    for holiday_name, _ in holidays:
        aggs[holiday_name] = ['mean']

    # new
    aggs['purchase_date_total_day'] = ['max', 'min', 'mean', 'var', 'skew']
    aggs['ref_date'] = [mode]
    aggs['month_diff_from_trade'] = ['max', 'min', 'mean', 'var', 'skew']
    aggs['month_diff_from_today'] = ['max', 'min', 'mean', 'var', 'skew']

    # Attach purchase_amount statistics of each transaction's category level,
    # then average them per card.
    for col in ['category_2', 'category_3']:
        amount_by_level = hist_df.groupby([col])['purchase_amount']
        for stat in ['mean', 'min', 'max', 'sum']:
            hist_df[col + '_' + stat] = amount_by_level.transform(stat)
            aggs[col + '_' + stat] = ['mean']

    hist_df = hist_df.groupby('card_id').agg(aggs)

    # change column name: flatten (column, statistic) pairs and add the prefix
    hist_df.columns = ['hist_' + column + '_' + stat for column, stat in hist_df.columns.tolist()]

    hist_df['hist_purchase_date_diff'] = (hist_df['hist_purchase_date_max'] - hist_df['hist_purchase_date_min']).dt.days
    hist_df['hist_purchase_date_average'] = hist_df['hist_purchase_date_diff'] / hist_df['hist_card_id_size']
    hist_df['hist_purchase_date_uptonow'] = (today - hist_df['hist_purchase_date_max']).dt.days
    hist_df['hist_purchase_date_uptomin'] = (today - hist_df['hist_purchase_date_min']).dt.days

    # reduce memory usage
    hist_df = reduce_mem_usage(hist_df)

    return hist_df

In [17]:
# Quick check on the first 1000 transactions: show three aggregated cards.
historical_transactions(history, debug=True).head(3)

Memory usage after optimization is: 0.10 MB
Decreased by 64.4%
Memory usage after optimization is: 0.00 MB
Decreased by 54.3%


Unnamed: 0_level_0,hist_authorized_flag_label_mode,hist_authorized_flag_label_sum,hist_authorized_flag_label_mean,hist_card_id_size,hist_city_id_mode,hist_city_id_nunique,hist_category_1_label_mode,hist_category_1_label_sum,hist_category_1_label_mean,hist_installments_mode,hist_installments_sum,hist_installments_mean,hist_installments_var,hist_installments_max,hist_installments_min,hist_installments_null_cnt,hist_category_3_label_mode,hist_category_3_label_mean,hist_merchant_category_id_mode,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_month_lag_mode,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_skew,hist_purchase_date_max,hist_purchase_date_min,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_skew,hist_purchase_amount_over_550,hist_category_2_mode,hist_category_2_mean,hist_state_id_mode,hist_state_id_nunique,hist_subsector_id_mode,hist_subsector_id_nunique,hist_purchase_amount_trim_sum,hist_purchase_amount_trim_mean,hist_purchase_amount_trim_var,hist_purchase_amount_trim_max,hist_purchase_amount_trim_min,hist_purchase_amount_trim_skew,hist_purchase_year_mode,hist_purchase_year_nunique,hist_purchase_year_mean,hist_purchase_year_max,hist_purchase_year_min,hist_purchase_month_mode,hist_purchase_month_nunique,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_day_mode,hist_purchase_day_nunique,hist_purchase_day_mean,hist_purchase_day_var,hist_purchase_day_max,hist_purchase_day_min,hist_purchase_day_skew,hist_purchase_hour_mode,hist_purchase_hour_nunique,hist_purchase_hour_mean,hist_purchase_hour_var,hist_purchase_hour_max,hist_purchase_hour_min,hist_purchase_hour_skew,hist_purchase_dayofweek_mode,hist_purchase_dayofweek_nunique,hist_purchase_dayofweek_mean,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_weekofyear_mode,hist_purchase_
weekofyear_nunique,hist_purchase_weekofyear_mean,hist_purchase_weekofyear_max,hist_purchase_weekofyear_min,hist_purchase_weekend_mode,hist_purchase_weekend_sum,hist_purchase_weekend_mean,hist_price_sum,hist_price_mean,hist_price_max,hist_price_min,hist_price_var,hist_Christmas_Day_2017_mean,hist_Mothers_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Valentine_Day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_purchase_date_total_day_max,hist_purchase_date_total_day_min,hist_purchase_date_total_day_mean,hist_purchase_date_total_day_var,hist_purchase_date_total_day_skew,hist_ref_date_mode,hist_month_diff_from_trade_max,hist_month_diff_from_trade_min,hist_month_diff_from_trade_mean,hist_month_diff_from_trade_var,hist_month_diff_from_trade_skew,hist_month_diff_from_today_max,hist_month_diff_from_today_min,hist_month_diff_from_today_mean,hist_month_diff_from_today_var,hist_month_diff_from_today_skew,hist_category_2_mean_mean,hist_category_2_min_mean,hist_category_2_max_mean,hist_category_2_sum_mean,hist_category_3_mean_mean,hist_category_3_min_mean,hist_category_3_max_mean,hist_category_3_sum_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_purchase_date_uptomin
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1
C_ID_0e171c1b48,1,261.0,0.935547,279,277,4,0,0,0.0,0.0,3.0,0.01075,0.010674,1.0,0.0,0.0,0,0.01075,705,36,72,-8,-1550.0,-5.554688,12.046875,0,-12,0.108337,2018-02-27 13:48:19,2017-02-25 09:38:46,-197.375,-0.70752,0.004387,-0.296143,-0.745605,4.03125,0.0,4.0,3.695312,13,3,33,17,-197.375,-0.70752,0.004387,-0.296143,-0.745605,4.03125,2017,2,2017.0,2018,2017,2,12,6.335938,12,1,20,31,15.84375,78.1875,31,1,-0.064514,20,20,15.71875,45.15625,23,0,-1.161133,5,7,3.476562,6,0,8,50,25.578125,52,1,0,112,0.401367,-inf,-inf,-0.56659,-inf,,9.351562,10.835938,15.789062,14.148438,11.390625,13.539062,8.234375,17584.0,17216.0,17408.0,11344.0,,2018-02-01,3,1,2.078125,0.094482,1.952148,12,11,11.5,0.250977,-0.007206,-0.673828,-0.745605,1.546875,-178.625,-0.712402,-0.745605,0.434082,-484.0,367,1.31543,346,713
C_ID_48fb13e70f,1,79.0,0.987305,80,69,2,0,5,0.0625,1.0,93.0,1.162109,0.289795,4.0,1.0,0.0,1,1.099609,278,28,43,-8,-370.0,-4.625,8.132812,0,-8,0.063049,2018-02-23 09:37:27,2017-06-01 18:51:35,-45.28125,-0.565918,0.053192,0.765137,-0.737793,3.400391,0.0,1.0,1.0,9,2,37,16,-45.28125,-0.565918,0.053192,0.765137,-0.737793,3.400391,2017,2,2017.0,2018,2017,6,9,7.726562,12,1,20,29,14.671875,61.625,31,1,-0.07312,0,18,14.625,45.90625,23,0,-0.98291,1,7,2.662109,6,0,23,22,31.59375,51,3,0,10,0.125,-44.862034,-0.560775,0.191227,-0.737892,0.043884,20.03125,0.0625,21.359375,3.412109,1.075195,12.273438,10.359375,17584.0,17312.0,17440.0,7744.0,,2018-02-01,3,2,2.074219,0.070251,3.289062,12,11,11.523438,0.252441,-0.102051,-0.666016,-0.745605,1.421875,-402.5,-0.4646,-0.738281,1.552734,-136.0,266,3.324219,350,616
C_ID_4e6213e9bc,1,391.0,0.977539,400,88,6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,80,18,55,-5,-2428.0,-6.074219,10.984375,0,-13,0.232056,2018-02-28 17:00:08,2017-01-20 09:16:43,-290.75,-0.727051,0.000836,-0.249268,-0.745605,11.796875,0.0,1.0,1.0,16,3,37,11,-290.75,-0.727051,0.000836,-0.249268,-0.745605,11.796875,2017,2,2017.0,2018,2017,9,12,6.335938,12,1,14,31,15.648438,71.0625,31,1,0.034912,12,21,13.796875,25.171875,23,0,-0.555176,2,7,2.806641,6,0,37,44,25.640625,50,1,0,83,0.20752,-inf,-inf,-inf,-inf,,17.265625,11.5,14.703125,17.6875,13.109375,23.015625,10.507812,17584.0,17184.0,17392.0,10368.0,,2018-02-01,3,2,2.080078,0.073792,3.107422,12,11,11.539062,0.248779,-0.171265,-0.666016,-0.745605,1.421875,-402.5,-0.714844,-0.745605,0.421875,-487.5,404,1.009766,345,749


In [49]:
# Aggregate the full history table and write the per-card features to CSV.
historical_transactions(history).to_csv('./data_feature_engineering/history_v1.csv')

Memory usage after optimization is: 2998.48 MB
Decreased by 62.0%
Memory usage after optimization is: 88.79 MB
Decreased by 54.2%


### new_merchant_transactions

In [44]:
# preprocessing new_merchant_transactions
def new_merchant_transactions(debug=False, reference_date=None):
    """Load new_merchant_transactions.csv and aggregate it to one row per card_id.

    The file is read from the module-level ``path``.

    Parameters
    ----------
    debug : bool, default False
        When True only the first 1000 rows of the file are processed.
    reference_date : datetime-like, optional
        The "now" used for ``new_purchase_date_uptonow`` and
        ``new_purchase_date_uptomin``. Defaults to the current time, which
        makes those two features depend on when the function is run; pass a
        fixed value for reproducible output.

    Returns
    -------
    pandas.DataFrame
        Indexed by card_id; aggregate columns are named
        ``new_<column>_<statistic>``, plus four ``new_purchase_date_*``
        columns derived from the min/max purchase date.
    """
    # load csv
    new_merchant_df = pd.read_csv(path + 'new_merchant_transactions.csv')

    if debug:
        # Copy so the column assignments below do not write into a view.
        new_merchant_df = new_merchant_df[:1000].copy()

    # One "now" for every feature instead of several slightly different calls.
    today = datetime.datetime.today() if reference_date is None else pd.Timestamp(reference_date)
    trade_end = datetime.datetime(2018, 4, 30, 23, 59, 59)

    # fillna -- assign back instead of Series.fillna(..., inplace=True): the
    # chained in-place form does not update the frame under pandas Copy-on-Write.
    new_merchant_df['category_2'] = new_merchant_df['category_2'].fillna(1.0)
    new_merchant_df['category_3'] = new_merchant_df['category_3'].fillna('A')
    # -1 and 999 are treated as "missing" installment counts.
    new_merchant_df['installments'] = new_merchant_df['installments'].replace([-1, 999], np.nan)

    # trim: cap purchase_amount at 0.8
    new_merchant_df['purchase_amount_trim'] = new_merchant_df['purchase_amount'].clip(upper=0.8)

    # Y/N to 1/0 (these three columns are overwritten with their numeric codes)
    new_merchant_df['authorized_flag'] = new_merchant_df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    new_merchant_df['category_1'] = new_merchant_df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    new_merchant_df['category_3'] = new_merchant_df['category_3'].map({'A': 0, 'B': 1, 'C': 2})

    # purchase date
    purchase_date = pd.to_datetime(new_merchant_df['purchase_date'])
    new_merchant_df['purchase_date'] = purchase_date
    new_merchant_df['purchase_year'] = purchase_date.dt.year
    new_merchant_df['purchase_month'] = purchase_date.dt.month
    new_merchant_df['purchase_day'] = purchase_date.dt.day
    new_merchant_df['purchase_hour'] = purchase_date.dt.hour
    new_merchant_df['purchase_dayofweek'] = purchase_date.dt.dayofweek
    if hasattr(purchase_date.dt, 'isocalendar'):
        # .dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
        # replacement. Cast to a plain integer dtype so the column is still
        # downcast by reduce_mem_usage (assumes no missing purchase dates).
        new_merchant_df['purchase_weekofyear'] = purchase_date.dt.isocalendar().week.astype('int64')
    else:
        new_merchant_df['purchase_weekofyear'] = purchase_date.dt.weekofyear
    new_merchant_df['purchase_weekend'] = (purchase_date.dt.weekday >= 5).astype(int)

    # additional features
    # NOTE(review): installments == 0 makes this +/-inf, which then propagates
    # into the price aggregates (visible as -inf in the notebook's sample output).
    new_merchant_df['price'] = new_merchant_df['purchase_amount'] / new_merchant_df['installments']

    new_merchant_df['month_diff'] = (trade_end - purchase_date).dt.days // 30 + new_merchant_df['month_lag']

    # Days remaining until each holiday, kept only when the purchase falls in
    # the 1..99 days before it; 0 otherwise.
    holidays = [
        ('Christmas_Day_2017', '2017-12-25'),
        ('Children_day_2017', '2017-10-12'),
        ('Black_Friday_2017', '2017-11-24'),
        ('Mothers_Day_2018', '2018-05-13'),
    ]
    for holiday_name, holiday_date in holidays:
        days_until = (pd.to_datetime(holiday_date) - purchase_date).dt.days
        new_merchant_df[holiday_name] = days_until.where((days_until > 0) & (days_until < 100), 0)

    # reduce memory usage
    new_merchant_df = reduce_mem_usage(new_merchant_df)

    # Insertion order of this dict defines the order of the output columns.
    aggs = {
        'authorized_flag': [mode, 'sum', 'mean'],
        'card_id': ['size'],
        'city_id': [mode, 'nunique'],
        'category_1': [mode, 'sum', 'mean'],
        'installments': [mode, 'sum', 'mean', 'var', 'max', 'min', null_cnt],
        'category_3': [mode, 'mean'],
        'merchant_category_id': [mode, 'nunique'],
        'merchant_id': ['nunique'],
        'month_lag': [mode, 'sum', 'mean', 'var', 'max', 'min', 'skew'],
        'purchase_date': ['max', 'min'],
        'purchase_amount': ['sum', 'mean', 'var', 'max', 'min', 'skew', over_550],
        'category_2': [mode, 'mean'],
        'state_id': [mode, 'nunique'],
        'subsector_id': [mode, 'nunique'],
        'purchase_amount_trim': ['sum', 'mean', 'var', 'max', 'min', 'skew'],
        'purchase_year': [mode, 'nunique', 'mean', 'max', 'min'],
        'purchase_month': [mode, 'nunique', 'mean', 'max', 'min'],
        'purchase_day': [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew'],
        'purchase_hour': [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew'],
        'purchase_dayofweek': [mode, 'nunique', 'mean', 'max', 'min'],
        'purchase_weekofyear': [mode, 'nunique', 'mean', 'max', 'min'],
        'purchase_weekend': [mode, 'sum', 'mean'],
        'price': ['sum', 'mean', 'max', 'min', 'var'],
        'month_diff': ['max', 'min', 'mean', 'var', 'skew'],
    }
    for holiday_name, _ in holidays:
        aggs[holiday_name] = ['mean']

    # Attach the mean purchase_amount of each transaction's category level and
    # average it per card. Only the "_mean" column is aggregated, so the
    # previously computed (and never used) _min/_max/_sum columns are not built.
    for col in ['category_2', 'category_3']:
        new_merchant_df[col + '_mean'] = new_merchant_df.groupby([col])['purchase_amount'].transform('mean')
        aggs[col + '_mean'] = ['mean']

    new_merchant_df = new_merchant_df.groupby('card_id').agg(aggs)

    # change column name: flatten (column, statistic) pairs and add the prefix
    new_merchant_df.columns = ['new_' + column + '_' + stat for column, stat in new_merchant_df.columns.tolist()]

    new_merchant_df['new_purchase_date_diff'] = (new_merchant_df['new_purchase_date_max'] - new_merchant_df['new_purchase_date_min']).dt.days
    new_merchant_df['new_purchase_date_average'] = new_merchant_df['new_purchase_date_diff'] / new_merchant_df['new_card_id_size']
    new_merchant_df['new_purchase_date_uptonow'] = (today - new_merchant_df['new_purchase_date_max']).dt.days
    new_merchant_df['new_purchase_date_uptomin'] = (today - new_merchant_df['new_purchase_date_min']).dt.days

    # reduce memory usage
    new_merchant_df = reduce_mem_usage(new_merchant_df)

    return new_merchant_df

In [46]:
# Quick check on the first 1000 new-merchant transactions: show three aggregated cards.
new_merchant_transactions(debug=True).head(3)

Memory usage after optimization is: 0.06 MB
Decreased by 70.6%
Memory usage after optimization is: 0.02 MB
Decreased by 54.4%


Unnamed: 0_level_0,new_authorized_flag_mode,new_authorized_flag_sum,new_authorized_flag_mean,new_card_id_size,new_city_id_mode,new_city_id_nunique,new_category_1_mode,new_category_1_sum,new_category_1_mean,new_installments_mode,new_installments_sum,new_installments_mean,new_installments_var,new_installments_max,new_installments_min,new_installments_null_cnt,new_category_3_mode,new_category_3_mean,new_merchant_category_id_mode,new_merchant_category_id_nunique,new_merchant_id_nunique,new_month_lag_mode,new_month_lag_sum,new_month_lag_mean,new_month_lag_var,new_month_lag_max,new_month_lag_min,new_month_lag_skew,new_purchase_date_max,new_purchase_date_min,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_var,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_skew,new_purchase_amount_over_550,new_category_2_mode,new_category_2_mean,new_state_id_mode,new_state_id_nunique,new_subsector_id_mode,new_subsector_id_nunique,new_purchase_amount_trim_sum,new_purchase_amount_trim_mean,new_purchase_amount_trim_var,new_purchase_amount_trim_max,new_purchase_amount_trim_min,new_purchase_amount_trim_skew,new_purchase_year_mode,new_purchase_year_nunique,new_purchase_year_mean,new_purchase_year_max,new_purchase_year_min,new_purchase_month_mode,new_purchase_month_nunique,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_day_mode,new_purchase_day_nunique,new_purchase_day_mean,new_purchase_day_var,new_purchase_day_max,new_purchase_day_min,new_purchase_day_skew,new_purchase_hour_mode,new_purchase_hour_nunique,new_purchase_hour_mean,new_purchase_hour_var,new_purchase_hour_max,new_purchase_hour_min,new_purchase_hour_skew,new_purchase_dayofweek_mode,new_purchase_dayofweek_nunique,new_purchase_dayofweek_mean,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_weekofyear_mode,new_purchase_weekofyear_nunique,new_purchase_weekofyear_mean,new_purchase_weekofyear_max,new_purchase_weekofyear_min,new_purchase_weekend_mod
e,new_purchase_weekend_sum,new_purchase_weekend_mean,new_price_sum,new_price_mean,new_price_max,new_price_min,new_price_var,new_month_diff_max,new_month_diff_min,new_month_diff_mean,new_month_diff_var,new_month_diff_skew,new_Christmas_Day_2017_mean,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Mothers_Day_2018_mean,new_duration_mean,new_duration_min,new_duration_max,new_duration_var,new_duration_skew,new_amount_month_ratio_mean,new_amount_month_ratio_min,new_amount_month_ratio_max,new_amount_month_ratio_var,new_amount_month_ratio_skew,new_category_2_mean_mean,new_category_3_mean_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_purchase_date_uptomin
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1
C_ID_0682c61725,1,2,1,2,19,2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,222,2,2,1,3,1.5,0.5,2,1,,2018-04-09 11:49:24,2018-03-05 09:39:03,-1.293945,-0.646973,0.010162,-0.575684,-0.718262,,0.0,1.0,1.0,9,1,21,1,-1.293945,-0.646973,0.010162,-0.575684,-0.718262,,2018,1,2018.0,2018,2018,3,2,3.5,4,3,5,2,7.0,8.0,9,5,,9,2,10.0,2.0,11,9,,0,1,0.0,0,0,10,2,12.5,15,10,0,0,0.0,-inf,-inf,-inf,-inf,,2,2,2.0,0.0,,0.0,0.0,0.0,50.5,-1.293945,-1.436523,-1.151367,0.040649,,-0.323486,-0.359131,-0.287842,0.002541,,-0.550293,-0.541504,35,17.5,303,338
C_ID_086fe1da99,1,22,1,22,69,5,0,1,0.045441,1.0,22.0,1.0,0.0,1.0,1.0,0.0,1,1.0,307,18,22,1,31,1.40918,0.253174,2,1,0.397461,2018-04-29 11:54:06,2018-03-01 14:25:48,-13.101562,-0.595215,0.083862,0.673828,-0.739258,4.375,0.0,1.0,1.0,9,3,37,14,-13.101562,-0.595215,0.083862,0.673828,-0.739258,4.375,2018,1,2018.0,2018,2018,3,2,3.408203,4,3,16,16,17.140625,96.0,31,1,-0.136963,13,11,13.226562,12.664062,20,6,-0.21228,5,7,3.818359,6,0,11,9,13.0,17,9,0,11,0.5,-13.099166,-0.595417,0.673639,-0.739395,0.083862,3,2,2.044922,0.045441,4.691406,0.0,0.0,0.0,43.1875,-1.21875,-1.879883,1.347656,0.357178,4.0625,-0.292969,-0.369629,0.336914,0.021317,4.179688,-0.550293,-0.588379,58,2.636719,283,342
C_ID_0a6ceeffcc,1,21,1,21,143,5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,307,14,20,1,29,1.380859,0.247559,2,1,0.528809,2017-12-22 13:52:37,2017-11-02 12:39:37,-14.859375,-0.70752,0.000999,-0.59668,-0.739258,2.347656,0.0,5.0,4.429688,5,3,19,12,-14.859375,-0.70752,0.000999,-0.59668,-0.739258,2.347656,2017,1,2017.0,2017,2017,11,2,11.382812,12,11,11,16,15.765625,70.1875,29,1,-0.075378,0,10,11.335938,40.21875,19,0,-0.888672,4,7,3.857422,6,0,47,8,47.46875,51,44,0,8,0.380859,-inf,-inf,-inf,-inf,,7,6,6.046875,0.047607,4.582031,27.0,0.0,3.761719,0.0,-4.28125,-5.160156,-3.580078,0.075073,-0.824219,-0.117065,-0.12323,-0.099426,3.4e-05,1.860352,-0.630859,-0.541504,50,2.380859,411,461


In [50]:
# Run the new-merchant aggregation (defined earlier in the notebook) and
# persist its result. to_csv writes the frame's index by default, so the index
# (presumably card_id, as in the preview output above) becomes the first column.
new_merchant_transactions().to_csv('./data_feature_engineering/new_v1.csv')

Memory usage after optimization is: 125.43 MB
Decreased by 70.6%
Memory usage after optimization is: 69.42 MB
Decreased by 51.4%


### additional_features

In [9]:
# Load the three pre-computed feature tables in one pass:
# train/test base features, historical-transaction aggregates and
# new-merchant aggregates (in that order).
df, history_df, new_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data_feature_engineering/train_test_v5.csv',
        './data_feature_engineering/history_v1.csv',
        './data_feature_engineering/new_v1.csv',
    )
)

In [10]:
# Full outer joins on card_id: attach the historical-transaction aggregates
# first, then the new-merchant aggregates.
result = (
    df.merge(history_df, on='card_id', how='outer')
      .merge(new_df, on='card_id', how='outer')
)

In [11]:
# Peek at the first three rows of the merged feature table.
result.head(3)

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,first_active,first_active_year,first_active_elapsed_time_from_trade,first_active_total_day,hist_authorized_flag_label_mode,hist_authorized_flag_label_sum,hist_authorized_flag_label_mean,hist_card_id_size,hist_city_id_mode,hist_city_id_nunique,hist_category_1_label_mode,hist_category_1_label_sum,hist_category_1_label_mean,hist_installments_mode,hist_installments_sum,hist_installments_mean,hist_installments_var,hist_installments_max,hist_installments_min,hist_installments_null_cnt,hist_category_3_label_mode,hist_category_3_label_mean,hist_merchant_category_id_mode,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_month_lag_mode,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_skew,hist_purchase_date_max,hist_purchase_date_min,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_skew,hist_purchase_amount_over_550,hist_category_2_mode,hist_category_2_mean,hist_state_id_mode,hist_state_id_nunique,hist_subsector_id_mode,hist_subsector_id_nunique,hist_purchase_amount_trim_sum,hist_purchase_amount_trim_mean,hist_purchase_amount_trim_var,hist_purchase_amount_trim_max,hist_purchase_amount_trim_min,hist_purchase_amount_trim_skew,hist_purchase_year_mode,hist_purchase_year_nunique,hist_purchase_year_mean,hist_purchase_year_max,hist_purchase_year_min,hist_purchase_month_mode,hist_purchase_month_nunique,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_day_mode,hist_purchase_day_nunique,hist_purchase_day_mean,hist_purchase_day_var,hist_purchase_day_max,hist_purchase_day_min,hist_purchase_day_skew,hist_purchase_hour_mode,hist_purchase_hour_nunique,hist_purchase_hour_mean,hist_purchase_hour_var,hist_purchase_hour_max,hist_purchase_hour_min,hist_purchase_hour_skew,hist_purchase_dayofweek_mode,hist_
purchase_dayofweek_nunique,hist_purchase_dayofweek_mean,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_weekofyear_mode,hist_purchase_weekofyear_nunique,hist_purchase_weekofyear_mean,hist_purchase_weekofyear_max,hist_purchase_weekofyear_min,hist_purchase_weekend_mode,hist_purchase_weekend_sum,hist_purchase_weekend_mean,hist_price_sum,hist_price_mean,hist_price_max,hist_price_min,hist_price_var,hist_month_diff_max,hist_month_diff_min,hist_month_diff_mean,hist_month_diff_var,hist_month_diff_skew,hist_Christmas_Day_2017_mean,hist_Mothers_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Valentine_Day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_duration_mean,hist_duration_min,hist_duration_max,hist_duration_var,hist_duration_skew,hist_amount_month_ratio_mean,hist_amount_month_ratio_min,hist_amount_month_ratio_max,hist_amount_month_ratio_var,hist_amount_month_ratio_skew,hist_category_2_mean_mean,hist_category_2_min_mean,hist_category_2_max_mean,hist_category_2_sum_mean,hist_category_3_mean_mean,hist_category_3_min_mean,hist_category_3_max_mean,hist_category_3_sum_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_purchase_date_uptomin,new_authorized_flag_mode,new_authorized_flag_sum,new_authorized_flag_mean,new_card_id_size,new_city_id_mode,new_city_id_nunique,new_category_1_mode,new_category_1_sum,new_category_1_mean,new_installments_mode,new_installments_sum,new_installments_mean,new_installments_var,new_installments_max,new_installments_min,new_installments_null_cnt,new_category_3_mode,new_category_3_mean,new_merchant_category_id_mode,new_merchant_category_id_nunique,new_merchant_id_nunique,new_month_lag_mode,new_month_lag_sum,new_month_lag_mean,new_month_lag_var,new_month_lag_max,new_month_lag_min,new_month_lag_skew,new_purchase_date_max,new_purchase_date_min,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_var,new_purchase_amount_max,new_p
urchase_amount_min,new_purchase_amount_skew,new_purchase_amount_over_550,new_category_2_mode,new_category_2_mean,new_state_id_mode,new_state_id_nunique,new_subsector_id_mode,new_subsector_id_nunique,new_purchase_amount_trim_sum,new_purchase_amount_trim_mean,new_purchase_amount_trim_var,new_purchase_amount_trim_max,new_purchase_amount_trim_min,new_purchase_amount_trim_skew,new_purchase_year_mode,new_purchase_year_nunique,new_purchase_year_mean,new_purchase_year_max,new_purchase_year_min,new_purchase_month_mode,new_purchase_month_nunique,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_day_mode,new_purchase_day_nunique,new_purchase_day_mean,new_purchase_day_var,new_purchase_day_max,new_purchase_day_min,new_purchase_day_skew,new_purchase_hour_mode,new_purchase_hour_nunique,new_purchase_hour_mean,new_purchase_hour_var,new_purchase_hour_max,new_purchase_hour_min,new_purchase_hour_skew,new_purchase_dayofweek_mode,new_purchase_dayofweek_nunique,new_purchase_dayofweek_mean,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_weekofyear_mode,new_purchase_weekofyear_nunique,new_purchase_weekofyear_mean,new_purchase_weekofyear_max,new_purchase_weekofyear_min,new_purchase_weekend_mode,new_purchase_weekend_sum,new_purchase_weekend_mean,new_price_sum,new_price_mean,new_price_max,new_price_min,new_price_var,new_month_diff_max,new_month_diff_min,new_month_diff_mean,new_month_diff_var,new_month_diff_skew,new_Christmas_Day_2017_mean,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Mothers_Day_2018_mean,new_duration_mean,new_duration_min,new_duration_max,new_duration_var,new_duration_skew,new_amount_month_ratio_mean,new_amount_month_ratio_min,new_amount_month_ratio_max,new_amount_month_ratio_var,new_amount_month_ratio_skew,new_category_2_mean_mean,new_category_3_mean_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_purchase_date_uptomin
0,C_ID_92a2005557,5,2,1,6,0.0,-0.820283,2017-06-01,2017,333,17318.0,1,247.0,0.95,260,69,7,0,0.0,0.0,0.0,4.0,0.01538,0.015205,1.0,0.0,0.0,0,0.01538,560,41,94,-2,-1017.0,-3.912,5.75,0,-8,0.066,2018-02-25 09:31:15,2017-06-27 14:18:08,-165.96873,-0.638341,0.045003,2.258395,-0.7393,10.24,0.0,1.0,1.046,9,3,34,21,-167.4,-0.644,0.02057,0.8,-0.7393,5.133,2017,2,2017.0,2018,2017,12,9,8.055,12,1,11,31,15.51,76.9,31,1,0.10236,14,23,13.31,24.69,23,0,-0.887,5,7,3.21,6,0,50,35,33.06,52,1,0,90.0,0.3462,,,inf,-inf,,3,2,2.072,0.068,3.299,13.125,0.0,6.266,26.77,0.0,16.47,7.754,-1.325042,-2.201,4.516789,0.216691,7.734,-0.311,-0.3696,1.129197,0.011823,9.35,0.0725,-0.747,5942464.5,1309718.6,0.3467,-0.747,5920398.5,5429670.5,242,0.9307,346,589,1.0,23.0,1.0,23.0,69.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278.0,14.0,23.0,1.0,34.0,1.479,0.261,2.0,1.0,0.09326,2018-04-29 11:23:05,2018-03-05 14:04:36,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.895996,0.0,1.0,1.0,9.0,1.0,37.0,10.0,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.896,2018.0,1.0,2018.0,2018.0,2018.0,3.0,2.0,3.479,4.0,3.0,6.0,17.0,16.44,88.8,31.0,5.0,0.3389,13.0,8.0,12.87,4.21,16.0,8.0,-0.6035,4.0,7.0,3.13,6.0,0.0,13.0,7.0,13.305,17.0,10.0,0.0,6.0,0.261,-inf,-inf,-inf,-inf,,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,41.75,-1.151,-1.449,-0.5923,0.0737,0.895996,-0.2878,-0.3623,-0.1481,0.00461,0.895996,-0.5503,-0.593,54.0,2.348,283.0,338.0
1,C_ID_3d0044924f,4,1,0,1,0.0,0.392913,2017-01-01,2017,484,17167.0,1,339.0,0.9688,350,69,9,0,31.0,0.08856,1.0,545.0,1.566,2.258,10.0,1.0,2.0,1,1.2,307,57,142,0,-1761.0,-5.03,14.48,0,-12,-0.258,2018-01-31 22:31:09,2017-01-06 16:29:42,-210.00633,-0.600018,0.1482,4.6303,-0.742,8.81,0.0,1.0,1.0,9,3,34,24,-215.4,-0.615,0.0586,0.8,-0.742,3.744,2017,2,2017.0,2018,2017,1,12,6.22,12,1,19,31,16.67,77.4,31,1,-0.2357,12,24,14.72,31.16,23,0,-0.8936,5,7,3.363,6,0,3,50,25.22,52,1,0,132.0,0.3772,-200.113283,-0.575038,2.31515,-0.7424,0.082442,4,2,3.092,0.1005,2.0,10.65,8.734,12.555,13.79,6.57,9.836,0.0,-1.853766,-2.951,13.890898,1.405803,8.25,-0.196,-0.3667,1.543433,0.016649,8.555,0.0746,-0.747,6010604.0,1329550.4,-0.2952,-0.747,156963.11,-3653208.0,390,1.114,370,761,1.0,6.0,1.0,6.0,69.0,1.0,0.0,0.0,0.0,1.0,6.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,307.0,5.0,6.0,1.0,9.0,1.5,0.3,2.0,1.0,0.0,2018-03-30 06:48:26,2018-02-01 17:07:54,-4.355,-0.726,0.000207,-0.7017,-0.7393,1.255859,0.0,1.0,1.0,9.0,1.0,19.0,4.0,-4.355,-0.726,0.000207,-0.7017,-0.7393,1.256,2018.0,1.0,2018.0,2018.0,2018.0,2.0,2.0,2.5,3.0,2.0,5.0,4.0,13.5,131.5,30.0,1.0,0.37,17.0,5.0,11.164,24.56,17.0,6.0,0.3833,0.0,4.0,1.5,4.0,0.0,6.0,4.0,9.0,13.0,5.0,0.0,0.0,0.0,-4.355735,-0.725956,-0.701858,-0.73941,0.000205,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,56.84,-2.178,-2.219,-2.105,0.001841,1.180664,-0.242,-0.2465,-0.234,2.3e-05,1.151367,-0.5503,-0.6064,56.0,9.336,313.0,370.0
2,C_ID_d639edf6cd,2,2,0,8,0.0,0.688056,2016-08-01,2016,637,17014.0,1,41.0,0.9536,43,143,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,705,8,13,-10,-370.0,-8.6,14.766,0,-13,0.7256,2018-02-27 19:08:25,2017-01-11 08:21:22,-29.16739,-0.678311,0.007635,-0.145847,-0.73,5.625,0.0,5.0,4.63,5,2,33,7,-29.17,-0.678,0.007637,-0.1459,-0.73,5.62,2017,2,2017.0,2018,2017,1,10,4.56,12,1,21,19,19.33,62.3,30,2,-0.7563,19,14,17.9,12.375,23,8,-0.8867,4,7,3.303,6,0,4,22,18.38,49,2,0,11.0,0.2559,-inf,-inf,-inf,-inf,,3,2,2.07,0.06647,3.5,9.67,17.0,8.91,7.51,17.66,8.91,1.721,-1.406243,-2.182,-0.291695,0.070888,0.3125,-0.3308,-0.365,-0.072924,0.002562,3.719,-0.0878,-0.747,654527.7,-229313.4,0.3584,-0.747,6010604.0,5588325.0,412,9.58,344,756,1.0,1.0,1.0,1.0,143.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,528.0,1.0,1.0,2.0,2.0,2.0,,2.0,2.0,,2018-04-28 17:43:11,2018-04-28 17:43:11,-0.7,-0.7,,-0.7,-0.7,,0.0,5.0,5.0,5.0,1.0,25.0,1.0,-0.7,-0.7,,-0.7,-0.7,,2018.0,1.0,2018.0,2018.0,2018.0,4.0,1.0,4.0,4.0,4.0,28.0,1.0,28.0,,28.0,28.0,,17.0,1.0,17.0,,17.0,17.0,,5.0,1.0,5.0,5.0,5.0,17.0,1.0,17.0,17.0,17.0,1.0,1.0,1.0,-inf,-inf,-inf,-inf,,2.0,2.0,2.0,,,0.0,0.0,0.0,14.0,-1.4,-1.4,-1.4,,,-0.35,-0.35,-0.35,,,-0.549,-0.593,0.0,0.0,284.0,284.0


In [12]:
# Sanity-check the merged table's dimensions as (rows, columns).
result.shape

(325540, 252)

In [13]:
# additional features
def additional_features(df):
    """Add features that combine the ``hist_*`` and ``new_*`` aggregates.

    The frame is modified in place and the same object is returned.

    Columns added (in this order):
      * ``hist_first_buy``, ``hist_last_buy``, ``new_first_buy``,
        ``new_last_buy``: whole days between ``first_active`` and the
        first/last purchase date of each transaction table.
      * sums and ratios of matching ``new_*`` / ``hist_*`` aggregates
        (transaction counts, purchase amounts, month diff/lag, category_1,
        installments, duration, amount/month ratio).
      * ``price_*``: combined purchase amount divided by combined installments.
      * ``new_CLV``, ``hist_CLV``, ``CLV_ratio``.

    Columns rewritten: the four ``*_purchase_date_{min,max}`` columns are
    replaced by float seconds since the Unix epoch. Missing dates stay NaN
    instead of turning into the int64 NaT sentinel.

    The function is safe to call more than once on the same frame: a
    purchase-date column that is already numeric is interpreted as epoch
    seconds and left untouched, so repeated calls give the same result.

    Divisions are not guarded against zero denominators, so ratio features
    may contain ``inf``/``-inf``/NaN.

    Args:
        df: DataFrame holding ``first_active`` plus the ``hist_*`` and
            ``new_*`` columns referenced below.

    Returns:
        The same DataFrame object, with the columns described above.
    """
    date_features = ['hist_purchase_date_max', 'hist_purchase_date_min',
                     'new_purchase_date_max', 'new_purchase_date_min']
    epoch = pd.Timestamp('1970-01-01')
    first_active = pd.to_datetime(df['first_active'])

    # Parse every purchase-date column exactly once. A numeric column means a
    # previous call already converted it to epoch seconds; feeding those
    # floats to a plain pd.to_datetime would read them as nanoseconds.
    purchase_dates = {}
    pending_conversion = []
    for f in date_features:
        if pd.api.types.is_numeric_dtype(df[f]):
            purchase_dates[f] = pd.to_datetime(df[f], unit='s')
        else:
            purchase_dates[f] = pd.to_datetime(df[f])
            pending_conversion.append(f)

    df['hist_first_buy'] = (purchase_dates['hist_purchase_date_min'] - first_active).dt.days
    df['hist_last_buy'] = (purchase_dates['hist_purchase_date_max'] - first_active).dt.days
    df['new_first_buy'] = (purchase_dates['new_purchase_date_min'] - first_active).dt.days
    df['new_last_buy'] = (purchase_dates['new_purchase_date_max'] - first_active).dt.days

    # Epoch seconds via timedelta arithmetic: independent of the datetime
    # resolution and NaT propagates as NaN.
    for f in pending_conversion:
        df[f] = (purchase_dates[f] - epoch).dt.total_seconds()

    # Transaction counts.
    df['card_id_total_size'] = df['new_card_id_size'] + df['hist_card_id_size']
    df['card_id_size_ratio'] = df['new_card_id_size'] / df['hist_card_id_size']

    # Purchase amounts.
    df['purchase_amount_total'] = df['new_purchase_amount_sum'] + df['hist_purchase_amount_sum']
    df['purchase_amount_mean'] = df['new_purchase_amount_mean'] + df['hist_purchase_amount_mean']
    df['purchase_amount_max'] = df['new_purchase_amount_max'] + df['hist_purchase_amount_max']
    df['purchase_amount_min'] = df['new_purchase_amount_min'] + df['hist_purchase_amount_min']
    df['purchase_amount_ratio'] = df['new_purchase_amount_sum'] / df['hist_purchase_amount_sum']

    # Month diff / month lag.
    df['month_diff_mean'] = df['new_month_diff_mean'] + df['hist_month_diff_mean']
    df['month_diff_ratio'] = df['new_month_diff_mean'] / df['hist_month_diff_mean']
    df['month_lag_mean'] = df['new_month_lag_mean'] + df['hist_month_lag_mean']
    df['month_lag_max'] = df['new_month_lag_max'] + df['hist_month_lag_max']
    df['month_lag_min'] = df['new_month_lag_min'] + df['hist_month_lag_min']

    # category_1 (the history table stores it under the *_label_* name).
    df['category_1_mean'] = df['new_category_1_mean'] + df['hist_category_1_label_mean']

    # Installments.
    df['installments_total'] = df['new_installments_sum'] + df['hist_installments_sum']
    df['installments_mean'] = df['new_installments_mean'] + df['hist_installments_mean']
    df['installments_max'] = df['new_installments_max'] + df['hist_installments_max']
    df['installments_ratio'] = df['new_installments_sum'] / df['hist_installments_sum']

    # Combined amount per combined installment.
    df['price_total'] = df['purchase_amount_total'] / df['installments_total']
    df['price_mean'] = df['purchase_amount_mean'] / df['installments_mean']
    df['price_max'] = df['purchase_amount_max'] / df['installments_max']

    # Duration and amount/month ratio.
    df['duration_mean'] = df['new_duration_mean'] + df['hist_duration_mean']
    df['duration_min'] = df['new_duration_min'] + df['hist_duration_min']
    df['duration_max'] = df['new_duration_max'] + df['hist_duration_max']
    df['amount_month_ratio_mean'] = df['new_amount_month_ratio_mean'] + df['hist_amount_month_ratio_mean']
    df['amount_month_ratio_min'] = df['new_amount_month_ratio_min'] + df['hist_amount_month_ratio_min']
    df['amount_month_ratio_max'] = df['new_amount_month_ratio_max'] + df['hist_amount_month_ratio_max']

    # CLV-style score: count * amount sum / mean month diff, per table.
    df['new_CLV'] = df['new_card_id_size'] * df['new_purchase_amount_sum'] / df['new_month_diff_mean']
    df['hist_CLV'] = df['hist_card_id_size'] * df['hist_purchase_amount_sum'] / df['hist_month_diff_mean']
    df['CLV_ratio'] = df['new_CLV'] / df['hist_CLV']

    return df

In [14]:
# Preview on a throwaway copy of the first rows: additional_features() writes
# into the frame it receives (it overwrites the purchase-date columns), so
# previewing on `result` itself would alter the data before the real call
# further down. Every added feature is computed row-wise, so the preview rows
# are the same as they would be on the full table.
additional_features(result.head(3).copy()).head(3)

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,first_active,first_active_year,first_active_elapsed_time_from_trade,first_active_total_day,hist_authorized_flag_label_mode,hist_authorized_flag_label_sum,hist_authorized_flag_label_mean,hist_card_id_size,hist_city_id_mode,hist_city_id_nunique,hist_category_1_label_mode,hist_category_1_label_sum,hist_category_1_label_mean,hist_installments_mode,hist_installments_sum,hist_installments_mean,hist_installments_var,hist_installments_max,hist_installments_min,hist_installments_null_cnt,hist_category_3_label_mode,hist_category_3_label_mean,hist_merchant_category_id_mode,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_month_lag_mode,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_skew,hist_purchase_date_max,hist_purchase_date_min,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_skew,hist_purchase_amount_over_550,hist_category_2_mode,hist_category_2_mean,hist_state_id_mode,hist_state_id_nunique,hist_subsector_id_mode,hist_subsector_id_nunique,hist_purchase_amount_trim_sum,hist_purchase_amount_trim_mean,hist_purchase_amount_trim_var,hist_purchase_amount_trim_max,hist_purchase_amount_trim_min,hist_purchase_amount_trim_skew,hist_purchase_year_mode,hist_purchase_year_nunique,hist_purchase_year_mean,hist_purchase_year_max,hist_purchase_year_min,hist_purchase_month_mode,hist_purchase_month_nunique,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_day_mode,hist_purchase_day_nunique,hist_purchase_day_mean,hist_purchase_day_var,hist_purchase_day_max,hist_purchase_day_min,hist_purchase_day_skew,hist_purchase_hour_mode,hist_purchase_hour_nunique,hist_purchase_hour_mean,hist_purchase_hour_var,hist_purchase_hour_max,hist_purchase_hour_min,hist_purchase_hour_skew,hist_purchase_dayofweek_mode,hist_
purchase_dayofweek_nunique,hist_purchase_dayofweek_mean,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_weekofyear_mode,hist_purchase_weekofyear_nunique,hist_purchase_weekofyear_mean,hist_purchase_weekofyear_max,hist_purchase_weekofyear_min,hist_purchase_weekend_mode,hist_purchase_weekend_sum,hist_purchase_weekend_mean,hist_price_sum,hist_price_mean,hist_price_max,hist_price_min,hist_price_var,hist_month_diff_max,hist_month_diff_min,hist_month_diff_mean,hist_month_diff_var,hist_month_diff_skew,hist_Christmas_Day_2017_mean,hist_Mothers_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Valentine_Day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_duration_mean,hist_duration_min,hist_duration_max,hist_duration_var,hist_duration_skew,hist_amount_month_ratio_mean,hist_amount_month_ratio_min,hist_amount_month_ratio_max,hist_amount_month_ratio_var,hist_amount_month_ratio_skew,hist_category_2_mean_mean,hist_category_2_min_mean,hist_category_2_max_mean,hist_category_2_sum_mean,hist_category_3_mean_mean,hist_category_3_min_mean,hist_category_3_max_mean,hist_category_3_sum_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_purchase_date_uptomin,new_authorized_flag_mode,new_authorized_flag_sum,new_authorized_flag_mean,new_card_id_size,new_city_id_mode,new_city_id_nunique,new_category_1_mode,new_category_1_sum,new_category_1_mean,new_installments_mode,new_installments_sum,new_installments_mean,new_installments_var,new_installments_max,new_installments_min,new_installments_null_cnt,new_category_3_mode,new_category_3_mean,new_merchant_category_id_mode,new_merchant_category_id_nunique,new_merchant_id_nunique,new_month_lag_mode,new_month_lag_sum,new_month_lag_mean,new_month_lag_var,new_month_lag_max,new_month_lag_min,new_month_lag_skew,new_purchase_date_max,new_purchase_date_min,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_var,new_purchase_amount_max,new_p
urchase_amount_min,new_purchase_amount_skew,new_purchase_amount_over_550,new_category_2_mode,new_category_2_mean,new_state_id_mode,new_state_id_nunique,new_subsector_id_mode,new_subsector_id_nunique,new_purchase_amount_trim_sum,new_purchase_amount_trim_mean,new_purchase_amount_trim_var,new_purchase_amount_trim_max,new_purchase_amount_trim_min,new_purchase_amount_trim_skew,new_purchase_year_mode,new_purchase_year_nunique,new_purchase_year_mean,new_purchase_year_max,new_purchase_year_min,new_purchase_month_mode,new_purchase_month_nunique,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_day_mode,new_purchase_day_nunique,new_purchase_day_mean,new_purchase_day_var,new_purchase_day_max,new_purchase_day_min,new_purchase_day_skew,new_purchase_hour_mode,new_purchase_hour_nunique,new_purchase_hour_mean,new_purchase_hour_var,new_purchase_hour_max,new_purchase_hour_min,new_purchase_hour_skew,new_purchase_dayofweek_mode,new_purchase_dayofweek_nunique,new_purchase_dayofweek_mean,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_weekofyear_mode,new_purchase_weekofyear_nunique,new_purchase_weekofyear_mean,new_purchase_weekofyear_max,new_purchase_weekofyear_min,new_purchase_weekend_mode,new_purchase_weekend_sum,new_purchase_weekend_mean,new_price_sum,new_price_mean,new_price_max,new_price_min,new_price_var,new_month_diff_max,new_month_diff_min,new_month_diff_mean,new_month_diff_var,new_month_diff_skew,new_Christmas_Day_2017_mean,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Mothers_Day_2018_mean,new_duration_mean,new_duration_min,new_duration_max,new_duration_var,new_duration_skew,new_amount_month_ratio_mean,new_amount_month_ratio_min,new_amount_month_ratio_max,new_amount_month_ratio_var,new_amount_month_ratio_skew,new_category_2_mean_mean,new_category_3_mean_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_purchase_date_uptomin,hist_first_buy,hist_last_buy,new_first_buy,new_last_buy,ca
rd_id_total_size,card_id_size_ratio,purchase_amount_total,purchase_amount_mean,purchase_amount_max,purchase_amount_min,purchase_amount_ratio,month_diff_mean,month_diff_ratio,month_lag_mean,month_lag_max,month_lag_min,category_1_mean,installments_total,installments_mean,installments_max,installments_ratio,price_total,price_mean,price_max,duration_mean,duration_min,duration_max,amount_month_ratio_mean,amount_month_ratio_min,amount_month_ratio_max,new_CLV,hist_CLV,CLV_ratio
0,C_ID_92a2005557,5,2,1,6,0.0,-0.820283,2017-06-01,2017,333,17318.0,1,247.0,0.95,260,69,7,0,0.0,0.0,0.0,4.0,0.01538,0.015205,1.0,0.0,0.0,0,0.01538,560,41,94,-2,-1017.0,-3.912,5.75,0,-8,0.066,1519551000.0,1498573000.0,-165.96873,-0.638341,0.045003,2.258395,-0.7393,10.24,0.0,1.0,1.046,9,3,34,21,-167.4,-0.644,0.02057,0.8,-0.7393,5.133,2017,2,2017.0,2018,2017,12,9,8.055,12,1,11,31,15.51,76.9,31,1,0.10236,14,23,13.31,24.69,23,0,-0.887,5,7,3.21,6,0,50,35,33.06,52,1,0,90.0,0.3462,,,inf,-inf,,3,2,2.072,0.068,3.299,13.125,0.0,6.266,26.77,0.0,16.47,7.754,-1.325042,-2.201,4.516789,0.216691,7.734,-0.311,-0.3696,1.129197,0.011823,9.35,0.0725,-0.747,5942464.5,1309718.6,0.3467,-0.747,5920398.5,5429670.5,242,0.9307,346,589,1.0,23.0,1.0,23.0,69.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278.0,14.0,23.0,1.0,34.0,1.479,0.261,2.0,1.0,0.09326,1525001000.0,1520259000.0,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.895996,0.0,1.0,1.0,9.0,1.0,37.0,10.0,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.896,2018.0,1.0,2018.0,2018.0,2018.0,3.0,2.0,3.479,4.0,3.0,6.0,17.0,16.44,88.8,31.0,5.0,0.3389,13.0,8.0,12.87,4.21,16.0,8.0,-0.6035,4.0,7.0,3.13,6.0,0.0,13.0,7.0,13.305,17.0,10.0,0.0,6.0,0.261,-inf,-inf,-inf,-inf,,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,41.75,-1.151,-1.449,-0.5923,0.0737,0.895996,-0.2878,-0.3623,-0.1481,0.00461,0.895996,-0.5503,-0.593,54.0,2.348,283.0,338.0,26,269,277.0,332.0,283.0,0.088462,-179.20873,-1.214041,1.962295,-1.4639,0.079774,4.072,0.965251,-2.433,2.0,-7.0,0.0,4.0,0.01538,1.0,0.0,-44.802183,-78.936365,1.962295,-2.476042,-3.65,3.924489,-0.5988,-0.7319,0.981097,-152.26,-20826.191988,0.007311
1,C_ID_3d0044924f,4,1,0,1,0.0,0.392913,2017-01-01,2017,484,17167.0,1,339.0,0.9688,350,69,9,0,31.0,0.08856,1.0,545.0,1.566,2.258,10.0,1.0,2.0,1,1.2,307,57,142,0,-1761.0,-5.03,14.48,0,-12,-0.258,1517438000.0,1483720000.0,-210.00633,-0.600018,0.1482,4.6303,-0.742,8.81,0.0,1.0,1.0,9,3,34,24,-215.4,-0.615,0.0586,0.8,-0.742,3.744,2017,2,2017.0,2018,2017,1,12,6.22,12,1,19,31,16.67,77.4,31,1,-0.2357,12,24,14.72,31.16,23,0,-0.8936,5,7,3.363,6,0,3,50,25.22,52,1,0,132.0,0.3772,-200.113283,-0.575038,2.31515,-0.7424,0.082442,4,2,3.092,0.1005,2.0,10.65,8.734,12.555,13.79,6.57,9.836,0.0,-1.853766,-2.951,13.890898,1.405803,8.25,-0.196,-0.3667,1.543433,0.016649,8.555,0.0746,-0.747,6010604.0,1329550.4,-0.2952,-0.747,156963.11,-3653208.0,390,1.114,370,761,1.0,6.0,1.0,6.0,69.0,1.0,0.0,0.0,0.0,1.0,6.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,307.0,5.0,6.0,1.0,9.0,1.5,0.3,2.0,1.0,0.0,1522393000.0,1517505000.0,-4.355,-0.726,0.000207,-0.7017,-0.7393,1.255859,0.0,1.0,1.0,9.0,1.0,19.0,4.0,-4.355,-0.726,0.000207,-0.7017,-0.7393,1.256,2018.0,1.0,2018.0,2018.0,2018.0,2.0,2.0,2.5,3.0,2.0,5.0,4.0,13.5,131.5,30.0,1.0,0.37,17.0,5.0,11.164,24.56,17.0,6.0,0.3833,0.0,4.0,1.5,4.0,0.0,6.0,4.0,9.0,13.0,5.0,0.0,0.0,0.0,-4.355735,-0.725956,-0.701858,-0.73941,0.000205,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,56.84,-2.178,-2.219,-2.105,0.001841,1.180664,-0.242,-0.2465,-0.234,2.3e-05,1.151367,-0.5503,-0.6064,56.0,9.336,313.0,370.0,5,395,396.0,453.0,356.0,0.017143,-214.36133,-1.326018,3.9286,-1.4813,0.020737,6.092,0.970246,-3.53,2.0,-11.0,0.08856,551.0,2.566,11.0,0.011009,-0.389041,-0.516765,0.357145,-4.031766,-5.17,11.785898,-0.438,-0.6132,1.309433,-8.71,-23771.738519,0.000366
2,C_ID_d639edf6cd,2,2,0,8,0.0,0.688056,2016-08-01,2016,637,17014.0,1,41.0,0.9536,43,143,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,705,8,13,-10,-370.0,-8.6,14.766,0,-13,0.7256,1519759000.0,1484123000.0,-29.16739,-0.678311,0.007635,-0.145847,-0.73,5.625,0.0,5.0,4.63,5,2,33,7,-29.17,-0.678,0.007637,-0.1459,-0.73,5.62,2017,2,2017.0,2018,2017,1,10,4.56,12,1,21,19,19.33,62.3,30,2,-0.7563,19,14,17.9,12.375,23,8,-0.8867,4,7,3.303,6,0,4,22,18.38,49,2,0,11.0,0.2559,-inf,-inf,-inf,-inf,,3,2,2.07,0.06647,3.5,9.67,17.0,8.91,7.51,17.66,8.91,1.721,-1.406243,-2.182,-0.291695,0.070888,0.3125,-0.3308,-0.365,-0.072924,0.002562,3.719,-0.0878,-0.747,654527.7,-229313.4,0.3584,-0.747,6010604.0,5588325.0,412,9.58,344,756,1.0,1.0,1.0,1.0,143.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,528.0,1.0,1.0,2.0,2.0,2.0,,2.0,2.0,,1524937000.0,1524937000.0,-0.7,-0.7,,-0.7,-0.7,,0.0,5.0,5.0,5.0,1.0,25.0,1.0,-0.7,-0.7,,-0.7,-0.7,,2018.0,1.0,2018.0,2018.0,2018.0,4.0,1.0,4.0,4.0,4.0,28.0,1.0,28.0,,28.0,28.0,,17.0,1.0,17.0,,17.0,17.0,,5.0,1.0,5.0,5.0,5.0,17.0,1.0,17.0,17.0,17.0,1.0,1.0,1.0,-inf,-inf,-inf,-inf,,2.0,2.0,2.0,,,0.0,0.0,0.0,14.0,-1.4,-1.4,-1.4,,,-0.35,-0.35,-0.35,,,-0.549,-0.593,0.0,0.0,284.0,284.0,163,575,635.0,635.0,44.0,0.023256,-29.86739,-1.378311,-0.845847,-1.43,0.023999,4.07,0.966184,-6.6,2.0,-11.0,0.0,0.0,0.0,0.0,,-inf,-inf,-inf,-2.806243,-3.582,-1.691695,-0.6808,-0.715,-0.422924,-0.35,-605.892643,0.000578


In [15]:
# Check the resulting dimensions on a copy: additional_features() writes into
# the frame it receives, so calling it on `result` here would modify the data
# ahead of the real call in the next cell. The copy costs one extra frame of
# memory for the duration of this cell.
additional_features(result.copy()).shape

(325540, 285)

In [16]:
# Apply the derived features for real. additional_features() modifies the
# frame it is given and returns that same frame, so this rebinding keeps
# `result` pointing at the enriched table.
result = additional_features(result)

In [17]:
# Rows that carry a target value form the training set; rows without one
# form the test set.
train_df = result.loc[result['target'].notna()]
test_df = result.loc[result['target'].isna()]

In [18]:
# Write both splits with card_id as the CSV index column.
for split_df, out_path in (
    (train_df, './data/train_v4.csv'),
    (test_df, './data/test_v4.csv'),
):
    split_df.set_index('card_id').to_csv(out_path)