In [2]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import gc

from sklearn.model_selection import KFold, TimeSeriesSplit, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import datetime

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Directory prefix prepended to the CSV file names passed to pd.read_csv.
address = '../../input/'

In [4]:
# Read the four raw tables (transaction + identity, for train and for test)
# in a single pass; the unpacking order matches the file-name order.
train_transaction, train_identity, test_transaction, test_identity = (
    pd.read_csv(address + file_name)
    for file_name in (
        'train_transaction.csv',
        'train_identity.csv',
        'test_transaction.csv',
        'test_identity.csv',
    )
)

def display_importances(feature_importance_df_):
    """Draw a horizontal bar plot of the 40 most important features.

    Features are ranked by their mean ``importance`` over all rows that share
    the same ``feature`` value; every row belonging to one of the 40
    top-ranked features is then handed to ``sns.barplot``.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Must contain a ``feature`` column and a numeric ``importance`` column.
        Presumably holds one row per feature per CV fold (the plot title says
        "avg over folds") -- confirm against the caller.

    Returns
    -------
    None
        The figure is created on the current matplotlib state machine.
    """
    mean_importance = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    cols = mean_importance.sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    # tight_layout has to be *called*; a bare attribute reference does nothing.
    plt.tight_layout()
    #plt.savefig('lgbm_importances01.png')

# Attach the identity columns to every transaction. It is a left join, so
# transactions without an identity row keep NaN in the identity columns.
train = train_transaction.merge(train_identity, on='TransactionID', how='left')
test = test_transaction.merge(test_identity, on='TransactionID', how='left')

# Snapshot of the test TransactionIDs, taken before the rows are re-sorted below.
test_id = test[['TransactionID']].copy()

# The raw tables are no longer needed once merged.
del train_transaction, train_identity, test_transaction, test_identity
gc.collect()

# Order both frames by ascending TransactionDT.
train = train.sort_values(by='TransactionDT', ascending=True)
test = test.sort_values(by='TransactionDT', ascending=True)

# Stack train on top of test so features are computed over both at once.
# train_shape records how many leading rows belong to train.
train_shape = len(train)
target = train['isFraud']
data = pd.concat([train, test], axis=0)
del train, test
gc.collect()

print('work on TransactionDT')

# TransactionDT is interpreted as seconds elapsed since START_DATE.
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
data['TransactionDT_date'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
data['hour'] = data['TransactionDT_date'].dt.hour
# NOTE(review): hour_sin is derived from D9, not from the `hour` column computed
# just above. The author's note says D9 is the hour -- confirm D9's scale
# (0-23 vs. a fraction of a day) before relying on this feature.
data['hour_sin'] = np.sin(data['D9']*2*np.pi/24)

# `Series.dt.week` is deprecated and absent from pandas 2.x. Use the ISO
# calendar week where available and fall back to the legacy accessor on older
# pandas. The cast keeps the plain int64 dtype the legacy accessor returned.
date_accessor = data['TransactionDT_date'].dt
if hasattr(date_accessor, 'isocalendar'):
    data['week'] = date_accessor.isocalendar().week.astype('int64')
else:
    data['week'] = date_accessor.week
del date_accessor
# Cyclical encoding of the week number on a 52-week circle.
data['week_sin']=np.sin(data['week']%52*2*np.pi/52)
data['week_cos']=np.cos(data['week']%52*2*np.pi/52)

data['month'] = data['TransactionDT_date'].dt.month

# Day number since START_DATE, rounded up (86400 seconds per day).
data['DaysFromStart'] = np.ceil(data['TransactionDT']/(60*60*24))
# Day number minus D1.
data['life_of_customer'] = data['DaysFromStart'] - data['D1']



# The helper datetime column and D9 are not kept as features.
del data['TransactionDT_date'], data['D9']
gc.collect()

del data['V305']
gc.collect()


# First character of addr1's string form, as an integer. A missing addr1
# stringifies to 'nan', so its leading 'n' is mapped to bucket 6.
addr1_text = data['addr1'].apply(str)
addr1_leading_char = addr1_text.apply(lambda text: text[0]).replace({'n': '6'})
data['addr1_broad_area'] = addr1_leading_char.apply(int)
del addr1_text, addr1_leading_char

# Progressively finer composite keys: each uidN extends the previous key with
# more columns, joined by underscores. Missing values become the text 'nan'.
data['uid'] = data['card1'].astype(str) + '_' + data['card2'].astype(str)
uid_extensions = [
    ('uid2', 'uid', ['card3', 'card5']),
    ('uid3', 'uid2', ['addr1', 'addr2']),
    ('uid4', 'uid3', ['card4', 'card6']),
    ('uid5', 'uid4', ['M4']),
    ('uid6', 'uid5', ['P_emaildomain']),
]
for new_key, base_key, extra_cols in uid_extensions:
    combined = data[base_key].astype(str)
    for extra in extra_cols:
        combined = combined + '_' + data[extra].astype(str)
    data[new_key] = combined
del combined

# Mean and standard deviation of TransactionAmt per group, for every grouping
# key, merged back as <key>_TransactionAmt_mean / <key>_TransactionAmt_std.
amount_group_keys = ['card1', 'card2', 'card3', 'card5',
                     'uid', 'uid2', 'uid3', 'uid4', 'uid5', 'uid6',
                     'hour', 'week']
for group_key in amount_group_keys:
    amount_stats = data.groupby(group_key)['TransactionAmt'].agg(['mean', 'std'])
    amount_stats.columns = pd.Index([group_key + '_TransactionAmt_' + stat for stat in amount_stats.columns])
    amount_stats = amount_stats.reset_index()
    data = data.merge(amount_stats, how='left', on=group_key)
    del amount_stats
    gc.collect()
    
# M5 and M6 contain missing values. Stringifying first turns NaN into the
# literal 'nan', so the encoder treats "missing" as a category of its own.
le = LabelEncoder()
for flag_col in ('M5', 'M6'):
    flag_as_text = data[flag_col].apply(str)
    data[flag_col] = le.fit_transform(flag_as_text)
del flag_as_text

# Composite identity key: uid4, P_emaildomain and life_of_customer joined by '_'.
identity_parts = [data[part].astype(str) for part in ('uid4', 'P_emaildomain', 'life_of_customer')]
data['new_identity'] = identity_parts[0] + '_' + identity_parts[1] + '_' + identity_parts[2]
del identity_parts

print('fill out D2 information')
# DaysFromStart - D2 is treated as a per-identity constant. Where D2 is
# missing it is back-filled from other rows of the same identity.
data['D2_constant'] = data['DaysFromStart'] - data['D2']

i_cols = ['D2']
# Author's note: the missing rate of D2 drops from 46.98% to 35.53%.
# Approach adapted from https://www.kaggle.com/kyakovlev/ieee-card1-card6
for col in i_cols:
    col1 = col+'_constant'
    # How often each constant value occurs within each identity.
    temp_df = data.groupby(['new_identity',col1])[col1].agg(['count']).reset_index()
    # Descending sort puts each identity's most frequent constant first.
    temp_df = temp_df.sort_values(by=['new_identity','count'], ascending=False).reset_index(drop=True)
    del temp_df['count']
    # Keep exactly one row per identity -- the most frequent constant.
    # De-duplicating on *all* columns would remove nothing (each
    # identity/constant pair is already unique after the groupby), and the
    # to_dict() below would then keep the LAST, i.e. least frequent, value.
    temp_df = temp_df.drop_duplicates(subset=['new_identity'], keep='first').reset_index(drop=True)
    temp_df.index = temp_df['new_identity'].values
    temp_df = temp_df[col1].to_dict()
    # Only rows whose constant is missing receive the identity-level value.
    data[col1] = np.where(data[col1].isna(), data['new_identity'].map(temp_df), data[col1])
    data[col] = data['DaysFromStart'] - data[col1]
    # A time difference should not be negative; treat such values as missing.
    data[col] = np.where(data[col]<0, np.nan, data[col])
    del temp_df, data[col1]
    gc.collect()

# Row-to-row change of D1 within each identity, and of D10 / D15 within each
# (identity, ProductCD) pair.
data['D1_diff'] = data.groupby(['new_identity'])['D1'].diff()

for d_col in ('D10', 'D15'):
    data[d_col + '_diff'] = data.groupby(['new_identity', 'ProductCD'])[d_col].diff()

# Level 1: per (identity, addr1, addr2) means -> new_identity_<col>_mean.
identity_addr_keys = ['new_identity', 'addr1', 'addr2']
identity_mean_cols = ['M5', 'M6', 'V315', 'D1_diff', 'D3', 'D10_diff', 'D15_diff']
identity_means = data.groupby(identity_addr_keys).agg({c: ['mean'] for c in identity_mean_cols})
identity_means.columns = pd.Index(['new_identity_' + '_'.join(pair) for pair in identity_means.columns])
identity_means = identity_means.reset_index()

# Level 2: mean / std of those identity means per (addr1, addr2).
# D3 is averaged per identity above but is not rolled up to address level.
addr_keys = ['addr1', 'addr2']
addr_rollup_cols = ['new_identity_' + c + '_mean'
                    for c in ('M5', 'M6', 'V315', 'D1_diff', 'D10_diff', 'D15_diff')]
addr_stats = identity_means.groupby(addr_keys).agg({c: ['mean', 'std'] for c in addr_rollup_cols})
addr_stats.columns = pd.Index(['addr1_addr2_' + '_'.join(pair) for pair in addr_stats.columns])
addr_stats = addr_stats.reset_index()

data = data.merge(identity_means, on=identity_addr_keys, how='left')
data = data.merge(addr_stats, on=addr_keys, how='left')

# Mean TransactionAmt per (identity, ProductCD).
identity_product_keys = ['new_identity', 'ProductCD']
product_amount = data.groupby(identity_product_keys).agg({'TransactionAmt': ['mean']})
product_amount.columns = pd.Index(['new_identity_ProductCD_' + '_'.join(pair) for pair in product_amount.columns])
product_amount = product_amount.reset_index()
data = data.merge(product_amount, on=identity_product_keys, how='left')

# The helper key and the day counter are not kept as features.
del data['new_identity'], data['DaysFromStart']
del identity_means, addr_stats, product_amount
gc.collect()

#data['R_emaildomain'] = data.R_emaildomain.replace('gmail', 'gmail.com')
#data['P_emaildomain'] = data.P_emaildomain.replace('gmail', 'gmail.com')

# Mean / std of selected columns per uid key, merged back onto every row as
# <key>_<column>_<stat>. The specs are processed in order: uid6, then uid3.
uid_stat_specs = [
    ('uid6', ['C1', 'V54', 'V281', 'C11', 'D4', 'V67', 'V320', 'M5', 'M6']),
    ('uid3', ['V67', 'V83', 'D4']),
]
for uid_key, value_cols in uid_stat_specs:
    uid_stats = data.groupby(uid_key).agg({c: ['mean', 'std'] for c in value_cols})
    uid_stats.columns = pd.Index([uid_key + '_' + pair[0] + '_' + pair[1] for pair in uid_stats.columns.tolist()])
    uid_stats = uid_stats.reset_index()
    data = data.merge(uid_stats, on=uid_key, how='left')
    del uid_stats
    gc.collect()

# Combined address key; it only exists to be frequency-encoded below.
data['addr'] = data['addr1'].astype(str) + '_' + data['addr2'].astype(str)

# Frequency encoding: each listed column is replaced by <col>_fq_enc, the
# number of rows in `data` sharing that value. The source column is dropped.
freq_encoded_cols = ['uid', 'uid2', 'uid3', 'uid4', 'uid5', 'uid6',
                     'card4', 'card6', 'ProductCD', 'M4', 'addr']
for source_col in freq_encoded_cols:
    occurrences = data[source_col].value_counts().to_dict()
    encoded_name = source_col + '_fq_enc'
    data[encoded_name] = data[source_col].map(occurrences)
    del data[source_col]
    gc.collect()
   
# Bucket label -> substring that identifies the bucket in a lower-cased
# DeviceInfo value.
device_match_dict = {
    'sm': 'samsung',
    'huawei': 'huawei',
    'moto': 'moto',
    'rv': 'rv:',
    'trident': 'trident',
    'lg': 'lg-',
    'htc': 'htc',
    'blade': 'blade',
    'windows': 'windows',
    'lenovo': 'lenovo',
    'linux': 'linux',
    'f3': 'f3',
    'f5': 'f5',
    'unknown_info': 'unknown_info',
}


def _bucket_device(raw_value):
    """Map one raw DeviceInfo value to a bucket label.

    The value is stringified and lower-cased, then every pattern is tried in
    dictionary order. There is deliberately no early exit, so a later pattern
    may still match the label an earlier pattern produced. Anything that does
    not end up as a bucket label becomes 'other_d_type'; that includes
    missing values, which stringify to 'nan'.
    """
    label = str(raw_value).lower()
    for bucket, needle in device_match_dict.items():
        if needle in label:
            label = bucket
    if label in device_match_dict:
        return label
    return 'other_d_type'


data['DeviceInfo'] = data['DeviceInfo'].apply(_bucket_device)

print('work on categorical features')
# Integer-encode every remaining object-dtype column except the id and the
# target. Values are stringified first because the columns contain missing
# values, which then become the category 'nan'.
le = LabelEncoder()
not_encoded = ('TransactionID', 'isFraud')
object_cols = [name for name in data.columns
               if name not in not_encoded and data[name].dtype == 'O']
for name in object_cols:
    data[name] = le.fit_transform(data[name].apply(str))
    
# Mean / std of selected V columns per grouping key, merged back onto every
# row as <key>_<column>_<stat>.
v_stat_specs = [
    ('R_emaildomain', ['V118', 'V119']),
    ('card1', ['V20', 'V151', 'V67']),
    ('hour', ['V116']),
]
for group_key, value_cols in v_stat_specs:
    group_stats = data.groupby(group_key).agg({c: ['mean', 'std'] for c in value_cols})
    group_stats.columns = pd.Index([group_key + '_' + pair[0] + '_' + pair[1] for pair in group_stats.columns.tolist()])
    group_stats = group_stats.reset_index()
    data = data.merge(group_stats, on=group_key, how='left')
    del group_stats
    gc.collect()

# Columns removed before the feature list is built.
dropped_cols = ['id_31', 'id_19', 'id_20', 'V316', 'id_12',
                'V306', 'V220', 'id_04', 'id_05']
for dropped in dropped_cols:
    del data[dropped]
gc.collect()

# Every remaining column except the id and the target is a feature.
non_feature_cols = ('TransactionID', 'isFraud')
feats = [name for name in data.columns if name not in non_feature_cols]

#feats = [col for col in feats if col not in ['month_sin', 'month_cos']]

# The first train_shape rows are the train rows; the rest are the test rows.
train = data[:train_shape][feats]
test = data[train_shape:][feats]

work on TransactionDT
fill out D2 information
work on categorical features


In [5]:
# Completion marker printed once the preceding cells have run.
print('done')

done


In [7]:
# Show up to 500 columns when a DataFrame is displayed. The fully qualified
# key is required: the short form 'max_columns' is ambiguous in newer pandas
# (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', 500)

In [None]:
# Load the train-side FE011 feature table from parquet.
tr011 = pd.read_parquet('../../data/train_FE011.parquet')

In [21]:
# Load the test-side FE011 feature table from parquet.
te011 = pd.read_parquet('../../data/test_FE011.parquet')

In [16]:
# Number of columns in `train` that FE011 does not already contain.
sum(1 for column in train.columns if column not in tr011.columns)

77

In [19]:
# (rows, columns) of the FE011 train table before the new columns are added.
tr011.shape

(590540, 1482)

In [22]:
# Append only the columns FE011 does not already have.
# NOTE(review): a column-wise concat aligns rows on index labels, not on
# position -- confirm that tr011/te011 carry the same index (and row order)
# as train/test, otherwise rows are silently mismatched or padded with NaN.
new_train_cols = [column for column in train.columns if column not in tr011.columns]
new_test_cols = [column for column in test.columns if column not in te011.columns]
tr012 = pd.concat([tr011, train[new_train_cols]], axis=1)
te012 = pd.concat([te011, test[new_test_cols]], axis=1)

In [23]:
# (rows, columns) of the combined train table.
tr012.shape

(590540, 1559)

In [24]:
# (rows, columns) of the combined test table.
te012.shape

(506691, 1559)

In [27]:
# Persist the combined FE012 tables, train first, then test.
for frame, destination in (
    (tr012, '../../data/train_FE012.parquet'),
    (te012, '../../data/test_FE012.parquet'),
):
    frame.to_parquet(destination)

In [28]:
# Names of the columns in `train` that FE011 does not already contain.
list(filter(lambda column: column not in tr011.columns, train.columns))

['hour',
 'hour_sin',
 'week',
 'week_sin',
 'week_cos',
 'month',
 'life_of_customer',
 'addr1_broad_area',
 'uid6_TransactionAmt_mean',
 'uid6_TransactionAmt_std',
 'hour_TransactionAmt_mean',
 'hour_TransactionAmt_std',
 'week_TransactionAmt_mean',
 'week_TransactionAmt_std',
 'D1_diff',
 'D10_diff',
 'D15_diff',
 'new_identity_M5_mean',
 'new_identity_M6_mean',
 'new_identity_V315_mean',
 'new_identity_D1_diff_mean',
 'new_identity_D3_mean',
 'new_identity_D10_diff_mean',
 'new_identity_D15_diff_mean',
 'addr1_addr2_new_identity_M5_mean_mean',
 'addr1_addr2_new_identity_M5_mean_std',
 'addr1_addr2_new_identity_M6_mean_mean',
 'addr1_addr2_new_identity_M6_mean_std',
 'addr1_addr2_new_identity_V315_mean_mean',
 'addr1_addr2_new_identity_V315_mean_std',
 'addr1_addr2_new_identity_D1_diff_mean_mean',
 'addr1_addr2_new_identity_D1_diff_mean_std',
 'addr1_addr2_new_identity_D10_diff_mean_mean',
 'addr1_addr2_new_identity_D10_diff_mean_std',
 'addr1_addr2_new_identity_D15_diff_mean_mean',