In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import skew, kurtosis
import glob
import os

### Load data and do some preprocessing

In [2]:
custinfo = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/public_train_x_custinfo_full_hashed.csv')
ccba = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/public_train_x_ccba_full_hashed.csv')

cdtx = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/public_train_x_cdtx0001_full_hashed.csv')

dp = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/public_train_x_dp_full_hashed.csv')
dp.tx_amt.fillna(0, inplace=True)
dp['amt_ntd'] = dp['tx_amt']*dp['exchg_rate']
dp.rename(columns={'tx_date': 'date'}, inplace=True)
dp['txbranch'] = [0 if pd.isna(x) else 1 for x in dp.txbranch.values]

remit = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/public_train_x_remit1_full_hashed.csv')
remit.rename(columns={'trans_date': 'date'}, inplace=True)

alert_pub = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/public_x_alert_date.csv')
alert_train = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/train_x_alert_date.csv')
alert =pd.concat([alert_pub, alert_train])

train_label = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/train_y_answer.csv')
pub_label = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/24_ESun_public_y_answer.csv')
label = pd.concat([train_label, pub_label])

# combine background infos with alert key infos
months = ccba.byymm.unique()
def ak_month(x):
    """define which month alert key belongs to"""
    return months[months<=x][-1]
temp = pd.merge(alert, label, on="alert_key")
alert_key_info = pd.merge(custinfo, temp, on="alert_key")
alert_key_info = alert_key_info.sort_values(by=['date'])
alert_key_info['alert_month'] = [ak_month(x) for x in alert_key_info.date.values]
alert_key_info.to_csv('./alert_key_info.csv', index=False)

alert_key_info = pd.read_csv('./alert_key_info.csv')

In [3]:
alert_key_info

Unnamed: 0,alert_key,cust_id,risk_rank,occupation_code,total_asset,AGE,date,sar_flag,alert_month
0,171494,7a5bb395f798e329689984d41e3e4848ace9c24c52f380...,1,4.0,34467.0,4,0,0,0
1,171209,95836236dac30345eaf79b450af364b869724d46adc7cd...,3,12.0,1195038.0,3,0,0,0
2,171320,98e0495b6af0455e2d2e0492a6988b3719857d505028c3...,1,12.0,1038.0,6,0,0,0
3,171324,a7c9713806c471d644cd8216d4a943be6f4048e0d8a59e...,3,19.0,14204286.0,9,0,0,0
4,171357,e19835a949c31f3c41584668beeaf1159cd15a2f74c352...,1,19.0,104.0,5,0,0,0
...,...,...,...,...,...,...,...,...,...
25746,364624,12da1653493fda99b09d5ca125fbb1d58fa514da8d8f8b...,1,19.0,0.0,5,393,0,365
25747,364612,b0a578468b54394a9e61f2adb6abbe41fff7e7efb0898f...,1,16.0,262895.0,4,393,0,365
25748,364610,92ce2f8b5af6ad7a6da43a416c801c2548784aeaa48aed...,1,12.0,29867.0,4,393,0,365
25749,364666,e01690825649486bf76cfb69ff32c972eee57e072c2331...,1,9.0,0.0,3,393,0,365


In [4]:
eps = 1e-20

def dp_trans_type(x):
    """define trans type based on cross_bank, txbranch, ATM columns"""
    if np.array_equal(x, [0,0,1]):
        return 0 # withdraw
    elif np.array_equal(x, [0,0,0]):
        return 1 # online pay
    elif np.array_equal(x, [0,1,1]):
        return 2 # intra-bank trans
    elif np.array_equal(x, [1,1,1]):
        return 3 # inter-bank trans

def dp_tx_type(x):
    """define if it is '臨櫃現金交易' """
    return np.array_equal(x, [1,12])+0

def count_likely_trans(trans, thres=0.05): 
    """return the max count of transactions with difference smaller than threshold ratio"""
    max_cnt = 0
    trans = (trans//100)*100
    unq_trans, unq_cnt = np.unique(trans, return_counts=True)
    for i in range(len(unq_trans)):
        base = unq_trans[i]
        r = np.abs((unq_trans / base)-1)
        cnt = sum(unq_cnt[r<=thres])-1
        if cnt > max_cnt:
            max_cnt = cnt
    return max_cnt

### Define cdtx features

In [5]:
cdtx_keys = ['cdtx_n_country', 'cdtx_n_country_switch', 'cdtx_max_country', 'cdtx_n_foreign', 'cdtx_n_foreign_switch', 'cdtx_n_cur', 'cdtx_n_cur_switch', 'cdtx_n_forcur',\
             'cdtx_n_forcur_switch', 'cdtx_tx_sum', 'cdtx_likely_trans_05', 'cdtx_likely_trans_10', 'cdtx_tx_num']
def feat_cdtx(start, end, data_c):
    data = data_c[(data_c.date > start) & (data_c.date <= end)]
    if len(data)==0:
        return {k: 0 for k in cdtx_keys}
    data['binary_country'] = (data.country==130) + 0
    data['binary_currency'] = (data.cur_type==47) + 0
    cdtx_feat = dict()

    # transaction country
    cdtx_feat['cdtx_n_country'] = len(set(data.country))
    cdtx_feat['cdtx_n_country_switch'] = sum(data.country.diff()!=0)
    cdtx_feat['cdtx_max_country'] = data.country.value_counts().max()
    cdtx_feat['cdtx_n_foreign'] = sum(data.binary_country==0)
    cdtx_feat['cdtx_n_foreign_switch'] = sum(data.binary_country.diff()!=0)
    # transaction currency
    cdtx_feat['cdtx_n_cur'] = len(set(data.cur_type))
    cdtx_feat['cdtx_n_cur_switch'] = sum(data.cur_type.diff()!=0)
    cdtx_feat['cdtx_n_forcur'] = sum(data.binary_currency==0)
    cdtx_feat['cdtx_n_forcur_switch'] = sum(data.binary_currency.diff()!=0)
    # transaction amount
    cdtx_feat['cdtx_tx_sum'] = np.log(data.amt.sum()+eps)
    cdtx_feat['cdtx_likely_trans_05'] = count_likely_trans(data.amt.values, 0.05)
    cdtx_feat['cdtx_likely_trans_10'] = count_likely_trans(data.amt.values, 0.10)
    # num transaction
    cdtx_feat['cdtx_tx_num'] = len(data)

    return cdtx_feat

### Define dp features

In [6]:
dp_keys = ['dp_trans0', 'dp_trans1', 'dp_trans2', 'dp_trans3', 'dp_tx0', 'dp_tx1', 'dp_CR_sum', 'dp_CR_likely_trans_05', 'dp_CR_likely_trans_10', 'dp_DB_sum', 'dp_DB_likely_trans_05', 'dp_DB_likely_trans_10', \
           'dp_tx_sum', 'dp_tx_likely_trans_05', 'dp_tx_likely_trans_10', 'dp_neg_sum', 'dp_n_CR', 'dp_n_DB', 'dp_n_tx', 'dp_n_neg']
def feat_dp(start, end, data_c):
    data = data_c[(data_c.date > start) & (data_c.date <= end)]
    if len(data)==0:
        return {k: 0 for k in dp_keys}
    trans_types = np.array([dp_trans_type(x) for x in dp[['cross_bank', 'txbranch', 'ATM']]])
    tx_types = np.array([dp_tx_type(x) for x in dp[['tx_type', 'info_asset_code']]])
    dp_feat = dict()

    # transaction type
    dp_feat['dp_trans0'] = sum(trans_types==0)
    dp_feat['dp_trans1'] = sum(trans_types==1)
    dp_feat['dp_trans2'] = sum(trans_types==2)
    dp_feat['dp_trans3'] = sum(trans_types==3)
    # tx type
    dp_feat['dp_tx0'] = sum(tx_types==0)
    dp_feat['dp_tx1'] = sum(tx_types==1)

    # transaction amount
    data_pos = data[data.amt_ntd>=0]
    data_neg = data[data.amt_ntd<0]
    dp_feat['dp_CR_sum'] = np.log(data_pos[data_pos.debit_credit=='CR'].amt_ntd.sum()+eps)
    dp_feat['dp_CR_likely_trans_05'] = count_likely_trans(data_pos[data_pos.debit_credit=='CR'].amt_ntd.values, 0.05)
    dp_feat['dp_CR_likely_trans_10'] = count_likely_trans(data_pos[data_pos.debit_credit=='CR'].amt_ntd.values, 0.10)
    dp_feat['dp_DB_sum'] = np.log(data_pos[data_pos.debit_credit=='DB'].amt_ntd.sum()+eps)
    dp_feat['dp_DB_likely_trans_05'] = count_likely_trans(data_pos[data_pos.debit_credit=='DB'].amt_ntd.values, 0.05)
    dp_feat['dp_DB_likely_trans_10'] = count_likely_trans(data_pos[data_pos.debit_credit=='DB'].amt_ntd.values, 0.10)
    dp_feat['dp_tx_sum'] = np.log(data_pos.amt_ntd.sum()+eps)
    dp_feat['dp_tx_likely_trans_05'] = count_likely_trans(data_pos.amt_ntd.values, 0.05)
    dp_feat['dp_tx_likely_trans_10'] = count_likely_trans(data_pos.amt_ntd.values, 0.10)
    dp_feat['dp_neg_sum'] = np.log(np.abs(data_neg.amt_ntd.sum())+eps)
    # num transaction
    dp_feat['dp_n_CR'] = len(data_pos[data_pos.debit_credit=='CR'])
    dp_feat['dp_n_DB'] = len(data_pos[data_pos.debit_credit=='DB'])
    dp_feat['dp_n_tx'] = len(data_pos)
    dp_feat['dp_n_neg'] = len(data_neg)
    return dp_feat

### Define remit features

In [7]:
remit_keys = ['remit_n_trans_type', 'remit_trans0', 'remit_trans1', 'remit_trans2', 'remit_trans3', 'remit_trans4', 'remit_tx_sum', 'remit_tx_likely_trans05', 'remit_tx_likely_trans10', 'remit_tx_num']
def feat_remit(start, end, data_c):
    data = data_c[(data_c.date > start) & (data_c.date <= end)]
    if len(data)==0:
        return {k: 0 for k in remit_keys}

    remit_feat = dict()

    # trans type
    remit_feat['remit_n_trans_type'] = len(set(data.trans_no))
    remit_feat['remit_trans0'] = len(data[data.trans_no==0])
    remit_feat['remit_trans1'] = len(data[data.trans_no==1])
    remit_feat['remit_trans2'] = len(data[data.trans_no==2])
    remit_feat['remit_trans3'] = len(data[data.trans_no==3])
    remit_feat['remit_trans4'] = len(data[data.trans_no==4])
    # transaction amt
    remit_feat['remit_tx_sum'] = np.log(remit.trade_amount_usd.sum()+eps)
    remit_feat['remit_tx_likely_trans05'] = count_likely_trans(data.trade_amount_usd.values, 0.05)
    remit_feat['remit_tx_likely_trans10'] = count_likely_trans(data.trade_amount_usd.values, 0.10)
    # num transaction
    remit_feat['remit_tx_num'] = len(data)


    return remit_feat

### Define ccba and custinfo features

In [8]:
ccba_keys = ['lupay', 'cycam', 'usgam', 'clamt', 'csamt', 'inamt', 'cucsm', 'cucah']
def feat_ccba(ak_month, data_c):
    data = data_c[data_c.byymm==ak_month]
    if data.empty:
        return np.zeros(len(ccba_keys))
    return [np.log(x+eps) if x>=0 else -np.log(-x+eps) for x in data.loc[:,ccba_keys].values[0]]

def feat_custinfo(ak, data_c):
    return np.log(data_c[data_c.alert_key==ak].total_asset.item()+eps)

### Start to generate features

In [9]:
ak_group = alert_key_info.groupby('cust_id')
cdtx_group = cdtx.groupby('cust_id')
dp_group = dp.groupby('cust_id')
remit_group = remit.groupby('cust_id')
ccba_group = ccba.groupby('cust_id')

custs = custinfo.cust_id.unique()
all_data = []
check_shape = []
for cust in tqdm(custs):

    try: ak_info_c = ak_group.get_group(cust).reset_index()
    except: ak_info_c=pd.DataFrame([])
    try: cdtx_c = cdtx_group.get_group(cust)
    except: cdtx_c=pd.DataFrame([])
    try: dp_c = dp_group.get_group(cust)
    except: dp_c=pd.DataFrame([])
    try: remit_c = remit_group.get_group(cust)
    except: remit_c=pd.DataFrame([])
    try: ccba_c = ccba_group.get_group(cust)
    except: ccba_c=pd.DataFrame([])

    #for ak_d in ak_info_c.date.values:
    for idx, ak_row in ak_info_c.iterrows():
        start_date = 0 if idx==0 else ak_info_c.iloc[idx-1,:].date.item()
        end_date = ak_row.date
        
        period_cdtx_feat = list(feat_cdtx(start_date, end_date, cdtx_c).values()) if not cdtx_c.empty else np.zeros(len(cdtx_keys))
        period_dp_feat = list(feat_dp(start_date, end_date, dp_c).values()) if not dp_c.empty else np.zeros(len(dp_keys))
        period_remit_feat = list(feat_remit(start_date, end_date, remit_c).values()) if not remit_c.empty else np.zeros(len(remit_keys))

        day5_cdtx_feat = list(feat_cdtx(end_date-5, end_date, cdtx_c).values()) if not cdtx_c.empty else np.zeros(len(cdtx_keys))
        day5_dp_feat = list(feat_dp(end_date-5, end_date, dp_c).values()) if not dp_c.empty else np.zeros(len(dp_keys))
        day5_remit_feat = list(feat_remit(end_date-5, end_date, remit_c).values()) if not remit_c.empty else np.zeros(len(remit_keys))

        day10_cdtx_feat = list(feat_cdtx(end_date-10, end_date, cdtx_c).values()) if not cdtx_c.empty else np.zeros(len(cdtx_keys))
        day10_dp_feat = list(feat_dp(end_date-10, end_date, dp_c).values()) if not dp_c.empty else np.zeros(len(dp_keys))
        day10_remit_feat = list(feat_remit(end_date-10, end_date, remit_c).values()) if not remit_c.empty else np.zeros(len(remit_keys))

        ccba_feat = feat_ccba(ak_row.alert_month, ccba_c) if not ccba_c.empty else np.zeros(len(ccba_keys))
        custinfo_feat = feat_custinfo(ak_row.alert_key, ak_info_c)

        cust_data = np.hstack([period_cdtx_feat, period_dp_feat, period_remit_feat, day5_cdtx_feat, day5_dp_feat, day5_remit_feat, day10_cdtx_feat, day10_dp_feat, day10_remit_feat, ccba_feat, custinfo_feat, ak_row.alert_key, ak_row.sar_flag]).flatten()

        check_shape.append(cust_data.shape[0])
        assert cust_data.shape[0] == 140, "shape error"
        all_data.append(cust_data)
    

    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|██████████| 7708/7708 [34:00<00:00,  3.78it/s]  


### Define feature names

In [10]:
feat_names = []
feat_names += ['period_'+n for n in cdtx_keys]
feat_names += ['period_'+n for n in dp_keys]
feat_names += ['period_'+n for n in remit_keys]
feat_names += ['day5_'+n for n in cdtx_keys]
feat_names += ['day5_'+n for n in dp_keys]
feat_names += ['day5_'+n for n in remit_keys]
feat_names += ['day10_'+n for n in cdtx_keys]
feat_names += ['day10_'+n for n in dp_keys]
feat_names += ['day10_'+n for n in remit_keys]
feat_names += ['ccba_'+n for n in ccba_keys]
feat_names += ['custinfo_total_asset']
feat_names += ['alert_key']
feat_names += ['sar_flag']

### Save as featuresV2.csv

In [11]:
output_data = pd.DataFrame(all_data, columns=feat_names)
output_data.drop_duplicates(inplace=True)
output_data.to_csv('./featuresV2.csv', index=False)