In [68]:
import pandas as pd
import os
import sys
from pathlib import Path
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

project_dir = Path('__main__').resolve().parents[1]

In [2]:
df = pd.read_csv(os.path.join(project_dir, 'data', 'raw', 'fraudTest.csv')).sample(frac=1, random_state=777)
df2 = pd.read_csv(os.path.join(project_dir, 'data', 'raw', 'fraudTrain.csv')).sample(frac=1, random_state=777)

In [3]:
df = df[(df.is_fraud == 1) | (df.index < 200000)].reset_index(drop=True)
df2 = df2[(df2.is_fraud == 1) | (df2.index < 200000)].reset_index(drop=True)

df = pd.concat([df, df2], ignore_index=True)

In [4]:
del df2

In [5]:
print(df.shape, df.is_fraud.mean())
df.head()

(407144, 23) 0.023704143988367753


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,136357,2020-08-08 06:54:26,340953839692349,fraud_Bradtke PLC,grocery_pos,61.25,Tyler,Wright,M,615 Clarke Spring Apt. 172,...,42.5164,-82.9832,134056,"Doctor, hospital",1980-05-18,08a4b899f036f48370b9a2cb72b27d48,1375944866,43.145584,-82.091186,0
1,123933,2020-08-03 15:42:24,3541687240161491,"fraud_Durgan, Gislason and Spencer",home,164.89,Mark,Nguyen,M,62631 Ashley Ramp Apt. 965,...,31.1989,-81.3322,298,"Engineer, structural",1963-04-22,bc5fbe9a24420708cc99fd0e5c540d5f,1375544544,30.662379,-81.759713,0
2,109928,2020-07-29 02:57:43,3560797065840735,fraud_Kling-Grant,grocery_net,45.02,Janet,Turner,F,0925 Lang Extensions,...,46.1838,-101.2589,77,Film/video editor,1989-12-17,d5e631b2f3792fa0e7df55819857a73f,1375066663,45.201678,-102.187911,0
3,78039,2020-07-18 19:47:57,4254074738931278,fraud_Bahringer Group,health_fitness,15.97,Gary,Hall,M,69085 Short Shores,...,42.0737,-78.0594,239,Radio producer,1956-05-02,f7993c8a9be13184c75ec7a1f767366e,1374176877,41.216391,-77.079494,0
4,50100,2020-07-07 23:56:45,3538520143479972,fraud_Berge-Ullrich,home,20.58,Cassandra,Nunez,F,9572 Austin Forge Suite 612,...,41.5686,-83.3632,269,Insurance underwriter,1965-09-15,1bc0592b9b49bb76475b9a28dd668b35,1373241405,40.945044,-83.756485,0


In [6]:
df['ano_mes_data'] = df['trans_date_trans_time'].astype('datetime64[ns]')

In [7]:
df['ano_mes'] = df['trans_date_trans_time'].str[0:7]

# Window Features

In [8]:
def p99(x):
    return x.quantile(0.99)

def p90(x):
    return x.quantile(0.9)

def p75(x):
    return x.quantile(0.75)

def p01(x):
    return x.quantile(0.01)

def p10(x):
    return x.quantile(0.1)

def p25(x):
    return x.quantile(0.25)

In [9]:
df_top_mcc = df.groupby(['cc_num', 'ano_mes','category']).agg({'amt' : ['count', 'sum', 'max', 'min', p99, p90, p75, 'median', p25, p10, p01]})

In [15]:
df_top_mcc = df_top_mcc.reset_index()

In [33]:
df_top_mcc.columns = ['cc_num', 'ano_mes', 'category', 'count_mcc', 'sum_mcc', 'max_mcc', 'min_mcc', 'p99_mcc', 'p90_mcc', 'p75_mcc', 'median_mcc', 'p25_mcc', 'p10_mcc', 'p01_mcc']

In [10]:
df_top3_mcc = (df.groupby(['cc_num', 'ano_mes', 'category'])
               .size().reset_index(name='count')
               .sort_values(['cc_num', 'ano_mes', 'count'], ascending=[True, False, False])
               .groupby(['cc_num', 'ano_mes']).head(3).reset_index(drop=True))

In [11]:
df_top3_mcc['row_number'] = df_top3_mcc.groupby('cc_num').cumcount() + 1

In [27]:
df_top3_mcc.columns = ['cc_num', 'ano_mes', 'category', 'count', 'row_number']

In [48]:
df_cpf = df.groupby(['cc_num', 'ano_mes']).agg({'amt' : ['count', 'sum', 'max', 'min', p99, p90, p75, 'median', p25, p10, p01]})

In [49]:
df_cpf = df_cpf.reset_index()

In [50]:
df_cpf.columns = ['cc_num', 'ano_mes', 'count_cpf', 'sum_cpf', 'max_cpf', 'min_cpf', 'p99_cpf', 'p90_cpf', 'p75_cpf', 'median_cpf', 'p25_cpf', 'p10_cpf', 'p01_cpf']

In [18]:
df_abt = df[df.ano_mes_data > pd.to_datetime('2020-01-01')].groupby(['cc_num', 'ano_mes']).agg({'is_fraud' : 'max'}).reset_index()

# Merge Feature Data

In [51]:
df_cpf = df_cpf.merge(df_top3_mcc[df_top3_mcc.row_number == 1][['cc_num', 'category', 'ano_mes']], how='left', on=['cc_num', 'ano_mes'])

In [52]:
df_cpf = df_cpf.merge(df_top3_mcc[df_top3_mcc.row_number == 2][['cc_num', 'category', 'ano_mes']], how='left', on=['cc_num', 'ano_mes'], suffixes=('', 'top2'))

In [53]:
df_cpf = df_cpf.merge(df_top3_mcc[df_top3_mcc.row_number == 3][['cc_num', 'category', 'ano_mes']], how='left', on=['cc_num', 'ano_mes'], suffixes=('top1', 'top3'))

In [54]:
df_cpf = df_cpf.rename(columns={'categorytop1' : 'category'}).merge(df_top_mcc, how='left', on=['cc_num', 'ano_mes', 'category']).rename(columns={'category' : 'categorytop1'})

In [55]:
df_cpf = df_cpf.rename(columns={'categorytop2' : 'category'}).merge(df_top_mcc, how='left', on=['cc_num', 'ano_mes', 'category'], suffixes=('', '_top2')).rename(columns={'category' : 'categorytop2'})

In [56]:
df_cpf = df_cpf.rename(columns={'categorytop3' : 'category'}).merge(df_top_mcc, how='left', on=['cc_num', 'ano_mes', 'category'], suffixes=('_top1', '_top3')).rename(columns={'category' : 'categorytop3'})

In [58]:
df_cpf

Unnamed: 0,cc_num,ano_mes,count_cpf,sum_cpf,max_cpf,min_cpf,p99_cpf,p90_cpf,p75_cpf,median_cpf,...,sum_mcc_top3,max_mcc_top3,min_mcc_top3,p99_mcc_top3,p90_mcc_top3,p75_mcc_top3,median_mcc_top3,p25_mcc_top3,p10_mcc_top3,p01_mcc_top3
0,60416207185,2019-01,56,2410.54,204.15,1.84,202.2910,104.935,64.4900,21.685,...,,,,,,,,,,
1,60416207185,2019-02,59,2861.48,224.75,1.24,210.8764,118.436,74.6100,29.220,...,,,,,,,,,,
2,60416207185,2019-03,86,6672.74,852.81,1.07,833.4725,133.555,82.4025,48.865,...,,,,,,,,,,
3,60416207185,2019-04,39,2353.19,472.17,1.43,397.4544,113.800,69.6000,27.550,...,,,,,,,,,,
4,60416207185,2020-06,34,2027.02,148.02,2.83,146.2314,111.994,87.5325,64.185,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7075,4992346398065154184,2019-10,4,3037.20,961.40,340.98,960.5156,952.556,939.2900,867.410,...,,,,,,,,,,
7076,4992346398065154184,2019-11,4,3415.32,1091.87,302.91,1090.6403,1079.573,1061.1275,1010.270,...,,,,,,,,,,
7077,4992346398065154184,2020-06,44,2731.30,390.20,1.45,338.9440,143.360,71.9650,47.880,...,,,,,,,,,,
7078,4992346398065154184,2020-07,121,7084.52,291.15,1.58,196.0780,127.720,87.8800,48.700,...,,,,,,,,,,


In [61]:
df_cpf.to_parquet(os.path.join(project_dir, 'data', 'interim', 'processed.parquet.gzip'), compression='gzip', index=False)

In [62]:
df_abt.to_parquet(os.path.join(project_dir, 'data', 'interim', 'abt.parquet.gzip'), compression='gzip', index=False)

# Normalize

In [69]:
scaler = MinMaxScaler()