# Imports

In [26]:
import sys
project_dir = 'C:\\Users\\diego\\OneDrive\\Cursos\\Python\\learning_curve'
sys.path.append(project_dir)
import os

from sklearn.base import BaseEstimator, TransformerMixin
from feature_engine.encoding import OrdinalEncoder
import pandas as pd
from datetime import datetime
import numpy as np
import yaml

# Data

In [3]:
teste = pd.read_csv(os.path.join(project_dir,'data/fraudTest.csv'), index_col='Unnamed: 0').reset_index(drop=True)
treino = pd.read_csv(os.path.join(project_dir,'data/fraudTrain.csv'), index_col='Unnamed: 0').reset_index(drop=True)

In [4]:
teste['train_t_split'] = 'TESTE'
treino['train_t_split'] = 'TREINO'

In [5]:
df = pd.concat([teste, treino], ignore_index=True).reset_index(drop=True)

In [6]:
df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'train_t_split'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 23 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  object 
 1   cc_num                 int64  
 2   merchant               object 
 3   category               object 
 4   amt                    float64
 5   first                  object 
 6   last                   object 
 7   gender                 object 
 8   street                 object 
 9   city                   object 
 10  state                  object 
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  job                    object 
 16  dob                    object 
 17  trans_num              object 
 18  unix_time              int64  
 19  merch_lat              float64
 20  merch_long             float64
 21  is_fraud               int64  
 22  train_t_split     

In [8]:
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,train_t_split
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0,TESTE
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0,TESTE
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0,TESTE
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0,TESTE
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0,TESTE


# Simple Prep

In [9]:
class TypesCorrector(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        X['trans_date_trans_time'] = pd.to_datetime(X['trans_date_trans_time'])
        
        X['dob'] = pd.to_datetime(X['dob'])
        
        return X

In [10]:
def time_of_day(x):
    if x >= 23 and x < 7:
        return 'Dawn'
    elif x >= 7 and x < 12:
        return 'Morning'
    elif x >= 12 and x < 18:
        return 'Afternoon'
    return 'Evening'

In [11]:
class FeatureCreator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        
        
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        #anomesdia
        X['anomesdia'] = X['trans_date_trans_time'].apply(lambda x: datetime.strftime(x, '%Y%m%d')).astype(int)
        
        #anomes
        X['anomes'] = X['trans_date_trans_time'].apply(lambda x: datetime.strftime(x, '%Y%m')).astype(int)
        
        #Day of week
        X['day_of_week'] = X['trans_date_trans_time'].dt.dayofweek
        
        #Time of day
        X['time_of_day'] = X['trans_date_trans_time'].apply(lambda x: time_of_day(x.hour))
        
        #Client age
        X['client_age'] = np.floor(((X['trans_date_trans_time'] - X['dob']).dt.days/365))
        
        
        X = X.sort_values(by=['cc_num', 'trans_date_trans_time'], ascending=True).reset_index(drop=True)
        
        aux = X.set_index('trans_date_trans_time')
        
        window_size = pd.Timedelta(minutes=30)
        
        X['sum_last_30_minutes'] = aux.groupby('cc_num')['amt'].rolling(window=window_size).sum().reset_index(drop=True)
        
        window_size = pd.Timedelta(hours=1)
        
        X['sum_last_1_hour'] = aux.groupby('cc_num')['amt'].rolling(window=window_size).sum().reset_index(drop=True)
        
        window_size = pd.Timedelta(hours=2)
        
        X['sum_last_2_hour'] = aux.groupby('cc_num')['amt'].rolling(window=window_size).sum().reset_index(drop=True)
        
        window_size = pd.Timedelta(hours=8)
        
        X['sum_last_8_hour'] = aux.groupby('cc_num')['amt'].rolling(window=window_size).sum().reset_index(drop=True)
        
        window_size = pd.Timedelta(hours=12)
        
        X['sum_last_12_hour'] = aux.groupby('cc_num')['amt'].rolling(window=window_size).sum().reset_index(drop=True)
        
        window_size = pd.Timedelta(days=1)
        
        X['sum_last_24_hour'] = aux.groupby('cc_num')['amt'].rolling(window=window_size).sum().reset_index(drop=True)
        
        window_size = pd.Timedelta(days=3)
        
        X['sum_last_72_hour'] = aux.groupby('cc_num')['amt'].rolling(window=window_size).sum().reset_index(drop=True)
        
        return X
        
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

In [12]:
type_corr = TypesCorrector()

In [13]:
df = type_corr.transform(df)

In [14]:
feat_creator = FeatureCreator()

In [15]:
%%time
df = feat_creator.transform(df)

CPU times: total: 56.1 s
Wall time: 1min 7s


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 35 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   merchant               object        
 3   category               object        
 4   amt                    float64       
 5   first                  object        
 6   last                   object        
 7   gender                 object        
 8   street                 object        
 9   city                   object        
 10  state                  object        
 11  zip                    int64         
 12  lat                    float64       
 13  long                   float64       
 14  city_pop               int64         
 15  job                    object        
 16  dob                    datetime64[ns]
 17  trans_num              object        
 18  unix_time             

In [17]:
df.head(10)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,train_t_split,anomesdia,anomes,day_of_week,time_of_day,client_age,sum_last_30_minutes,sum_last_1_hour,sum_last_2_hour,sum_last_8_hour,sum_last_12_hour,sum_last_24_hour,sum_last_72_hour
0,2019-01-01 12:47:15,60416207185,"fraud_Jones, Sawayn and Romaguera",misc_net,7.27,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,98e3dcf98101146a577f85a34e58feec,1325422035,43.974711,-109.741904,0,TREINO,20190101,201901,1,Afternoon,32.0,7.27,7.27,7.27,7.27,7.27,7.27,7.27
1,2019-01-02 08:44:57,60416207185,fraud_Berge LLC,gas_transport,52.94,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,498120fc45d277f7c88e3dba79c33865,1325493897,42.018766,-109.044172,0,TREINO,20190102,201901,2,Morning,32.0,52.94,52.94,52.94,52.94,52.94,60.21,60.21
2,2019-01-02 08:47:36,60416207185,fraud_Luettgen PLC,gas_transport,82.08,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,95f514bb993151347c7acdf8505c3d62,1325494056,42.961335,-109.157564,0,TREINO,20190102,201901,2,Morning,32.0,135.02,135.02,135.02,135.02,135.02,142.29,142.29
3,2019-01-02 12:38:14,60416207185,fraud_Daugherty LLC,kids_pets,34.79,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,4f0c1a14e0aa7eb56a490780ef9268c5,1325507894,42.228227,-108.747683,0,TREINO,20190102,201901,2,Afternoon,32.0,34.79,34.79,34.79,169.81,169.81,177.08,177.08
4,2019-01-02 13:10:46,60416207185,fraud_Beier and Sons,home,27.18,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,3b2ebd3af508afba959640893e1e82bc,1325509846,43.321745,-108.091143,0,TREINO,20190102,201901,2,Afternoon,32.0,27.18,61.97,61.97,196.99,196.99,196.99,204.26
5,2019-01-03 13:56:35,60416207185,fraud_Stamm-Witting,shopping_net,6.87,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,c2c69214de58aaf2bad1542b91751f5a,1325598995,43.477317,-109.467136,0,TREINO,20190103,201901,3,Afternoon,32.0,6.87,6.87,6.87,6.87,6.87,6.87,211.13
6,2019-01-03 17:05:10,60416207185,fraud_Conroy-Emard,food_dining,8.43,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,da5ef053fa971418ab30fc72509c66f8,1325610310,42.871477,-109.160268,0,TREINO,20190103,201901,3,Afternoon,32.0,8.43,8.43,8.43,15.3,15.3,15.3,219.56
7,2019-01-04 13:59:55,60416207185,fraud_Pollich LLC,home,117.11,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,79a66e4565825e428e2e098ffd6b9969,1325685595,43.332599,-108.318444,0,TREINO,20190104,201901,4,Afternoon,32.0,117.11,117.11,117.11,117.11,117.11,125.54,329.4
8,2019-01-04 21:17:22,60416207185,fraud_Monahan-Morar,personal_care,26.74,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,6dbbe7c58049b9b5cf6c0a29793468b0,1325711842,43.598123,-108.977767,0,TREINO,20190104,201901,4,Evening,32.0,26.74,26.74,26.74,143.85,143.85,143.85,356.14
9,2019-01-05 00:42:24,60416207185,fraud_Vandervort-Funk,grocery_pos,105.2,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,1986-02-17,38dcb937f9e60e26fca7e7f3b3437e07,1325724144,42.314401,-108.55452,0,TREINO,20190105,201901,5,Evening,32.0,105.2,105.2,105.2,131.94,249.05,249.05,461.34


In [17]:
df.is_fraud.mean()

0.005210014716091717

# Encoding

In [27]:
features = yaml.safe_load(open(os.path.join(project_dir, 'src', 'feature', 'config', 'variaveis.yaml'), 'rb'))

In [28]:
colunas_enc = list(set(df.select_dtypes(include=['object', 'string']).columns.tolist()) & set(features['descritivas']))

In [32]:
ordEnc = OrdinalEncoder(variables = colunas_enc, encoding_method='arbitrary')

In [33]:
df = ordEnc.fit_transform(df)

In [34]:
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,train_t_split,anomesdia,anomes,day_of_week,time_of_day,client_age,sum_last_30_minutes,sum_last_1_hour,sum_last_2_hour,sum_last_8_hour,sum_last_12_hour,sum_last_24_hour,sum_last_72_hour
0,2019-01-01 12:47:15,60416207185,"fraud_Jones, Sawayn and Romaguera",0,7.27,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,0,82514,43.0048,-108.8964,1645,0,1986-02-17,98e3dcf98101146a577f85a34e58feec,1325422035,43.974711,-109.741904,0,TREINO,20190101,201901,1,0,32.0,7.27,7.27,7.27,7.27,7.27,7.27,7.27
1,2019-01-02 08:44:57,60416207185,fraud_Berge LLC,1,52.94,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,0,82514,43.0048,-108.8964,1645,0,1986-02-17,498120fc45d277f7c88e3dba79c33865,1325493897,42.018766,-109.044172,0,TREINO,20190102,201901,2,1,32.0,52.94,52.94,52.94,52.94,52.94,60.21,60.21
2,2019-01-02 08:47:36,60416207185,fraud_Luettgen PLC,1,82.08,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,0,82514,43.0048,-108.8964,1645,0,1986-02-17,95f514bb993151347c7acdf8505c3d62,1325494056,42.961335,-109.157564,0,TREINO,20190102,201901,2,1,32.0,135.02,135.02,135.02,135.02,135.02,142.29,142.29
3,2019-01-02 12:38:14,60416207185,fraud_Daugherty LLC,2,34.79,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,0,82514,43.0048,-108.8964,1645,0,1986-02-17,4f0c1a14e0aa7eb56a490780ef9268c5,1325507894,42.228227,-108.747683,0,TREINO,20190102,201901,2,0,32.0,34.79,34.79,34.79,169.81,169.81,177.08,177.08
4,2019-01-02 13:10:46,60416207185,fraud_Beier and Sons,3,27.18,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,0,82514,43.0048,-108.8964,1645,0,1986-02-17,3b2ebd3af508afba959640893e1e82bc,1325509846,43.321745,-108.091143,0,TREINO,20190102,201901,2,0,32.0,27.18,61.97,61.97,196.99,196.99,196.99,204.26


In [35]:
df[df.train_t_split == 'TREINO'].to_parquet(os.path.join(project_dir, 'data', 'feat', 'treino.parquet.gzip'), compression='gzip', index=False)

In [36]:
df[df.train_t_split == 'TESTE'].to_parquet(os.path.join(project_dir, 'data', 'feat', 'teste.parquet.gzip'), compression='gzip', index=False)