# Feature Engineering

In [6]:
import pandas as pd
from datetime import datetime

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# Display up to 100 rows in full before pandas switches to a truncated view.
pd.set_option('display.max_rows', 100)

# Data

In [2]:
# Load the interim fraud dataset into a DataFrame. The path is relative to the
# notebook's working directory; the file name suggests a gzip-compressed parquet file.
df = pd.read_parquet('../data/interim/fraud_dataset_v2_valid.parquet.gzip')

In [5]:
# Preview the first rows of the loaded data before any feature is added.
df.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,fecha,monto,score,fraude
0,4,0.7181,114261.67,35.0,0.0,1.0,BR,6,Phytopower Caps Original Compre 4 Pague 2 - Fr...,cat_d69bed9,0.786613,743.0,65.0,1,Y,N,0.7,114261,0,2020-03-29 19:40:35,23.88,39.0,0
1,4,0.7058,54006.36,1.0,0.282594,0.0,BR,34,Mini Modulo Amplificador 4 Canais + Fonte 12v ...,cat_4e2d9a5,0.02568,0.0,0.0,0,NULO,N,0.02,54006,0,2020-03-22 10:16:19,25.33,83.0,0
2,2,0.7535,43515.6,14.0,0.0,46.0,UY,36,Radio Auto Multimedia Bluetooth Mp3 Desmontabl...,cat_842c755,0.553464,2729.0,94.0,1,NULO,Y,0.28,260445,100,2020-03-09 21:54:45,39.0,66.0,0
3,4,0.8707,796.03,42.0,0.761702,0.0,BR,0,Cupinicida Aerosol 400ml Cupins De Madeira Kel...,cat_27dfebd,0.356607,187.0,184.0,0,NULO,N,0.82,796,0,2020-04-21 14:50:55,4.53,71.0,0
4,4,0.7535,43515.6,3.0,0.0,1.0,BR,14,Lanterna De Cabeça Profissional Led T6 15000w ...,cat_2579800,0.131092,189.0,189.0,1,NULO,Y,0.04,260445,100,2020-03-13 14:13:18,593.03,64.0,0


# Feature Engineering: date-derived features

In [10]:
def get_period_of_day(hour):
    """Map an hour of the day to a coarse period label.

    The buckets are checked in ascending order and each upper bound is
    exclusive: below 6 -> 'dawn', below 12 -> 'morning', below 18 ->
    'afternoon', below 24 -> 'evening'.

    Parameters
    ----------
    hour : number
        Hour of the day (0-23 when it comes from a timestamp).

    Returns
    -------
    str or None
        The period label, or None when ``hour`` is not below 24. This includes
        NaN, because every ``<`` comparison against NaN is false.
    """
    # (exclusive upper bound, label), in ascending order of the bound.
    upper_bounds = (
        (6, 'dawn'),
        (12, 'morning'),
        (18, 'afternoon'),
        (24, 'evening'),
    )
    for limit, label in upper_bounds:
        if hour < limit:
            return label
    return None

class FeatureEngineering:
    """Create date-derived features from a timestamp column.

    The class exposes ``fit`` / ``transform`` / ``fit_transform`` methods.
    It holds no learned state: ``fit`` does nothing besides returning ``self``.

    Parameters
    ----------
    inference : bool, default True
        When True, ``transform`` only adds the payload features
        (``day_of_week``, ``hour_of_day``, ``period_of_day``). When False it
        additionally adds the train-only ``ymd`` feature.
    date_col : str, default 'fecha'
        Name of the column holding the timestamp the features are derived from.
    """
    def __init__(self, inference=True, date_col='fecha'):
        self.inference = inference
        self.date_col = date_col

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (there is nothing to learn)."""
        return self

    def _get_dates(self, X):
        """Return the timestamp column of ``X`` as a datetime Series.

        ``pd.to_datetime`` leaves datetime columns as they are and parses
        other representations (e.g. strings or ``datetime`` objects), so both
        feature methods accept the same kinds of input.

        Raises
        ------
        KeyError
            If ``X`` has no column named ``self.date_col``.
        """
        if self.date_col not in X.columns:
            raise KeyError(
                f"column {self.date_col!r} is required to build date features"
            )
        return pd.to_datetime(X[self.date_col])

    def create_train_features(self, X):
        """Return a copy of ``X`` with the train-only ``ymd`` column added.

        ``ymd`` is the date encoded as the integer YYYYMMDD
        (2020-03-29 -> 20200329). The input frame is not modified and the
        returned frame has a fresh 0..n-1 index. An existing ``ymd`` column is
        recomputed rather than kept.

        Raises
        ------
        KeyError
            If the timestamp column is missing.
        ValueError
            If the timestamp column contains missing values, which cannot be
            encoded as an integer.
        """
        # Reset first so the derived Series share the index of the frame
        # they are assigned to.
        X = X.reset_index(drop=True)
        dates = self._get_dates(X)

        # Vectorised YYYYMMDD encoding; avoids a Python-level call per row.
        ymd = dates.dt.year * 10000 + dates.dt.month * 100 + dates.dt.day

        return X.assign(ymd=ymd.astype(int))

    def create_payload_features(self, X):
        """Return a copy of ``X`` with the payload features added.

        Added columns, in this order:

        * ``day_of_week`` - ``Series.dt.weekday`` of the timestamp
          (Monday is 0, Sunday is 6).
        * ``hour_of_day`` - hour of the timestamp.
        * ``period_of_day`` - label returned by ``get_period_of_day`` for
          ``hour_of_day``.

        The input frame is not modified and the returned frame has a fresh
        0..n-1 index. Existing columns with these names are recomputed.

        Raises
        ------
        KeyError
            If the timestamp column is missing.
        """
        # Reset first so the derived Series share the index of the frame
        # they are assigned to.
        X = X.reset_index(drop=True)
        dates = self._get_dates(X)
        hour_of_day = dates.dt.hour

        return X.assign(
            day_of_week=dates.dt.weekday,
            hour_of_day=hour_of_day,
            period_of_day=hour_of_day.apply(get_period_of_day),
        )

    def transform(self, X):
        """Add the engineered features to ``X`` and return the new frame.

        The train-only features are added first when ``self.inference`` is
        False; the payload features are always added.
        """
        if not self.inference:
            X = self.create_train_features(X)
        return self.create_payload_features(X)
    
    def fit_transform(self, X, y=None):
        """Call ``fit`` and then ``transform`` on ``X``."""
        self.fit(X, y)
        return self.transform(X)

In [13]:
# inference=False makes transform() add the train-only `ymd` column in
# addition to the payload features.
feature_eng = FeatureEngineering(inference=False)

In [14]:
# fit() learns nothing, so this is equivalent to feature_eng.transform(df).
df_tmp = feature_eng.fit_transform(df)

In [15]:
# Check the appended columns: ymd, day_of_week, hour_of_day, period_of_day.
df_tmp.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,fecha,monto,score,fraude,ymd,day_of_week,hour_of_day,period_of_day
0,4,0.7181,114261.67,35.0,0.0,1.0,BR,6,Phytopower Caps Original Compre 4 Pague 2 - Fr...,cat_d69bed9,0.786613,743.0,65.0,1,Y,N,0.7,114261,0,2020-03-29 19:40:35,23.88,39.0,0,20200329,6,19,evening
1,4,0.7058,54006.36,1.0,0.282594,0.0,BR,34,Mini Modulo Amplificador 4 Canais + Fonte 12v ...,cat_4e2d9a5,0.02568,0.0,0.0,0,NULO,N,0.02,54006,0,2020-03-22 10:16:19,25.33,83.0,0,20200322,6,10,morning
2,2,0.7535,43515.6,14.0,0.0,46.0,UY,36,Radio Auto Multimedia Bluetooth Mp3 Desmontabl...,cat_842c755,0.553464,2729.0,94.0,1,NULO,Y,0.28,260445,100,2020-03-09 21:54:45,39.0,66.0,0,20200309,0,21,evening
3,4,0.8707,796.03,42.0,0.761702,0.0,BR,0,Cupinicida Aerosol 400ml Cupins De Madeira Kel...,cat_27dfebd,0.356607,187.0,184.0,0,NULO,N,0.82,796,0,2020-04-21 14:50:55,4.53,71.0,0,20200421,1,14,afternoon
4,4,0.7535,43515.6,3.0,0.0,1.0,BR,14,Lanterna De Cabeça Profissional Led T6 15000w ...,cat_2579800,0.131092,189.0,189.0,1,NULO,Y,0.04,260445,100,2020-03-13 14:13:18,593.03,64.0,0,20200313,4,14,afternoon
