# Feature Store

Após a análise exploratória tive algumas ideias para criação de novas variáveis, então nessa etapa serão criadas variáveis que não se encontram no dataset/payload e que podem ser construídas a partir de estruturas de feature stores.  

Nessa etapa será criado 1 *feature group*:  
- Perfil Categoria Produto últimos 7 dias (as variáveis de fraude não serão tão realistas, visto que a maturação da fraude está sendo desconsiderada)

A ideia desse feature group é trazer informações históricas dos produtos condensados.  

In [142]:
import pandas as pd
import numpy as np
from datetime import datetime


# Notebook display settings: show every column and at most 100 rows
# when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Data

In [10]:
# Stack the train/valid/test splits into a single frame.
# ignore_index=True already gives the result a fresh 0..n-1 index, so no
# additional reset_index() call is needed.
df = pd.concat(
    [
        pd.read_parquet(f'../data/processed/fraud_dataset_v2_{split}.parquet.gzip')
        for split in ('train', 'valid', 'test')
    ],
    ignore_index=True,
)

In [11]:
# Preview the first two rows of the combined dataset.
df.head(2)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,fecha,monto,score,fraude,ymd,day_of_week,hour_of_day,period_of_day
0,4,0.7518,42638.24,50.0,0.19021,9.0,BR,20,Mochila Impermeable Mujer Importada Premiun,cat_0cd53cb,0.364921,4338.0,366.0,1,NULO,Y,0.98,260445,100,2020-03-09 14:33:56,22.18,74.0,0,20200309,0,14,afternoon
1,4,0.7576,34602.36,4.0,0.317697,33.0,BR,7,Placa De Rede ( Pci-e / Pci Express X1 / Mini ...,cat_41464e4,0.056129,4605.0,297.0,1,NULO,Y,0.06,34602,100,2020-03-16 15:07:38,10.95,28.04,0,20200316,0,15,afternoon


# Features

## Perfil Categoria Produto

In [168]:
class FeatureStore:
    """Base class for a feature group joined to a dataset by key and date.

    Subclasses override ``process`` to build the feature table and store it
    in ``self.data``; ``save`` and ``merge`` then operate on that table.
    """

    # Column identifying the entity the features describe (first join key).
    key = 'j'
    # Column holding the reference date (second join key).
    # NOTE(review): the sample data shows integer yyyymmdd values - confirm.
    date_ref = 'ymd'
    # Rolling-window length available to subclasses.
    window = 7
    # Offset applied by subclasses to the reference date so that a row's
    # features do not include that row's own day.
    delay = 1
    # File stem and target directory used by ``save``; set by subclasses.
    name = None
    path_to_save = None

    def __repr__(self):
        return f'I am {self.name}'

    def save(self):
        """Write ``self.data`` to ``<path_to_save>/<name>.parquet.gzip``.

        The file is gzip-compressed parquet written without the index. The
        target directory is created when it does not exist yet.

        Raises:
            ValueError: if ``name`` or ``path_to_save`` has not been set.
        """
        import os

        # Without this check the f-string below would silently produce the
        # literal path 'None/None.parquet.gzip'.
        if self.name is None or self.path_to_save is None:
            raise ValueError(
                'Set `name` and `path_to_save` before calling save().'
            )

        os.makedirs(self.path_to_save, exist_ok=True)
        self.data.to_parquet(
            f'{self.path_to_save}/{self.name}.parquet.gzip',
            compression='gzip',
            index=False
        )

    def process(self, X):
        """Build ``self.data`` from ``X``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # Fail loudly: a print would let save()/merge() run afterwards and
        # crash later on the missing ``self.data`` attribute.
        raise NotImplementedError('Hi! Implement this method!')

    def merge(self, X):
        """Left-join the feature table onto ``X`` by ``key`` and ``date_ref``.

        Every row of ``X`` is kept; rows without a match in ``self.data``
        receive missing values in the feature columns.
        """
        return X.merge(
            self.data,
            how='left',
            on=[self.key, self.date_ref]
        )

class FeatureStoreCategoryProfile(FeatureStore):
    """Rolling per-key transaction and fraud totals over the preceding days."""

    name = 'FeatureStoreCategoryProfile'
    path_to_save = '../data/feature_store'

    def process(self, df):
        """Compute the rolling profile and store it in ``self.data``.

        For every value of ``self.key`` and every calendar day between the
        first and last date found in ``df``, the amounts and fraud flags are
        totalled per day and then summed over a rolling window of
        ``self.window`` days. The reference date of each row is finally moved
        forward by ``self.delay`` days, so that a row dated D only summarises
        days strictly before D (with the default delay of 1).

        Args:
            df: frame with the columns ``self.key``, ``self.date_ref``
                (integer yyyymmdd), ``monto`` and ``fraude``.

        Resulting columns of ``self.data``: key, date (integer yyyymmdd),
        ``vl_cat_last7d``, ``qty_cat_last7d``, ``vl_fraud_cat_last7d``,
        ``qty_fraud_cat_last7d``, ``br_vl_cat_last7d``, ``br_qty_cat_last7d``.
        The ``last7d`` suffix is fixed even if ``self.window`` is changed.

        Returns nothing; the result is only stored on the instance.
        """
        key, date_ref = self.key, self.date_ref

        # Work on real dates so that day arithmetic is calendar-aware.
        tx = df[[key, date_ref, 'monto', 'fraude']].copy()
        tx[date_ref] = pd.to_datetime(tx[date_ref].astype(str), format='%Y%m%d')
        tx['monto_fraude'] = tx['monto'] * tx['fraude']

        # One row per (key, calendar day), including days without any
        # transaction: the rolling window below counts rows, so a gap-free
        # daily grid is what makes "window rows" mean "window days".
        all_dates = pd.date_range(
            start=tx[date_ref].min(), end=tx[date_ref].max(), freq='D'
        )
        grid = pd.MultiIndex.from_product(
            [tx[key].unique(), all_dates], names=[key, date_ref]
        ).to_frame(index=False)

        # Daily totals; days without transactions sum/count to 0.
        daily = (
            grid.merge(tx, on=[key, date_ref], how='left')
            .groupby([key, date_ref])
            .agg(
                vl_cat_last7d=('monto', 'sum'),
                qty_cat_last7d=('monto', 'count'),
                vl_fraud_cat_last7d=('monto_fraude', 'sum'),
                qty_fraud_cat_last7d=('fraude', 'sum'),
            )
            .reset_index()
        )

        feature_cols = [
            'vl_cat_last7d',
            'qty_cat_last7d',
            'vl_fraud_cat_last7d',
            'qty_fraud_cat_last7d',
        ]
        # Rolling sums per key; dropping the group level restores the row
        # index of ``daily`` so the result can be aligned back onto it.
        rolled = (
            daily.groupby(key)[feature_cols]
            .rolling(window=self.window, min_periods=1)
            .sum()
            .reset_index(level=0, drop=True)
        )
        data = pd.concat([daily[[key, date_ref]], rolled], axis=1)

        # Shift the reference date so the current day is not included.
        # This must be done on dates: adding to an integer yyyymmdd turns
        # 20200331 into 20200332, which never matches a real date on merge.
        shifted = data[date_ref] + pd.Timedelta(days=self.delay)
        data[date_ref] = shifted.dt.strftime('%Y%m%d').astype('int64')

        # Fraud ratios by value and by quantity; 0/0 (no activity in the
        # window) is mapped to 0 and infinities are capped.
        data['br_vl_cat_last7d'] = np.nan_to_num(
            (data['vl_fraud_cat_last7d'] / data['vl_cat_last7d']).to_numpy(),
            nan=0, posinf=1e9, neginf=-1e9
        )
        data['br_qty_cat_last7d'] = np.nan_to_num(
            (data['qty_fraud_cat_last7d'] / data['qty_cat_last7d']).to_numpy(),
            nan=0, posinf=1e9, neginf=-1e9
        )

        self.data = data


In [169]:
# Instantiate the category-profile feature group.
fs = FeatureStoreCategoryProfile()

In [170]:
# Build the feature table. process() stores its result in fs.data and has no
# return statement, so `a` is always None.
a = fs.process(df)

  .rolling(window=self.window, min_periods=1)


In [171]:
# Raw transactions of a single category, oldest first, for a manual
# cross-check against the computed rolling features.
df.loc[df['j'] == 'cat_0005972', ['monto', 'ymd']].sort_values('ymd')

Unnamed: 0,monto,ymd
44838,2.22,20200309
88450,9.36,20200313
103419,5.63,20200313
117447,2.62,20200324


In [172]:
# Preview the first 20 rows of the computed feature table.
fs.data.head(20)

Unnamed: 0,j,ymd,vl_cat_last7d,qty_cat_last7d,vl_fraud_cat_last7d,qty_fraud_cat_last7d,br_vl_cat_last7d,br_qty_cat_last7d
0,cat_0005972,20200309.0,0.0,0.0,0.0,0.0,0.0,0.0
1,cat_0005972,20200310.0,2.22,1.0,0.0,0.0,0.0,0.0
2,cat_0005972,20200311.0,2.22,1.0,0.0,0.0,0.0,0.0
3,cat_0005972,20200312.0,2.22,1.0,0.0,0.0,0.0,0.0
4,cat_0005972,20200313.0,2.22,1.0,0.0,0.0,0.0,0.0
5,cat_0005972,20200314.0,17.21,3.0,0.0,0.0,0.0,0.0
6,cat_0005972,20200315.0,17.21,3.0,0.0,0.0,0.0,0.0
7,cat_0005972,20200316.0,17.21,3.0,0.0,0.0,0.0,0.0
8,cat_0005972,20200317.0,14.99,2.0,0.0,0.0,0.0,0.0
9,cat_0005972,20200318.0,14.99,2.0,0.0,0.0,0.0,0.0


In [173]:
# Persist the feature table as gzip-compressed parquet under fs.path_to_save,
# using fs.name as the file stem.
fs.save()

In [174]:
# Left-join the features onto the transactions by (j, ymd) and show two of
# the resulting feature columns.
fs.merge(df)[['j', 'ymd', 'qty_cat_last7d', 'vl_cat_last7d']]

Unnamed: 0,j,ymd,qty_cat_last7d,vl_cat_last7d
0,cat_0cd53cb,20200309,26.0,964.89
1,cat_41464e4,20200316,20.0,413.53
2,cat_8ef7164,20200325,266.0,4831.37
3,cat_e694239,20200329,30.0,838.89
4,cat_2eabef6,20200314,118.0,3263.94
...,...,...,...,...
249995,cat_6984bab,20200309,0.0,0.00
249996,cat_a492be0,20200421,26.0,693.87
249997,cat_705c98f,20200412,26.0,589.59
249998,cat_6129f1c,20200321,4.0,33.42


In [175]:
# Manual check: count and total amount of one category's transactions dated
# before 20200309, to compare with the merged feature values for that day.
df.loc[(df['j'] == 'cat_0cd53cb') & (df['ymd'] < 20200309), 'monto'].agg(['count', 'sum'])

count     26.00
sum      964.89
Name: monto, dtype: float64

Variável criada com sucesso e validada. Foram implementadas duas classes para facilitar desenvolvimentos futuros: novos feature groups podem herdar a classe FeatureStore e implementar apenas a lógica específica.