In [None]:
# default_exp features
# default_cls_lvl 2

In [None]:
%load_ext autotime

# Feature Engineering und Target Variablen
> In diesem Modul werden zusätzliche Features berechnet und die Zielvariablen modelliert.

Laut [Kaggle Data Description](https://www.kaggle.com/c/santander-product-recommendation/data) haben wir 1.5 Jahre an Daten. Das erklärte Ziel ist es, die zusätzlich erworbenen Produkte für die Periode 2016-05-28 vorherzusagen. Dies ist einer der wichtigsten Schritte im ganzen ML-Prozess. Für diesen PoC halten wir das Ganze recht einfach. In einem echten Projekt würden wir zusätzlich folgende Schritte machen:

 - Entwickeln von zusätzlichen Features wie z.B. relatives Einkommen zur Altersgruppe und Lokation
 - Berechnen von Differenzen zum Vormonat bzw. zu den Vormonaten
 - Zeitversetzte Produkte als Features, z.B. hatte der Kunde bereits Produkt X im letzten Monat

In [None]:
#export
import pandas as pd
import numpy as np

from fastscript import *

time: 138 ms


In [None]:
#export
def load_data(path='data/interim/02_train.csv'):
    """Read the CSV file at `path` and return its contents as a DataFrame.

    path: location of the CSV file to read (defaults to the interim
          training file 'data/interim/02_train.csv').
    """
    frame = pd.read_csv(path)
    return frame

time: 1.03 ms


In [None]:
# Work on a random subset to keep the exploratory cells fast; the fixed seed
# makes the subset (and every output derived from it) reproducible on re-run.
train = load_data().sample(100000, random_state=42)

time: 19.1 s


In [None]:
# Show the first ten rows of the sampled frame.
train.head(10)

Unnamed: 0,id,ind_empleado,pais_residencia,sexo,age,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,...,month_int,fecha_alta_month,fecha_alta_year,fecha_alta_day,fecha_alta_month_int,fecha_alta_day_int,ult_fec_cli_1t_month,ult_fec_cli_1t_year,ult_fec_cli_1t_day,ult_fec_cli_1t_month_int
740987,224166,0,0,0,49,0,173,1,1,1,...,2,2,6,21,74,2271,1,5,1,61
742202,351179,0,0,0,50,0,159,1,1,1,...,2,4,7,16,88,2691,1,5,1,61
7158784,1434285,0,0,0,20,1,4,1,1,0,...,12,8,20,11,248,7551,1,5,1,61
6710345,1295159,0,0,1,22,0,16,1,1,1,...,12,8,19,7,236,7182,1,5,1,61
4158374,299372,0,0,0,56,0,88,1,1,1,...,8,4,13,25,160,4890,1,5,1,61
5332744,1207608,0,0,0,24,0,24,1,1,1,...,10,10,18,24,226,6894,1,5,1,61
5739010,316607,0,0,0,43,0,165,1,1,0,...,10,1,7,16,85,2601,1,5,1,61
10251485,429249,0,0,0,73,0,154,1,1,1,...,17,7,8,28,103,3158,1,5,1,61
9873191,1147941,0,0,0,27,0,33,1,1,1,...,16,7,18,25,223,6805,1,5,1,61
10574999,1315531,0,0,1,22,0,20,1,1,1,...,17,9,19,24,237,7229,1,5,1,61


time: 9.37 ms


In [None]:
#export
# Names of the product columns that are turned into target variables.
# Used as the default `target_products` argument of `calculate_targets`.
target_cols = ['ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1',
               'ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1',
               'ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1',
               'ind_deco_fin_ult1','ind_deme_fin_ult1','ind_dela_fin_ult1',
               'ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1',
               'ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1',
               'ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1',
               'ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1']

time: 482 µs


In [None]:
#export
def calculate_targets(df:pd.DataFrame, target_products:list=target_cols):
    """Turn product-ownership flags into "newly added product" targets.

    Rows are sorted by `id` and `month_int`; every row is then compared with
    the previous row of the same customer. A target becomes 1 only when the
    product flag is set in the current row and was not set in the previous one.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain `id`, `month_int` and every column named in
        `target_products`. The frame passed in is not modified.
    target_products : list
        Product columns to convert into targets; defaults to `target_cols`.

    Returns
    -------
    pd.DataFrame
        A new frame, sorted by `id` and `month_int`, without each customer's
        first row (it has no previous row to compare with). It contains the
        helper column `id_shift_1`, one `<col>_s_1` column per product holding
        the previous row's flag, and the product columns overwritten with
        int8 targets.

    Raises
    ------
    KeyError
        If `id`, `month_int` or one of `target_products` is missing from `df`.
    """
    # sort_values returns a new frame, so the caller's data stays untouched and
    # calling the function again on the same input yields the same result.
    df = df.sort_values(by=['id', 'month_int'])

    prev_id = df['id'].shift(1)
    df['id_shift_1'] = prev_id.fillna(0).astype(np.int32)

    # True on the first row of every customer. Comparing against the unfilled
    # shift (NaN on the very first row) avoids depending on the 0 sentinel.
    # A plain boolean array is used so that masking is purely positional.
    is_first_row = (df['id'] != prev_id).to_numpy()

    # NOTE: "previous" means the previous available row of the customer after
    # sorting; gaps in month_int are not detected here.
    for col in target_products:
        prev_col = col + '_s_1'
        # The previous row's flag; rows whose predecessor belongs to another
        # customer get 0 so that the comparison below works out.
        df[prev_col] = (df[col].shift(1)
                               .mask(is_first_row)
                               .fillna(0)
                               .astype(np.int8))
        # 1 only for products that were added, not for products already held.
        df[col] = (df[col] > df[prev_col]).astype(np.int8)

    # Drop each customer's first row: it has no previous row to compare with.
    return df[~is_first_row]
    


time: 1.29 ms


In [None]:
# Sanity check on the sample: in every returned row, `id` matches `id_shift_1`.
df1 = calculate_targets(train, target_cols)
assert df1['id'].eq(df1['id_shift_1']).all()

time: 242 ms


In [None]:
#export
@call_parse
def calculate_main(source:Param("source csv file", str)='data/interim/02_train.csv',
                   dest:Param("destination csv file", str)='data/interim/03_train.csv'):
    """calculate target variables and delayed product features"""
    # NOTE(review): call_parse (fastscript) is expected to use the docstring
    # above as command-line help text, so longer notes are kept in comments.
    #
    # source: CSV file to read.
    # dest:   CSV file the result is written to (without the index column).
    # Returns the processed DataFrame so the function is also usable from a
    # notebook.
    data = load_data(source)
    data = calculate_targets(data)
    data.to_csv(dest, index=False)
    return data
    

time: 2.46 ms


In [None]:
# Run the whole step with the default source and destination paths.
calculate_main()

Unnamed: 0,id,ind_empleado,pais_residencia,sexo,age,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,...,ind_hip_fin_ult1_s_1,ind_plan_fin_ult1_s_1,ind_pres_fin_ult1_s_1,ind_reca_fin_ult1_s_1,ind_tjcr_fin_ult1_s_1,ind_valo_fin_ult1_s_1,ind_viv_fin_ult1_s_1,ind_nomina_ult1_s_1,ind_nom_pens_ult1_s_1,ind_recibo_ult1_s_1
796881,15889,3,0,0,56,0,245,1,1,0,...,0,0,0,0,1,1,0,0,0,0
1202651,15889,3,0,0,56,0,245,1,1,0,...,0,0,0,0,0,1,0,0,0,0
1643630,15889,3,0,0,56,0,245,1,1,0,...,0,0,0,0,0,1,0,0,0,0
2514542,15889,3,0,0,56,0,245,1,1,0,...,0,0,0,0,0,1,0,0,0,0
2705368,15889,3,0,0,56,0,245,1,1,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7652737,1454620,0,0,0,20,1,4,1,1,0,...,0,0,0,0,0,0,0,0,0,0
8230649,1454620,0,0,0,20,1,5,1,1,0,...,0,0,0,0,0,0,0,0,0,0
9036293,1454620,0,0,0,20,0,6,1,1,0,...,0,0,0,0,0,0,0,0,0,0
9798618,1454620,0,0,0,20,0,7,1,1,0,...,0,0,0,0,0,0,0,0,0,0


time: 2min 16s


In [None]:
# Run the nbdev notebook-to-script export; the recorded output lists every
# notebook that was converted.
from nbdev.export import *
notebook2script()

Converted 00_data_prep.ipynb.
Converted 00_data_preparation.ipynb.
Converted 00_data_preparation_CCA.ipynb.
Converted 01_data_preprocess.ipynb.
Converted 02_data_Cleaning.ipynb.
Converted 03_target_vars.ipynb.
Converted 04_base_model.ipynb.
Converted index.ipynb.
time: 96.7 ms
