In [None]:
import pandas as pd
import numpy as np
import pickle

# Never truncate columns when rendering a DataFrame (None = no column limit).
pd.set_option('display.max_columns', None)

In [None]:
# Export the notebook 1-preprocessing.ipynb as a plain script via nbconvert.
!jupyter nbconvert --to script 1-preprocessing.ipynb

In [None]:
import warnings

# The original intent was to hide typecast warnings raised while reading the
# CSV. NOTE: this filter silences *every* warning category for the rest of the
# session, not only those.
warnings.filterwarnings('ignore')

# Strings that read_csv must treat as missing values.
na_values = ['NA', 'nan']

# Read these columns as plain strings. The builtin `str` is used because the
# `np.str` alias was deprecated in NumPy 1.20 and removed in 1.24.
dtype = {'tiprel_1mes': str, 'indrel_1mes': str, 'fecha_alta': str, 'ult_fec_cli_1t': str}

# Read the archive in chunks of 100000 rows to bound parser memory usage.
iter_csv = pd.read_csv('../data/raw/train.csv.zip', na_values=na_values, skipinitialspace=True,
                       iterator=True, chunksize=100000, dtype=dtype)

# Stitch the chunks together, then order rows by customer and snapshot date
# and renumber the index 0..n-1.
corrupted = (
    pd.concat(list(iter_csv))
    .sort_values(['ncodpers', 'fecha_dato'])
    .reset_index(drop=True)
)

In [None]:
# Names of the 24 product indicator columns, in the order they are processed.
products = [
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]

In [None]:
# Preview the first 17 rows of the sorted raw data.
corrupted.head(17)

In [None]:
# Per-customer results, keyed by ncodpers and filled in by grab_personal() below.
personal_products = {}

def month_last_year(user_history, months, product_columns=None):
    """Return the products flagged as 1 in the 2015 snapshots of the given months.

    Parameters
    ----------
    user_history : DataFrame
        Rows of a single customer; must contain ``fecha_dato`` (as
        ``'YYYY-MM-DD'`` strings) and the product indicator columns.
    months : iterable of int
        Month numbers (1-12) to inspect. Each is looked up as the snapshot
        dated the 28th of that month in 2015.
    product_columns : list of str, optional
        Indicator columns to check. Defaults to the module-level ``products``.

    Returns
    -------
    list of str
        Product names, in month order then column order. A product active in
        several of the requested months appears once per month.
    """
    if product_columns is None:
        product_columns = products

    active = []

    for m in months:
        # %02d zero-pads to two digits so that months 10-12 produce a valid
        # date string (e.g. '2015-10-28') as well as months 1-9.
        curr = '2015-%02d-28' % m
        entry = user_history[user_history.fecha_dato == curr]
        if len(entry) > 0:
            snapshot = entry.iloc[0]
            active.extend(p for p in product_columns if snapshot[p] == 1)

    return active
   
def popular_products(user_history, percent, product_columns=None):
    """Return the products that were active in at least ``percent`` of the months.

    Parameters
    ----------
    user_history : DataFrame
        Rows of a single customer containing the product indicator columns.
    percent : float
        Threshold as a fraction from 0 to 1 inclusive (50% == 0.5). A product
        qualifies when its usage fraction is greater than or equal to it.
    product_columns : list of str, optional
        Indicator columns to check. Defaults to the module-level ``products``.

    Returns
    -------
    list of str
        Qualifying product names, in column order.
    """
    if product_columns is None:
        product_columns = products

    active = []

    for p in product_columns:
        # Only months where the indicator is known count towards the fraction.
        observed = user_history[p].dropna()
        if observed.empty:
            # No data for this product: it cannot be called popular, and
            # skipping avoids a 0/0 division.
            continue
        # float() keeps this a true division on Python 2 as well.
        usage = (observed == 1).sum() / float(len(observed))
        if usage >= percent:
            active.append(p)

    return active

# Number of customer groups processed so far; only used for progress output.
i = 0

def grab_personal(user_history):
    """Record the candidate products of one customer in ``personal_products``.

    Candidates are the products that were active in May-July 2015 plus the
    products active in at least half of the customer's months, minus whatever
    is active in the customer's most recent row. The resulting list may hold
    the same product more than once when several criteria select it.

    Parameters
    ----------
    user_history : DataFrame
        All rows of a single customer (one group of a groupby on ``ncodpers``).

    Returns
    -------
    None
        The result is stored in the module-level ``personal_products`` dict
        under the customer's ``ncodpers``.
    """
    global i

    if i % 1000 == 0:
        print('processing entry #%d' % i)
    i += 1

    # Sort a copy rather than in place: the group handed over by groupby may
    # be a view of the parent frame.
    user_history = user_history.sort_values(['fecha_dato'])

    potential = []
    potential.extend(month_last_year(user_history, [6,5,7]))
    potential.extend(popular_products(user_history, 0.5))

    # Products already active in the latest month are not candidates.
    active_prods = user_history.iloc[-1][products]
    already_active = set(p for p in products if active_prods[p] == 1)
    potential = [p for p in potential if p not in already_active]

    # Positional access: the group's index labels come from the parent frame,
    # so label 0 exists only in the very first group.
    personal_products[user_history['ncodpers'].iloc[0]] = potential

In [None]:
# Run grab_personal once per customer; the results are collected in
# personal_products as a side effect (the value returned by apply is not assigned).
corrupted.groupby(['ncodpers']).apply(grab_personal)

In [None]:
# Persist the per-customer mapping to disk.
with open('../data/processed/personal.pickle', 'wb') as f:
    pickle.dump(personal_products, f)

# Show only the size: the dict holds one entry per customer, which is far too
# much to render as cell output.
len(personal_products)

In [None]:
# Inspect a handful of entries instead of dumping the whole per-customer dict.
dict(list(personal_products.items())[:5])

In [None]:
# Remove corrupted entries: rows that lack an age or a sex value.
print('%d users before cleaning' % len(corrupted.ncodpers.unique()))
has_age_and_sex = corrupted.age.notnull() & corrupted.sexo.notnull()
corrupted = corrupted[has_age_and_sex]
print('%d users after cleaning' % len(corrupted.ncodpers.unique()))

# Drop the province name, which the original author judged uninformative.
# errors='ignore' keeps the cell re-runnable once the column is already gone,
# and the non-in-place form avoids mutating a filtered slice.
corrupted = corrupted.drop('nomprov', axis=1, errors='ignore')

In [None]:
# Keep only the most recent row of every customer: order by customer and
# snapshot date, retain the last row per ncodpers, then renumber the index.
corrupted = (
    corrupted
    .sort_values(['ncodpers', 'fecha_dato'])
    .drop_duplicates(['ncodpers'], keep='last')
    .reset_index(drop=True)
)
# X_train is another name for the same frame, not a copy.
X_train = corrupted

In [None]:
# Preview the frame after reducing it to one row per customer.
X_train.head()

In [None]:
def sub_code_and_toint(text):
    """Return the second character of ``text`` as a float, or 0.0 for null input.

    For a value such as ``'02 - PARTICULARES'`` this yields ``2.0``.
    """
    if pd.isnull(text):
        return 0.0
    return float(str(text)[1:2])


def _epoch_seconds(dates):
    """Convert a column of dates to whole seconds since 1970-01-01.

    Missing dates and dates before the epoch both map to 0.
    """
    parsed = pd.to_datetime(dates)
    seconds = (parsed - pd.Timestamp('1970-01-01')).dt.total_seconds()
    return np.floor(seconds).fillna(0.0).clip(lower=0.0)


def _recode(column, mapping):
    """Replace the values found in ``mapping``; leave all others (NaN included) as is."""
    return column.map(lambda value: mapping.get(value, value))


def preprocess_features(X):
    """Encode the raw customer columns numerically and return a new DataFrame.

    The input frame is not modified. Steps performed:

    * ``fecha_alta`` / ``ult_fec_cli_1t``: dates -> seconds since the epoch
      (missing or pre-epoch -> 0).
    * ``ind_empleado`` / ``tiprel_1mes``: one-hot encoded into
      ``<column>_<value>`` 0/1 columns; the source columns are dropped.
    * ``segmento``: reduced to its numeric code via ``sub_code_and_toint``.
    * ``sexo`` (V/H) and ``indext``, ``indresi``, ``indfall`` (S/N): 1.0 / 0.0.
    * ``renta``, ``cod_prov``: missing -> 0.
    * ``indrel``: 99 -> 0.
    * ``indrel_1mes``: 'P' -> 5, missing -> 6, then converted to numbers.
    * ``canal_entrada``: missing -> the string ``'nan'``.
    * ``conyuemp``, ``tipodom``, ``fecha_dato`` are dropped.

    Parameters
    ----------
    X : DataFrame
        Customer rows with the raw columns listed above.

    Returns
    -------
    DataFrame
        A transformed copy of ``X``.
    """
    # Work on a copy so the caller's frame is never half-modified.
    X = X.copy()

    # Dates are loaded as strings, so they must be parsed before conversion.
    X['fecha_alta'] = _epoch_seconds(X['fecha_alta'])
    X['ult_fec_cli_1t'] = _epoch_seconds(X['ult_fec_cli_1t'])

    # One-hot encode the categorical columns. The explicit uint8 cast keeps
    # the 0/1 encoding on pandas versions whose get_dummies returns booleans.
    ind_empleado = pd.get_dummies(X['ind_empleado'], prefix='ind_empleado').astype('uint8')
    tiprel_1mes = pd.get_dummies(X['tiprel_1mes'], prefix='tiprel_1mes').astype('uint8')
    X = pd.concat([X, ind_empleado, tiprel_1mes], axis=1)

    X['segmento'] = X['segmento'].apply(sub_code_and_toint)

    # Columns are assigned back explicitly: an in-place replace on an
    # attribute-accessed column is not guaranteed to reach the parent frame.
    X['sexo'] = _recode(X['sexo'], {'V': 1.0, 'H': 0.0})
    X['renta'] = X['renta'].fillna(0.)
    X['indrel'] = _recode(X['indrel'], {99.: 0.})
    X['indext'] = _recode(X['indext'], {'S': 1.0, 'N': 0.0})
    X['indresi'] = _recode(X['indresi'], {'S': 1.0, 'N': 0.0})
    X['indfall'] = _recode(X['indfall'], {'S': 1., 'N': 0.})
    X['cod_prov'] = X['cod_prov'].fillna(0.)
    X['canal_entrada'] = X['canal_entrada'].fillna('nan')

    # 'P' becomes 5; the remaining strings are parsed as numbers; values that
    # are still missing afterwards become 6.
    indrel_1mes = _recode(X['indrel_1mes'], {'P': 5.})
    X['indrel_1mes'] = pd.to_numeric(indrel_1mes).fillna(6.)

    columns_to_drop = ['conyuemp', 'tipodom', 'fecha_dato', 'ind_empleado', 'tiprel_1mes']
    X = X.drop(columns_to_drop, axis=1)

    return X

In [None]:
# Apply the feature preprocessing to the one-row-per-customer frame and preview it.
X_train = preprocess_features(X_train)
X_train.head()

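### Coordinates feature
Load an external lookup table of country coordinates and attach `lat` / `lon` columns to the dataset by joining on the country code (`pais_residencia`). The country code itself is kept.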

In [None]:
# Headerless lookup file; its three columns are labelled here as country
# code, latitude and longitude (meaning assumed from the names given below).
countries = pd.read_csv('../data/external/countries.csv', header=None)
countries.columns = ['pais_residencia', 'lat', 'lon']

# merge() defaults to an inner join, so rows whose pais_residencia has no
# match in the lookup table are dropped from X_train.
X_train = (
    X_train
    .merge(countries, on='pais_residencia')
    .sort_values(['ncodpers'])
    .reset_index(drop=True)
)

# Final column selection and order of the processed training set.
new_cols_order = [
    'ncodpers', 'pais_residencia', 'sexo', 'age', 'lat', 'lon',
    'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
    'ind_empleado_A', 'ind_empleado_B', 'ind_empleado_F', 'ind_empleado_N', 'ind_empleado_S',
    'ult_fec_cli_1t', 'indrel_1mes',
    'tiprel_1mes_A', 'tiprel_1mes_I', 'tiprel_1mes_P', 'tiprel_1mes_R',
    'indresi', 'indext', 'indfall', 'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento',
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
    'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
    'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
    'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
]
X_train = X_train[new_cols_order]

In [None]:
# Preview the frame with coordinates added and columns reordered.
X_train.head()

In [None]:
# Write the processed training set as a gzip-compressed CSV, without the index column.
X_train.to_csv('../data/processed/train-processed.csv.gz', index=False, compression='gzip')