In [None]:
#!jupyter nbconvert --to script 1-preprocessing.ipynb

In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np
import pickle
from os import path
import matplotlib.pyplot as plt
%matplotlib inline

# Remove the column limit so wide frames are displayed without truncation.
pd.options.display.max_columns = None

In [None]:
import warnings

# Disable typecast warnings raised while parsing the CSV.
# NOTE: this filter is global, so every other warning is silenced as well.
warnings.filterwarnings('ignore')

# Strings that are parsed as missing values.
na_values = ['NA', 'nan']

# Columns that must be read as text. The builtin ``str`` is used because the
# ``np.str`` alias no longer exists in current NumPy releases.
dtype = {'tiprel_1mes': str, 'indrel_1mes': str, 'fecha_alta': str, 'fecha_dato': str}

# Read the file in chunks of 100000 rows instead of parsing it in one go.
iter_csv = pd.read_csv('../data/raw/train.csv.zip', na_values=na_values, skipinitialspace=True,
                       iterator=True, chunksize=100000, dtype=dtype)

# The reader is itself an iterable of DataFrames, so it can be concatenated directly.
df = pd.concat(iter_csv, ignore_index=True)

In [None]:
# Names of the 24 product-indicator columns.
products = [
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]

In [None]:
# Median renta per segmento, shown as a bar chart.
# segm_rent maps each segmento value to a one-element set holding its median.
segm_rent = defaultdict(set)

# Iterate over the groups explicitly instead of calling groupby.apply for its
# side effects; only the renta column of each group is needed.
for segment, renta in df.groupby('segmento')['renta']:
    segm_rent[segment].add(renta.median())

keys = list(segm_rent)
vals = [next(iter(medians)) for medians in segm_rent.values()]

fig, ax = plt.subplots()
ind = np.arange(len(keys))  # the x locations for the groups

ax.bar(ind, vals, align='center')
ax.set_xticks(ind)
ax.set_xticklabels(keys)
ax.set_xlabel('segmento')
ax.set_ylabel('median renta')
ax.set_title('Median renta by segmento')
plt.show()

In [None]:
# Median renta per nomprov, shown as a bar chart.
# prov_rent maps each nomprov value to a one-element set holding its median.
prov_rent = defaultdict(set)

# Iterate over the groups explicitly instead of calling groupby.apply for its
# side effects; only the renta column of each group is needed.
for province, renta in df.groupby('nomprov')['renta']:
    prov_rent[province].add(renta.median())

keys = list(prov_rent)
vals = [next(iter(medians)) for medians in prov_rent.values()]

# Large figure and vertical tick labels so the nomprov names stay readable.
fig, ax = plt.subplots(figsize=(20, 15))
ind = np.arange(len(keys))  # the x locations for the groups

ax.bar(ind, vals, align='center')
ax.set_xticks(ind)
ax.set_xticklabels(keys, rotation='vertical')
ax.set_xlabel('nomprov')
ax.set_ylabel('median renta')
ax.set_title('Median renta by nomprov')
plt.show()

In [None]:
# Median renta (missing values skipped) for every (nomprov, segmento) pair,
# kept as a one-column DataFrame indexed by that pair.
grouped = df.groupby(["nomprov", "segmento"])[["renta"]].median()

In [None]:
# Preview the first rows of the per-(nomprov, segmento) median renta table.
grouped.head()

In [None]:
# Fill missing renta from the most specific estimate available. Each pass only
# touches the values that are still missing after the previous one, so a coarse
# fallback can never pre-empt a finer estimate.
medians = grouped['renta']  # median renta indexed by (nomprov, segmento)

# 1) Exact match: the median of the row's own (nomprov, segmento) pair.
#    The join looks each row's pair up in the index of `medians` and keeps df's index.
pair_fill = df[['nomprov', 'segmento']].join(
    medians.rename('renta_fill'), on=['nomprov', 'segmento'])['renta_fill']
df['renta'] = df['renta'].fillna(pair_fill)

# 2) Province fallback: the mean of that province's segment medians.
prov_mean = medians.groupby(level='nomprov').mean()
df['renta'] = df['renta'].fillna(df['nomprov'].map(prov_mean))

# 3) Segment fallback: the mean of that segment's medians across provinces.
segm_mean = medians.groupby(level='segmento').mean()
df['renta'] = df['renta'].fillna(df['segmento'].map(segm_mean))

# 4) Last resort: the overall mean of renta after the passes above.
df['renta'] = df['renta'].fillna(df['renta'].mean())

In [None]:
# Keep rows that have both age and sexo and belong to one of the four selected
# fecha_dato values, then order them by ncodpers and fecha_dato and renumber
# the index from zero.
df = (
    df[
        df.age.notnull()
        & df.sexo.notnull()
        & df.fecha_dato.isin(['2015-05-28', '2015-06-28', '2016-05-28', '2016-06-28'])
    ]
    .sort_values(['ncodpers', 'fecha_dato'])
    .reset_index(drop=True)
)

In [None]:
def sub_code_and_toint(text):
    """Return the second character of ``text`` as a float, or 0.0 when ``text`` is null.

    ``text`` is converted with ``str`` first. ``float`` raises ``ValueError``
    when the second character is absent or is not a digit.
    """
    if pd.isnull(text):
        return 0.0
    second_char = str(text)[1:2]
    return float(second_char)
    
def preprocess_features(X, product_columns=None):
    """Turn the raw customer columns into numeric features.

    Steps:
      * drop the columns that are not used as features;
      * encode ``sexo`` (V/H), ``indresi`` and ``indext`` (S/N) as 1.0/0.0;
      * fill missing ``ind_nuevo`` from ``antiguedad``: 1 when it is below 6,
        otherwise 0 (also 0 when ``antiguedad`` is missing);
      * map ``indrel`` 99 to 0 and missing ``cod_prov`` to 0;
      * one-hot encode ``ind_empleado``, ``indrel_1mes``, ``tiprel_1mes`` and
        ``segmento``, replacing the source columns by their dummy columns;
      * replace missing product indicators with 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the raw columns referenced above.
    product_columns : list of str, optional
        Names of the product-indicator columns. Defaults to the notebook-level
        ``products`` list.

    Returns
    -------
    pandas.DataFrame
        The processed frame. Columns are never dropped from ``X`` itself, so
        the caller must use the returned frame.
    """
    if product_columns is None:
        product_columns = products
    product_columns = list(product_columns)

    columns_to_drop = ['fecha_alta', 'ult_fec_cli_1t', 'conyuemp',
                       'canal_entrada', 'indfall', 'tipodom', 'nomprov']

    # Categorical columns that are replaced by their one-hot expansion.
    dummies = ['ind_empleado', 'indrel_1mes', 'tiprel_1mes', 'segmento']

    # drop() without inplace returns a new frame; every later step assigns
    # whole columns on that new frame.
    X = X.drop(columns_to_drop, axis=1)

    # sexo
    X['sexo'] = X['sexo'].replace(to_replace=['V', 'H'], value=[1.0, 0.0])

    # ind_nuevo: the comparison is False for missing antiguedad, so those rows get 0.
    nuevo_from_antiguedad = (X['antiguedad'] < 6).astype(float)
    X['ind_nuevo'] = X['ind_nuevo'].fillna(nuevo_from_antiguedad)

    # indrel
    X['indrel'] = X['indrel'].replace(to_replace=[99.], value=[0.])

    # indrel_1mes: 'P' becomes 5 and missing becomes 6. A plain float is enough
    # because the column only feeds get_dummies and is dropped afterwards.
    X['indrel_1mes'] = (X['indrel_1mes']
                        .replace(to_replace=['P'], value=[5.])
                        .fillna(6.)
                        .astype(float))

    # tiprel_1mes: missing values are counted as 'N'.
    X['tiprel_1mes'] = X['tiprel_1mes'].fillna('N')

    # indresi / indext
    for flag in ('indresi', 'indext'):
        X[flag] = X[flag].replace(to_replace=['S', 'N'], value=[1.0, 0.0])

    # cod_prov
    X['cod_prov'] = X['cod_prov'].fillna(0.)

    # One-hot encode the categorical columns. get_dummies removes the source
    # columns and appends '<column>_<value>' columns in the order of `dummies`.
    X = pd.get_dummies(X, columns=dummies)

    # Missing product indicators become 0. Only the product columns are
    # touched, so identifiers and the other features of those rows are kept.
    if product_columns:
        X[product_columns] = X[product_columns].fillna(0)

    return X

In [None]:
# Run preprocess_features on the frame and keep the frame it returns.
df = preprocess_features(df)

In [None]:
# Remove the pais_residencia column from df in place, then preview the result.
del df['pais_residencia']
df.head()

In [None]:
# Write the processed frame as a gzip-compressed CSV, without the index column.
df.to_csv(
    '../data/processed/train-processed.csv.gz',
    compression='gzip',
    index=False,
)