In [None]:
import pandas as pd
import numpy as np
# Do not cap the number of columns shown when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# Extra tokens that read_csv should treat as missing values
na_values = ['NA', 'nan']

# Store labels as string because later we will concatenate them.
# The builtin `str` is used here: `np.str` was only an alias for it and has
# been removed from NumPy (1.24+), where accessing it raises AttributeError.
dtype = {'fecha_dato': str, 'fecha_alta': str}

# Read the zipped CSV in chunks of 100000 rows to bound parser memory,
# then stitch the chunks back together into a single frame.
iter_csv = pd.read_csv('../data/raw/train.csv.zip', na_values=na_values, skipinitialspace=True,
                       iterator=True, chunksize=100000, dtype=dtype)

corrupted = pd.concat(list(iter_csv))

In [None]:
# Discard rows that lack an age or a sex value, reporting the number of
# distinct users (ncodpers) before and after the filter.
print('%d users before cleaning' % corrupted['ncodpers'].nunique(dropna=False))
corrupted = corrupted[corrupted['age'].notnull() & corrupted['sexo'].notnull()]
print('%d users after cleaning' % corrupted['ncodpers'].nunique(dropna=False))
# The province name column (nomprov) is not used downstream, so drop it
corrupted = corrupted.drop('nomprov', axis=1)

In [None]:
# Order rows by user and then by fecha_dato, keep only the last row of each
# user in that order, and renumber the index from zero.
X_train = (
    corrupted
    .sort_values(['ncodpers', 'fecha_dato'])
    .drop_duplicates(['ncodpers'], keep='last')
    .reset_index(drop=True)
)
# Both names keep referring to the same de-duplicated frame
corrupted = X_train

In [None]:
# Preview the first rows of the de-duplicated frame
X_train.head()

In [None]:
def sub_code_and_toint(text):
    """Convert the second character of ``text`` to a float.

    The value is stringified and its character at index 1 is parsed with
    ``float`` (e.g. ``'02 - X'`` -> ``2.0``). Missing values (anything
    ``pd.isnull`` reports as null) map to ``0.0``.

    Raises:
        ValueError: if the second character is absent or not numeric.
    """
    if pd.isnull(text):
        return 0.0
    second_char = str(text)[1:2]
    return float(second_char)
    
def preprocess_features(X):
    """Encode categorical feature columns of ``X`` as numbers, in place.

    Columns rewritten on ``X``:
      * ``ind_empleado``: 99.0 -> 0.0 (1.0 stays 1.0)
      * ``indext`` and ``indresi``: 'S' -> 1.0, 'N' -> 0.0
      * ``sexo``: 'V' -> 1.0, 'H' -> 0.0
      * ``segmento``: converted element-wise with ``sub_code_and_toint``

    In the replaced columns, values other than the ones listed are left as
    they are.

    Every result is assigned back with ``X[column] = ...``. Calling
    ``X.column.replace(..., inplace=True)`` only mutates a Series derived from
    the frame; with pandas Copy-on-Write that change never reaches ``X``.

    Args:
        X: DataFrame holding the columns listed above; it is modified.

    Returns:
        None. The caller's frame is updated directly.
    """
    # NOTE(review): confirm ind_empleado is really coded 1/99 in this dataset;
    # if the column holds other codes this replace is a no-op.
    X['ind_empleado'] = X['ind_empleado'].replace(to_replace=[1.0, 99.0], value=[1.0, 0.0])

    # 'S' / 'N' flags become 1.0 / 0.0
    for flag_column in ('indext', 'indresi'):
        X[flag_column] = X[flag_column].replace(to_replace=['S', 'N'], value=[1.0, 0.0])

    X['sexo'] = X['sexo'].replace(to_replace=['V', 'H'], value=[1.0, 0.0])

    X['segmento'] = X['segmento'].apply(sub_code_and_toint)

In [None]:
# Encode the categorical columns of X_train; the call's return value is not used
preprocess_features(X_train)
# Preview the encoded frame
X_train.head()

In [None]:
# Remove the fecha_dato column from the frame itself, then preview the result
del X_train['fecha_dato']
X_train.head()

In [None]:
# Write the processed frame as a gzip-compressed CSV, leaving out the row index
X_train.to_csv(
    '../data/processed/train-processed.csv.gz',
    compression='gzip',
    index=False,
)