In [1]:
import numpy as np
import pandas as pd
from numpy import dtype
from collections import defaultdict
from datetime import datetime as dt
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
pd.set_option('display.max_columns', None)

In [None]:
# Stream the gzipped training CSV in 50,000-row chunks, then stitch the chunks
# together into a single frame with a fresh 0..n-1 index.
iter_csv = pd.read_csv(
    '../data/processed/train-processed.csv.gz',
    skipinitialspace=True,
    iterator=True,
    chunksize=50000,
)

df = pd.concat(list(iter_csv), ignore_index=True)

In [3]:
# Preview the first rows of the loaded frame to check the parsed columns.
df.head()

Unnamed: 0,ncodpers,pais_residencia,sexo,age,ind_nuevo,antiguedad,indrel,indresi,indext,cod_prov,ind_actividad_cliente,renta,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1,ind_empleado_A,ind_empleado_B,ind_empleado_F,ind_empleado_N,ind_empleado_S,indrel_1mes_5.0,indrel_1mes_6.0,indrel_1mes_1,indrel_1mes_1.0,indrel_1mes_2,indrel_1mes_2.0,indrel_1mes_3,indrel_1mes_3.0,indrel_1mes_4,indrel_1mes_4.0,tiprel_1mes_A,tiprel_1mes_I,tiprel_1mes_N,tiprel_1mes_P,tiprel_1mes_R,segmento_01 - TOP,segmento_02 - PARTICULARES,segmento_03 - UNIVERSITARIO
0,15889,ES,1.0,56.0,0.0,255.0,1.0,1.0,0.0,28.0,1.0,326124.9,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,15890,ES,1.0,63.0,0.0,256.0,1.0,1.0,0.0,28.0,1.0,71461.2,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1.0,1.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,15891,ES,0.0,59.0,0.0,246.0,0.0,1.0,0.0,28.0,0.0,146398.13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,15892,ES,0.0,62.0,0.0,256.0,1.0,1.0,0.0,28.0,1.0,430477.41,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,15893,ES,1.0,63.0,0.0,256.0,1.0,1.0,0.0,28.0,1.0,430477.41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
def age_group(user):
    """Return the age bracket (1-4) for ``user`` based on ``user.age``.

    Brackets: 1 for age < 25, 2 for age < 35, 3 for age < 50, 4 otherwise
    (this includes ages that compare False against every bound, e.g. NaN).
    Bracket 0 is never returned.
    """
    age = user.age
    for bracket, upper_bound in enumerate((25, 35, 50), start=1):
        if age < upper_bound:
            return bracket
    return 4

def prov_group(user):
    """Return a bracket (0-2) for ``user`` based on ``user.renta``.

    0 for renta < 8000, 1 for renta < 10000, 2 otherwise.
    Note: despite the function name, the value bucketed here is ``renta``.
    """
    renta = user.renta
    if renta < 8000:
        return 0
    return 1 if renta < 10000 else 2

def segmento_group(user):
    """Collapse the one-hot segment columns of ``user`` into a group id.

    Returns 1 when 'segmento_01 - TOP' equals 1, 2 when
    'segmento_02 - PARTICULARES' equals 1, and 3 in every other case.
    """
    flag_to_group = (
        ('segmento_01 - TOP', 1),
        ('segmento_02 - PARTICULARES', 2),
    )
    for column, group in flag_to_group:
        if user[column] == 1:
            return group
    return 3

def hashcode(user):
    """Build the cluster key for ``user``.

    The key is the tuple (age bracket from age_group, segment group from
    segmento_group, user.sexo).
    """
    return (age_group(user), segmento_group(user), user.sexo)

In [5]:
# Maps each cluster key (as produced by hashcode) to the list of ncodpers
# values of the rows that fall into that cluster.
clusterid_userids = defaultdict(list)

def split_by_clusters(row):
    """Append ``row.ncodpers`` to the bucket keyed by ``hashcode(row)``."""
    bucket = clusterid_userids[hashcode(row)]
    bucket.append(row.ncodpers)

# apply() is used only for its side effect on clusterid_userids; the
# returned Series is discarded.
_ = df.apply(split_by_clusters, axis=1)

In [6]:
# Pair every cluster key with the number of ids collected for it.
clusters_sizes = [(key, len(ids)) for key, ids in clusterid_userids.items()]

In [7]:
# Show the (cluster key, size) pairs to inspect how balanced the clusters are.
clusters_sizes

[((3, 2, 0.0), 103206),
 ((1, 2, 1.0), 10223),
 ((3, 1, 1.0), 8237),
 ((3, 2, 1.0), 155050),
 ((3, 1, 0.0), 5105),
 ((1, 3, 1.0), 91081),
 ((4, 1, 0.0), 7799),
 ((3, 3, 1.0), 2789),
 ((1, 3, 0.0), 123578),
 ((4, 1, 1.0), 14117),
 ((2, 3, 0.0), 75189),
 ((3, 3, 0.0), 1857),
 ((1, 1, 1.0), 35),
 ((4, 3, 0.0), 1688),
 ((2, 3, 1.0), 63200),
 ((1, 1, 0.0), 37),
 ((4, 3, 1.0), 2474),
 ((2, 1, 0.0), 191),
 ((2, 2, 1.0), 26761),
 ((2, 1, 1.0), 293),
 ((4, 2, 1.0), 138792),
 ((2, 2, 0.0), 19868),
 ((4, 2, 0.0), 85977),
 ((1, 2, 0.0), 8085)]

In [10]:
# Give every row an integer cluster label. Labels follow the iteration order
# of clusterid_userids, i.e. the first cluster key gets 0, the next 1, ...
df.loc[:, 'cluster'] = 0

for cluster_id, ids in enumerate(clusterid_userids.values()):
    # Use a single .loc call with a boolean mask: the chained form
    # `df.cluster[mask] = ...` may write to a temporary copy (pandas emits
    # SettingWithCopyWarning) and is not guaranteed to update df.
    df.loc[df.ncodpers.isin(ids), 'cluster'] = cluster_id

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,ncodpers,pais_residencia,sexo,age,ind_nuevo,antiguedad,indrel,indresi,indext,cod_prov,ind_actividad_cliente,renta,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1,ind_empleado_A,ind_empleado_B,ind_empleado_F,ind_empleado_N,ind_empleado_S,indrel_1mes_5.0,indrel_1mes_6.0,indrel_1mes_1,indrel_1mes_1.0,indrel_1mes_2,indrel_1mes_2.0,indrel_1mes_3,indrel_1mes_3.0,indrel_1mes_4,indrel_1mes_4.0,tiprel_1mes_A,tiprel_1mes_I,tiprel_1mes_N,tiprel_1mes_P,tiprel_1mes_R,segmento_01 - TOP,segmento_02 - PARTICULARES,segmento_03 - UNIVERSITARIO,cluster
0,15889,ES,1.0,56.0,0.0,255.0,1.0,1.0,0.0,28.0,1.0,326124.9,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9
1,15890,ES,1.0,63.0,0.0,256.0,1.0,1.0,0.0,28.0,1.0,71461.2,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1.0,1.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9
2,15891,ES,0.0,59.0,0.0,246.0,0.0,1.0,0.0,28.0,0.0,146398.13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,22
3,15892,ES,0.0,62.0,0.0,256.0,1.0,1.0,0.0,28.0,1.0,430477.41,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6
4,15893,ES,1.0,63.0,0.0,256.0,1.0,1.0,0.0,28.0,1.0,430477.41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20


In [11]:
# Remove pais_residencia and cod_prov before scaling. Rebinding df (instead
# of inplace=True) together with errors='ignore' keeps this cell re-runnable:
# a second run no longer raises KeyError for the already-dropped columns.
df = df.drop(['pais_residencia', 'cod_prov'], axis=1, errors='ignore')

# Everything except the id column and the cluster label gets scaled.
clean_data_without_id = df.drop(['ncodpers', 'cluster'], axis=1)

x = clean_data_without_id.values  # returns a numpy array
x_scaled = MinMaxScaler().fit_transform(x)

# Carry over df's index so a later column-wise concat with df aligns row by
# row even if df does not have a default RangeIndex.
normalized = pd.DataFrame(data=x_scaled,
                          columns=clean_data_without_id.columns.values,
                          index=clean_data_without_id.index)

In [12]:
# Inspect the last rows of the scaled feature frame.
normalized.tail()

Unnamed: 0,sexo,age,ind_nuevo,antiguedad,indrel,indresi,indext,ind_actividad_cliente,renta,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1,ind_empleado_A,ind_empleado_B,ind_empleado_F,ind_empleado_N,ind_empleado_S,indrel_1mes_5.0,indrel_1mes_6.0,indrel_1mes_1,indrel_1mes_1.0,indrel_1mes_2,indrel_1mes_2.0,indrel_1mes_3,indrel_1mes_3.0,indrel_1mes_4,indrel_1mes_4.0,tiprel_1mes_A,tiprel_1mes_I,tiprel_1mes_N,tiprel_1mes_P,tiprel_1mes_R,segmento_01 - TOP,segmento_02 - PARTICULARES,segmento_03 - UNIVERSITARIO
945627,1.0,0.308642,1.0,0.999744,1.0,1.0,0.0,0.0,0.00223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
945628,0.0,0.17284,1.0,0.999744,1.0,1.0,1.0,0.0,0.003429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
945629,1.0,0.117284,1.0,0.999744,1.0,1.0,0.0,0.0,0.005025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
945630,0.0,0.253086,1.0,0.999744,1.0,1.0,0.0,0.0,0.003409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
945631,1.0,0.234568,1.0,0.999744,1.0,1.0,0.0,0.0,0.002551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Put the cluster label and ncodpers back in front of the scaled features
# (column-wise concat), renumber the rows from 0, and write the result as a
# gzip-compressed CSV without the index column.
normalized_data = pd.concat(
    [df[['cluster', 'ncodpers']], normalized],
    axis=1,
).reset_index(drop=True)
normalized_data.to_csv(
    '../data/processed/train-normalized.csv.gz',
    index=False,
    compression='gzip',
)

In [15]:
# Preview the first rows of the frame that was written to disk.
normalized_data.head()

Unnamed: 0,cluster,ncodpers,sexo,age,ind_nuevo,antiguedad,indrel,indresi,indext,ind_actividad_cliente,renta,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1,ind_empleado_A,ind_empleado_B,ind_empleado_F,ind_empleado_N,ind_empleado_S,indrel_1mes_5.0,indrel_1mes_6.0,indrel_1mes_1,indrel_1mes_1.0,indrel_1mes_2,indrel_1mes_2.0,indrel_1mes_3,indrel_1mes_3.0,indrel_1mes_4,indrel_1mes_4.0,tiprel_1mes_A,tiprel_1mes_I,tiprel_1mes_N,tiprel_1mes_P,tiprel_1mes_R,segmento_01 - TOP,segmento_02 - PARTICULARES,segmento_03 - UNIVERSITARIO
0,9,15889,1.0,0.333333,0.0,0.999999,1.0,1.0,0.0,1.0,0.011246,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,9,15890,1.0,0.376543,0.0,1.0,1.0,1.0,0.0,1.0,0.002432,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,22,15891,0.0,0.351852,0.0,0.99999,0.0,1.0,0.0,0.0,0.005025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,6,15892,0.0,0.37037,0.0,1.0,1.0,1.0,0.0,1.0,0.014857,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,20,15893,1.0,0.376543,0.0,1.0,1.0,1.0,0.0,1.0,0.014857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
