In [8]:
import numpy as np
import pandas as pd
from numpy import dtype
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from datetime import datetime as dt
from sklearn.metrics import silhouette_score, calinski_harabaz_score
%matplotlib inline
pd.set_option('display.max_columns', None)

In [3]:
dtype={'tiprel_1mes': np.str, 'ult_fec_cli_1t':np.str, 'indrel_1mes':np.str}

parse_dates = ['fecha_alta', 'ult_fec_cli_1t']

iter_csv = pd.read_csv('../data/processed/train-processed.csv.gz', skipinitialspace = True, 
                       iterator=True, chunksize=50000, dtype=dtype, parse_dates=parse_dates)

#data = pd.concat([chunk[chunk['ncodpers'].between(660248, 660250)] for chunk in iter_csv]) #Small dataset could be used for testing
data = pd.concat([chunk for chunk in iter_csv], ignore_index=True)

  if self.run_code(code, result):


In [4]:
# "removed corrupted data and redundant"
data = data[data.sexo.notnull()]
data.drop('conyuemp', axis=1, inplace=True) # Contains just single diff value. All others same
data.drop('tipodom', axis=1, inplace=True)

# "converting dates to timestamps"
data.fecha_alta = data.fecha_alta.astype('int64')//1e9
data.ult_fec_cli_1t = data.ult_fec_cli_1t.astype('int64')//1e9

# "ind_empleado"
ind_empleado = pd.get_dummies(data.ind_empleado, prefix = 'ind_empleado')
data.drop('ind_empleado', axis=1, inplace=True)
data = pd.concat([data, ind_empleado], axis=1)

# "tiprel_1mes"
tiprel_1mes = pd.get_dummies(data.tiprel_1mes, prefix = 'tiprel_1mes')
data.drop('tiprel_1mes', axis=1, inplace=True)
data = pd.concat([data, tiprel_1mes], axis=1)

data.canal_entrada = data.canal_entrada.replace([None], ['nan'])
data.renta = data.renta.replace([float('nan')], [0.])
data.cod_prov = data.cod_prov.replace([float('nan')], [0.])
data.indrel = data.indrel.replace([99.], [0.])
data.indrel_1mes = data.indrel_1mes.replace(['P', float('nan')], [5., 6.])
data.indrel_1mes = pd.to_numeric(data.indrel_1mes)

data.indfall = data.indfall.replace(['S','N'], [1.,0.])
data.reset_index(drop=True, inplace=True)

### coordinates feature
Load external dictionary of coordinates and replace code with coordinates in dataset

In [5]:
countries = pd.read_csv('../data/external/countries.csv', header=None)
countries.columns=['pais_residencia', 'lat', 'lon']

train_countries_df = pd.DataFrame(data=data[['ncodpers','pais_residencia']])
coordinates_feature = pd.merge(train_countries_df, countries, on = 'pais_residencia')

data = pd.merge(data, coordinates_feature, on = 'ncodpers', suffixes=('_x', '_y'))
data.drop(['pais_residencia_x', 'pais_residencia_y'], axis=1, inplace=True)
data.tail()

Unnamed: 0,ncodpers,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,ult_fec_cli_1t,indrel_1mes,indresi,indext,canal_entrada,indfall,cod_prov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1,ind_empleado_A,ind_empleado_B,ind_empleado_F,ind_empleado_N,ind_empleado_S,tiprel_1mes_A,tiprel_1mes_I,tiprel_1mes_P,tiprel_1mes_R,lat,lon
931181,1166765,1.0,22.0,1376438400,0.0,33.0,1.0,-9223372037,1.0,1.0,0.0,KHE,0.0,50.0,0.0,43912.17,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,40.0,-4.0
931182,1166764,1.0,23.0,1376438400,0.0,33.0,1.0,-9223372037,1.0,1.0,0.0,KHE,0.0,26.0,0.0,23334.99,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,40.0,-4.0
931183,1166763,0.0,47.0,1376438400,0.0,33.0,1.0,-9223372037,1.0,1.0,0.0,KHE,0.0,50.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,40.0,-4.0
931184,1166789,0.0,22.0,1376438400,0.0,33.0,1.0,-9223372037,1.0,1.0,0.0,KHE,0.0,50.0,0.0,199592.82,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,40.0,-4.0
931185,1550586,0.0,37.0,1463097600,1.0,0.0,1.0,-9223372037,6.0,1.0,0.0,,0.0,28.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,40.0,-4.0


In [6]:
for col, col_data in data.iteritems():
    if col != 'antiguedad' and not col.endswith('_ult1'):
        print (col, col_data.unique())
        print ("###")

('ncodpers', array([ 657640,  657788,  657795, ..., 1166763, 1166789, 1550586]))
###
('sexo', array([ 0.,  1.]))
###
('age', array([  68.,   42.,   44.,   49.,   32.,   36.,   41.,   45.,   31.,
         34.,   37.,   40.,   57.,   51.,   66.,   35.,   38.,   95.,
         43.,   29.,   47.,   33.,   52.,   39.,   74.,   46.,   61.,
         79.,   94.,   53.,   30.,   54.,   50.,   56.,   80.,   92.,
         48.,   89.,   77.,   86.,   76.,   26.,   55.,   62.,   70.,
         67.,   22.,   60.,   71.,   59.,   85.,   58.,   65.,   88.,
         63.,   90.,   78.,   64.,   87.,   12.,   83.,   18.,   16.,
         17.,   91.,   73.,   69.,   11.,   75.,   72.,   81.,   82.,
         84.,   13.,   15.,   96.,  100.,   93.,   19.,  110.,   97.,
         20.,  101.,   99.,   14.,   25.,   98.,  103.,   27.,   23.,
         24.,  105.,   28.,  102.,   21.,  104.,  108.,  106.,  111.,
        112.,  109.,   10.,  114.,  107.,    2.,  113.,    7.,    8.,
          9.,    6.,  127.,  117., 

In [12]:
clean_data = data.drop('canal_entrada', axis=1)

print('calculating')
for k in range(2,11):
    clusterer = GaussianMixture(n_components=k).fit(clean_data)
    print('fitted')
    
    # TODO: Predict the cluster for each data point
    preds = clusterer.predict(clean_data)
    print('predicted')
    
    score = calinski_harabaz_score(clean_data, preds)
    print ("Number of clusters:", k, "; Score:", score)

calculating
fitted
predicted
('Number of clusters:', 2, '; Score:', 6653599.6473168414)
fitted
predicted
('Number of clusters:', 3, '; Score:', 4465756.0577354953)
fitted
predicted
('Number of clusters:', 4, '; Score:', 3020159.6067723446)
fitted
predicted
('Number of clusters:', 5, '; Score:', 2285176.9352828851)
fitted
predicted
('Number of clusters:', 6, '; Score:', 3417838.8427824453)
fitted
predicted
('Number of clusters:', 7, '; Score:', 1461230.0199320302)
fitted
predicted
('Number of clusters:', 8, '; Score:', 1263253.7009416071)
fitted
predicted
('Number of clusters:', 9, '; Score:', 1275081.042925426)
fitted
predicted
('Number of clusters:', 10, '; Score:', 1085629.5900622911)


In [None]:
#Just looking on correlation of some features which potentually might correlate.
pd.scatter_matrix(data.ix[:,['age','renta','antiguedad','segmento']], alpha = 0.3, figsize = (14,8), diagonal = 'kde');