In [None]:
import numpy as np
import pandas as pd
from numpy import dtype
from sklearn.mixture import GMM
from sklearn.cluster import KMeans
from datetime import datetime as dt
from sklearn.metrics import silhouette_score
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Lift the column display cap so wide frames are shown without truncated columns.
pd.set_option('display.max_columns', None)

In [None]:
# Columns that must be read as plain strings. The builtin `str` is used rather
# than the `np.str` alias: the alias was only ever a synonym for the builtin and
# is no longer available in recent NumPy releases.
# NOTE: this name shadows the `dtype` imported from numpy in the imports cell.
dtype = {'tiprel_1mes': str, 'ult_fec_cli_1t': str, 'indrel_1mes': str}

# Columns handed to the CSV reader's date parser.
# NOTE(review): 'ult_fec_cli_1t' is listed both here and in `dtype` (as a string);
# the later `.astype('int64')` conversion presumes date parsing wins -- confirm.
parse_dates = ['fecha_alta', 'ult_fec_cli_1t']

# Read the gzipped training set in 50,000-row chunks instead of in one pass.
iter_csv = pd.read_csv('../data/processed/train-processed.csv.gz', skipinitialspace=True,
                       iterator=True, chunksize=50000, dtype=dtype, parse_dates=parse_dates)

#corrupted = pd.concat([chunk[chunk['ncodpers'].between(660248, 660250)] for chunk in iter_csv]) #Small dataset could be used for testing
# Stitch every chunk back into a single frame with a fresh 0..n-1 index.
data = pd.concat(list(iter_csv), ignore_index=True)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()

In [None]:
# Remove corrupted rows (no `sexo` value) and redundant columns.
# `conyuemp` contains just a single differing value; all others are the same.
# The drops are reassigned rather than done with `inplace=True`, so the
# boolean-filtered frame is never mutated in place.
data = data[data.sexo.notnull()]
data = data.drop(['conyuemp', 'tipodom'], axis=1)

# Convert the date columns to numeric timestamps: the int64 view of each column
# is floor-divided by 1e9 (a float, so the result is float-valued).
# NOTE(review): presumably the int64 values are nanoseconds since the epoch, and
# missing dates come through as a large negative sentinel rather than NaN -- confirm.
for date_col in ('fecha_alta', 'ult_fec_cli_1t'):
    data[date_col] = data[date_col].astype('int64') // 1e9

# One-hot encode `ind_empleado`: the source column is replaced by its indicator
# columns, appended at the end of the frame.
ind_empleado = pd.get_dummies(data['ind_empleado'], prefix='ind_empleado')
data = pd.concat([data.drop('ind_empleado', axis=1), ind_empleado], axis=1)

# One-hot encode `tiprel_1mes` the same way.
tiprel_1mes = pd.get_dummies(data['tiprel_1mes'], prefix='tiprel_1mes')
data = pd.concat([data.drop('tiprel_1mes', axis=1), tiprel_1mes], axis=1)

# Recode `indrel`: 99 becomes 0.
data['indrel'] = data['indrel'].replace([99.], [0.])

# `indrel_1mes` was loaded as strings; map the literal 'P' to 5 and then coerce
# the whole column to numbers.
data['indrel_1mes'] = pd.to_numeric(data['indrel_1mes'].replace(['P'], [5.]))

# Encode the `indfall` flag as S -> 1, N -> 0.
data['indfall'] = data['indfall'].replace(['S', 'N'], [1., 0.])

In [None]:
# List the distinct values of every column, skipping `antiguedad` and the
# `*_ult1` columns. Each line is built as one string so the output is the same
# whether `print` is a statement (Python 2) or a function (Python 3).
for col, col_data in data.iteritems():
    if col == 'antiguedad' or col.endswith('_ult1'):
        continue
    print("%s %s" % (col, col_data.unique()))
    print("###")

In [None]:
# Fixed seed so the mixture-model initialisation, and therefore the scores
# printed below, are reproducible from run to run.
RANDOM_STATE = 42

# Keep only rows with a value in all three columns (replaces the three chained
# `col == col` queries, which dropped rows where the column was NaN).
clean_data = data.dropna(subset=['indrel_1mes', 'cod_prov', 'canal_entrada'])
clean_data = clean_data.drop(['pais_residencia', 'canal_entrada', 'renta'], axis=1)

print('calculating')
for k in range(2, 11):
    # Fit a Gaussian mixture with k components on the cleaned frame.
    clusterer = GMM(n_components=k, random_state=RANDOM_STATE).fit(clean_data)
    print('fitted')

    # Assign every row to a component.
    preds = clusterer.predict(clean_data)
    print('predicted')

    # NOTE(review): the silhouette score is computed over every row with no
    # `sample_size`; presumably expensive on the full data set -- consider sampling.
    score = silhouette_score(clean_data, preds)
    print("Number of clusters: %s ; Score: %s" % (k, score))

In [None]:
# Quick look at pairwise relationships between a few features that might be correlated.
# `.loc` replaces the deprecated `.ix` indexer; the columns are selected by label
# either way. The trailing `;` suppresses the textual repr of the returned axes.
pd.scatter_matrix(data.loc[:, ['age', 'renta', 'antiguedad', 'segmento']],
                  alpha=0.3, figsize=(14, 8), diagonal='kde');