In [None]:
import numpy as np
import pandas as pd
from numpy import dtype
from collections import defaultdict
from datetime import datetime as dt
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
pd.set_option('display.max_columns', None)

In [None]:
# Stream the gzipped training set in 50,000-row chunks, then stitch the chunks
# back into one frame with a fresh 0..n-1 index.
iter_csv = pd.read_csv(
    '../data/processed/train-processed.csv.gz',
    skipinitialspace=True,
    iterator=True,
    chunksize=50000,
)

data = pd.concat(list(iter_csv), ignore_index=True)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()

In [None]:
def age_group(age):
    """Map an age to an ordinal bucket index in the range 0..5.

    Buckets are half-open on the right: <18 -> 0, <25 -> 1, <30 -> 2,
    <40 -> 3, <60 -> 4, and everything else -> 5. A value that compares
    False against every bound (e.g. NaN) therefore also lands in bucket 5.
    """
    upper_bounds = (18, 25, 30, 40, 60)
    for bucket, bound in enumerate(upper_bounds):
        if age < bound:
            return bucket
    return len(upper_bounds)

def hashcode(user):
    """Return the cluster key for a customer record.

    The key is the tuple (age group, sexo, segmento, cod_prov); the age group
    is derived from ``user.age`` via ``age_group``. ``user`` only needs to
    expose those four fields as attributes.
    """
    return (age_group(user.age), user.sexo, user.segmento, user.cod_prov)

In [None]:
# Maps a cluster key (see hashcode) to the list of customer ids (ncodpers)
# whose rows produced that key. One entry is appended per row of `data`.
clusterid_userids = defaultdict(list)

def split_by_clusters(row):
    """Append the row's customer id to the bucket for its cluster key.

    `row` must expose `ncodpers` plus the fields read by hashcode
    (age, sexo, segmento, cod_prov) as attributes.
    """
    userhash = hashcode(row)
    clusterid_userids[userhash].append(row.ncodpers)

# Iterate explicitly instead of `data.apply(..., axis=1)`: apply is meant for
# computing values, not for side effects. pandas releases before 1.1 documented
# that apply may call the function twice on the first row, which would append
# that customer id twice. itertuples also avoids building a Series per row, and
# selecting only the needed columns keeps the namedtuples small.
cluster_columns = ['ncodpers', 'age', 'sexo', 'segmento', 'cod_prov']
for row in data[cluster_columns].itertuples(index=False):
    split_by_clusters(row)

In [None]:
# Number of collected customer ids per cluster key.
cluster_sizes = {
    cluster_key: len(member_ids)
    for cluster_key, member_ids in clusterid_userids.items()
}

median_size = np.median(list(cluster_sizes.values()))
print('Median cluster size:', median_size)

In [None]:
# Give every cluster a sequential integer id (in the iteration order of
# clusterid_userids) and write it to a new `cluster` column on `data`.
cluster_id = 0

# Create the column up front so every row has a value before assignment.
data.loc[:, 'cluster'] = 0

for ids in clusterid_userids.values():
    # Use one .loc call with a row mask and a column label. The previous
    # `data.cluster[mask] = ...` was chained assignment: it triggers
    # SettingWithCopyWarning and does not write back to `data` when pandas
    # copy-on-write is active.
    data.loc[data.ncodpers.isin(ids), 'cluster'] = cluster_id
    cluster_id += 1
    

data.head()

In [None]:
# Feature matrix: everything except the identifier, the country column and the
# cluster label, none of which should be rescaled.
clean_data_without_id = data.drop(
    ['ncodpers', 'pais_residencia', 'cluster'],
    axis='columns',
)

# Work on the raw numpy array; each column is min-max scaled independently
# (MinMaxScaler's default feature range is [0, 1]).
x = clean_data_without_id.values
x_scaled = MinMaxScaler().fit(x).transform(x)

# Wrapping the bare array yields default integer column labels and a default
# 0..n-1 row index.
normalized = pd.DataFrame(data=x_scaled)

In [None]:
# The scaled frame was built from a bare array, so restore the feature names
# from the frame the array was taken from.
normalized.columns = clean_data_without_id.columns.tolist()
normalized.tail()

In [None]:
# Put the cluster label and customer id back in front of the scaled features.
# The column-wise concat aligns rows on the index labels of both frames.
normalized_data = pd.concat(
    [data[['cluster', 'ncodpers']], normalized],
    axis='columns',
)

# Persist the result as a gzipped CSV without the row index.
normalized_data.to_csv(
    '../data/processed/train-normalized.csv.gz',
    compression='gzip',
    index=False,
)

In [None]:
# Preview the first rows of the frame that was just written to disk.
normalized_data.head()