In [1]:
import pickle
import numpy as np
import pandas as pd
from os import path

from collections import defaultdict, Counter
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# The 24 product indicator column names (presumably the target columns of the
# training data -- TODO confirm against the dataset schema).
# NOTE(review): this list is not read by any of the cells visible in this
# notebook; the later cells take product names from `recommendations`.
products = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
            'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
            'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
            'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
            'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
            'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
            'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
            'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

In [3]:
# Load the normalized training frame in full.
df = pd.read_csv(
    '../data/processed/train-normalized.csv.gz',
    skipinitialspace=True,
)

# From the raw test file only the customer id column is needed.
test = pd.read_csv(
    '../data/raw/test.csv.zip',
    usecols=['ncodpers'],
    skipinitialspace=True,
)

In [4]:
# Left-join the training columns onto the test ids: ids without a match in
# `df` keep their row and get missing values in the joined columns.
# NOTE: this rebinds `test`, so the cell is not idempotent -- re-run the
# loading cell first if this one has to be executed again.
test = pd.merge(
    test,
    df,
    on='ncodpers',
    how='left',
    sort=True,
    copy=False,
)

In [5]:
def most_popular():
    """Return the hard-coded fallback list of seven product column names.

    A new list object is built on every call, so callers are free to
    mutate the value they receive.
    """
    fallback = (
        'ind_cco_fin_ult1',
        'ind_recibo_ult1',
        'ind_ctop_fin_ult1',
        'ind_cno_fin_ult1',
        'ind_ecue_fin_ult1',
        'ind_nom_pens_ult1',
        'ind_nomina_ult1',
    )
    return list(fallback)

# Per-customer product recommendations. Looking up an id that has no stored
# entry yields (and stores) the most_popular() fallback list.
recommendations = defaultdict(most_popular)

PERS_PRODS = '../data/processed/personal.pickle'

if path.isfile(PERS_PRODS):
    print('File already exists. Start loading...')
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only load a pickle produced by this project's pipeline.
    with open(PERS_PRODS, 'rb') as f:
        recommendations.update(pickle.load(f))
    print('Done!')
else:
    # Keep the most_popular() default factory defined above. Replacing it
    # with defaultdict(list) here would give every customer an empty list,
    # which in turn makes every downstream ranking and prediction empty.
    print('Pickle file is not presented. Using defaultdict.')

print('Recommendations length', len(recommendations))

File already exists. Start loading...
Done!
Recommendations length 956645


In [6]:
# For every cluster, rank products by how often they occur in the
# recommendation lists of the customers belonging to that cluster.
cluster_products = defaultdict(list)

# A single groupby pass replaces one full boolean-mask scan of `df` per
# cluster. Rows with a missing cluster are skipped by groupby; the previous
# mask comparison against NaN matched no rows either, and a later lookup of
# such a key still yields [] through the defaultdict.
for c, ids in df.groupby('cluster', sort=False)['ncodpers']:
    counts = Counter()

    for id_ in ids.values:
        # Indexing (not .get) on purpose: unknown ids contribute the
        # default list of the `recommendations` defaultdict.
        counts.update(recommendations[id_])

    # most_common() orders by count descending; equal counts keep the order
    # in which the products were first seen.
    cluster_products[c] = [prod for prod, _ in counts.most_common()]

In [7]:
# Number of products written per customer.
MAX_RECS = 7

# Maps customer id -> space-separated product names.
result = {}

# NOTE: iterrows() is slow on large frames, but the loop needs access to
# arbitrary product columns of each row by name.
for _, row in test.iterrows():
    cluster = row.cluster

    # Customers without a match in the training data carry NaN (not None)
    # in `cluster` after the left merge, so test with pd.isnull().
    if pd.isnull(cluster):
        prods = most_popular()
    else:
        # Copy so that the padding below never mutates the list stored
        # inside `recommendations`.
        prods = list(recommendations[row.ncodpers])

    if len(prods) < MAX_RECS:
        # Pad with the cluster's ranked products, skipping duplicates and
        # products whose indicator column is already 1 for this customer.
        for p in cluster_products[cluster]:
            if p not in prods and row[p] != 1:
                prods.append(p)
                if len(prods) == MAX_RECS:
                    break

    # iterrows() may upcast the id to float when the row mixes dtypes;
    # store it as a plain int so the output keeps integer ids.
    result[int(row.ncodpers)] = ' '.join(prods[:MAX_RECS])

In [19]:
# Build the submission frame. The integer dtype is applied to the id column
# only: passing dtype=np.int32 to the constructor would also target the
# string column `added_products`, which recent pandas versions reject.
predictions = pd.DataFrame(list(result.items()), columns=['ncodpers', 'added_products'])
predictions['ncodpers'] = predictions['ncodpers'].astype(np.int32)

In [18]:
# Persist the predictions as a gzip-compressed CSV without the index column.
predictions.to_csv(
    '../data/predicted.csv.gz',
    compression='gzip',
    index=False,
)