In [28]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from collections import defaultdict

%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('../data/processed/train-normalized.csv.gz', skipinitialspace = True)

In [3]:
ids = data['ncodpers']
data.drop(['ncodpers'], axis=1, inplace=True)

In [4]:
models = {}
model_preds = {}
id_preds = defaultdict(list)
products = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
            'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
            'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
            'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
            'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
            'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
            'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
            'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

In [5]:
for prod in products:
    data.ix[data[prod] > 0, prod] = 1

In [29]:
from collections import Counter

for product in products:
        print(product)
        y_train = data[product]
        x_train = data.drop(product, 1)
        
        clf = MultinomialNB()
        clf.fit(x_train, y_train)
        p_train = clf.predict_proba(x_train)[:,1]
        
        models[product] = clf
        model_preds[product] = p_train
        for id, p in zip(ids, p_train):
            id_preds[id].append(p)
            
        print(roc_auc_score(y_train, p_train))

ind_ahor_fin_ult1
0.807638170345
ind_aval_fin_ult1
0.974611636247
ind_cco_fin_ult1
0.769251087472
ind_cder_fin_ult1
0.871356892589
ind_cno_fin_ult1
0.98039320662
ind_ctju_fin_ult1
0.97264474994
ind_ctma_fin_ult1
0.850588818116
ind_ctop_fin_ult1
0.780462069365
ind_ctpp_fin_ult1
0.813355567562
ind_deco_fin_ult1
0.833991440197
ind_deme_fin_ult1
0.887890441555
ind_dela_fin_ult1
0.86842348918
ind_ecue_fin_ult1
0.87337253706
ind_fond_fin_ult1
0.899928189091
ind_hip_fin_ult1
0.935076334515
ind_plan_fin_ult1
0.898496386652
ind_pres_fin_ult1
0.868768939006
ind_reca_fin_ult1
0.874925090566
ind_tjcr_fin_ult1
0.932529289624
ind_valo_fin_ult1
0.889829063108
ind_viv_fin_ult1
0.816670240896
ind_nomina_ult1
0.994051879412
ind_nom_pens_ult1
0.998271993755
ind_recibo_ult1
0.903406013103


In [7]:
already_active = {}
id_prod = pd.concat([ids, data[products]], axis=1)

for row in id_prod.values:    
    row = list(row)
    id = row.pop(0)
    active = [c[0] for c in zip(id_prod.columns[1:], row) if c[1] > 0]
    already_active[id] = active

In [8]:
# Here we store list of already boughted products for each user.
already_active[15889.0]

['ind_cco_fin_ult1',
 'ind_ctpp_fin_ult1',
 'ind_tjcr_fin_ult1',
 'ind_valo_fin_ult1']

In [9]:
def most_popular():
    return ['ind_cco_fin_ult1', 'ind_recibo_ult1', 'ind_ctop_fin_ult1', 'ind_cno_fin_ult1', 'ind_ecue_fin_ult1', 'ind_nom_pens_ult1', 'ind_nomina_ult1']

train_preds = defaultdict(most_popular)

for id, p in id_preds.items():
    # Here be dragons
    preds = [i[0] for i in sorted([i for i in zip(id_prod.columns[1:], p) if i[0] not in already_active[id]], key=lambda i:i [1], reverse=True)[:7]]
    train_preds[id] = preds

In [10]:
train_preds[342874356435]

['ind_cco_fin_ult1',
 'ind_recibo_ult1',
 'ind_ctop_fin_ult1',
 'ind_cno_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_nom_pens_ult1',
 'ind_nomina_ult1']

In [11]:
test = pd.read_csv('../data/raw/test.csv.zip', skipinitialspace = True, usecols=['ncodpers'])

In [12]:
test['added_products'] = ' '.join(train_preds[-1])
test.head()

Unnamed: 0,ncodpers,added_products
0,15889,ind_cco_fin_ult1 ind_recibo_ult1 ind_ctop_fin_...
1,1170544,ind_cco_fin_ult1 ind_recibo_ult1 ind_ctop_fin_...
2,1170545,ind_cco_fin_ult1 ind_recibo_ult1 ind_ctop_fin_...
3,1170547,ind_cco_fin_ult1 ind_recibo_ult1 ind_ctop_fin_...
4,1170548,ind_cco_fin_ult1 ind_recibo_ult1 ind_ctop_fin_...


In [15]:
test_result = {}

for idx, row in test.iterrows():
    test_result[row.ncodpers] = ' '.join(train_preds[row.ncodpers])
    if idx % 50000 == 0:
        test.head()
        print(idx, 'entries passed')

0 entries passed
50000 entries passed
100000 entries passed
150000 entries passed
200000 entries passed
250000 entries passed
300000 entries passed
350000 entries passed
400000 entries passed
450000 entries passed
500000 entries passed
550000 entries passed
600000 entries passed
650000 entries passed
700000 entries passed
750000 entries passed
800000 entries passed
850000 entries passed
900000 entries passed


In [16]:
test_result

{15889: 'ind_recibo_ult1 ind_ctop_fin_ult1 ind_plan_fin_ult1 ind_dela_fin_ult1 ind_ecue_fin_ult1 ind_fond_fin_ult1 ind_reca_fin_ult1',
 15890: 'ind_valo_fin_ult1 ind_reca_fin_ult1 ind_dela_fin_ult1 ind_hip_fin_ult1 ind_fond_fin_ult1 ind_ctop_fin_ult1 ind_cco_fin_ult1',
 15892: 'ind_ctpp_fin_ult1 ind_ctop_fin_ult1 ind_plan_fin_ult1 ind_fond_fin_ult1 ind_nom_pens_ult1 ind_hip_fin_ult1 ind_deme_fin_ult1',
 15893: 'ind_cco_fin_ult1 ind_ctop_fin_ult1 ind_ecue_fin_ult1 ind_fond_fin_ult1 ind_tjcr_fin_ult1 ind_ctpp_fin_ult1 ind_plan_fin_ult1',
 15894: 'ind_ctpp_fin_ult1 ind_fond_fin_ult1 ind_ctop_fin_ult1 ind_plan_fin_ult1 ind_hip_fin_ult1 ind_cder_fin_ult1 ind_deme_fin_ult1',
 15895: 'ind_fond_fin_ult1 ind_ctpp_fin_ult1 ind_ctop_fin_ult1 ind_hip_fin_ult1 ind_nom_pens_ult1 ind_deme_fin_ult1 ind_viv_fin_ult1',
 15896: 'ind_recibo_ult1 ind_dela_fin_ult1 ind_tjcr_fin_ult1 ind_plan_fin_ult1 ind_fond_fin_ult1 ind_cder_fin_ult1 ind_ctpp_fin_ult1',
 15897: 'ind_nomina_ult1 ind_dela_fin_ult1 ind_ctpp_

In [23]:
predicted = pd.DataFrame(list(test_result.items()), columns=['ncodpers', 'added_products'])

In [24]:
predicted.to_csv('../data/predicted.csv.gz', index=False, compression='gzip')