In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# None removes the column cap so wide frames are displayed with every column.
pd.set_option('display.max_columns', None)

In [None]:
# Read the training table from the processed-data directory and preview it.
train_path = '../data/processed/train-normalized.csv.gz'
data = pd.read_csv(train_path, skipinitialspace=True)
data.head()

In [None]:
# Detach the customer-id column from the frame: `ids` keeps it for later
# lookups while `data` no longer carries it.
ids = data.pop('ncodpers')

In [None]:
# Fitted classifier per product column name.
models = {}
# Training-set probabilities per product column name.
model_preds = {}
# Customer id -> list of probabilities, one appended per trained product.
id_preds = defaultdict(list)

# Product indicator columns. Order matters: positions in the id_preds lists
# are mapped back to product names through this list.
_fin_codes = ('ahor', 'aval', 'cco', 'cder', 'cno', 'ctju', 'ctma',
              'ctop', 'ctpp', 'deco', 'deme', 'dela', 'ecue', 'fond',
              'hip', 'plan', 'pres', 'reca', 'tjcr', 'valo', 'viv')
products = ['ind_%s_fin_ult1' % code for code in _fin_codes]
products += ['ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

In [None]:
# Fit one logistic-regression classifier per product and record, for every
# customer, the predicted probability of holding that product.

# Start from empty accumulators so that re-running this cell does not append a
# second round of probabilities to id_preds (which would make the per-customer
# lists longer than `products` and break the index -> product-name mapping).
models.clear()
model_preds.clear()
id_preds.clear()

for product in products:
    print(product)
    y_train = data[product]
    # Features are every remaining column, including the other products'
    # indicators. `axis` is passed by keyword: the positional form is
    # deprecated and rejected by pandas 2.x.
    x_train = data.drop(product, axis=1)

    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    # Column 1 is the probability of the second class in clf.classes_
    # (the positive class when the labels are 0/1).
    p_train = clf.predict_proba(x_train)[:, 1]
    models[product] = clf
    model_preds[product] = p_train
    # Probabilities are appended in `products` order, so position k in a
    # customer's list corresponds to products[k].
    for customer_id, proba in zip(ids, p_train):
        id_preds[customer_id].append(proba)

    # In-sample AUC: scored on the same rows the model was fitted on.
    print(roc_auc_score(y_train, p_train))

In [None]:
# Map each customer id to the list of products whose indicator equals 1.0 in
# the training table.
already_active = defaultdict(list)

# Compare all indicator columns at once instead of doing a scalar .at lookup
# per (row, product). Rows are paired with ids by position, so this does not
# depend on the index labels being exactly 0..n-1.
held_mask = data[products].values == 1.0

for id_, held_row in zip(ids.values, held_mask):
    already_active[id_] = [name for name, held in zip(products, held_row) if held]

In [None]:
# Only the customer-id column of the test file is read.
test = pd.read_csv(
    '../data/raw/test.csv.zip',
    usecols=['ncodpers'],
    skipinitialspace=True,
)

In [None]:
# Fallback recommendation used when no product can be ranked for a customer.
most_popular = ['ind_cco_fin_ult1', 'ind_recibo_ult1', 'ind_ctop_fin_ult1', 'ind_cno_fin_ult1', 'ind_ecue_fin_ult1', 'ind_nom_pens_ult1', 'ind_nomina_ult1']
# Maximum number of products recommended per customer.
MAX_RECOMMENDATIONS = 7

# Customer id -> list of recommended product names.
train_preds = {}

for id_ in test.ncodpers.values:
    # .get() keeps the lookup read-only: indexing the defaultdicts directly
    # would insert an empty entry for every test customer missing from training.
    preds_probas = id_preds.get(id_, [])
    active = set(already_active.get(id_, ()))
    predicted = []

    # Walk products from highest to lowest predicted probability, skipping the
    # ones the customer already holds, and stop once the quota is filled.
    for prod_proba_idx in np.argsort(preds_probas)[::-1]:
        if len(predicted) == MAX_RECOMMENDATIONS:
            break
        candidate = products[prod_proba_idx]
        if candidate not in active:
            predicted.append(candidate)

    train_preds[id_] = predicted if predicted else most_popular

In [None]:
# Collapse each customer's recommendation list into one space-separated string.
# Iterating over train_preds itself visits every customer exactly once, and the
# isinstance guard leaves already-joined values alone, so neither duplicate ids
# in the test file nor a second run of this cell can split a string into
# single characters.
for ncodper, recommended in train_preds.items():
    if not isinstance(recommended, str):
        train_preds[ncodper] = ' '.join(recommended)

In [None]:
# One row per (customer id, recommendation value) pair held in train_preds.
predictions = pd.DataFrame(
    list(train_preds.items()),
    columns=['ncodpers', 'added_products'],
)
predictions.head()

In [None]:
# Write the table as a gzip-compressed CSV without the row index.
predictions.to_csv(
    '../data/predicted.csv.gz',
    compression='gzip',
    index=False,
)