In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('../data/processed/train-normalized.csv.gz', skipinitialspace = True)
sample = pd.read_csv('../data/sample_submission.csv.zip')

In [3]:
ids = data['ncodpers']
data.drop(['ncodpers'], axis=1, inplace=True)

In [4]:
models = {}
model_preds = {}
id_preds = defaultdict(list)
products = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
            'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
            'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
            'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
            'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
            'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
            'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
            'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

In [5]:
for prod in products:
    data.ix[data[prod] > 0, prod] = 1

In [6]:
from sklearn.tree import DecisionTreeRegressor
from collections import Counter

for product in products:
        print(product)
        y_train = data[product]
        x_train = data.drop(product, 1)
        
        clf = LogisticRegression()
        clf.fit(x_train, y_train)
        p_train = clf.predict_proba(x_train)[:,1]
        
        models[product] = clf
        model_preds[product] = p_train
        for id, p in zip(ids, p_train):
            id_preds[id].append(p)
            
        print(roc_auc_score(y_train, p_train))

ind_ahor_fin_ult1
0.889947611352
ind_aval_fin_ult1
0.985322655647
ind_cco_fin_ult1
0.816963931059
ind_cder_fin_ult1
0.917073934071
ind_cno_fin_ult1
0.982352612866
ind_ctju_fin_ult1
0.999371882459
ind_ctma_fin_ult1
0.895467796994
ind_ctop_fin_ult1
0.907853957348
ind_ctpp_fin_ult1
0.853040776136
ind_deco_fin_ult1
0.89532168085
ind_deme_fin_ult1
0.925828453828
ind_dela_fin_ult1
0.928724531879
ind_ecue_fin_ult1
0.897128892465
ind_fond_fin_ult1
0.93529136318
ind_hip_fin_ult1
0.963722821896
ind_plan_fin_ult1
0.933202077579
ind_pres_fin_ult1
0.924483578273
ind_reca_fin_ult1
0.904142454006
ind_tjcr_fin_ult1
0.939846720729
ind_valo_fin_ult1
0.923119645312
ind_viv_fin_ult1
0.908051282634
ind_nomina_ult1
0.999234053631
ind_nom_pens_ult1
0.999383813787
ind_recibo_ult1
0.927231268842


In [17]:
i = 0
already_active = {}
id_prod = pd.concat([ids, data[products]], axis=1)
for row in id_prod.values:
    if i % 50000 == 0:
        print(i, row)
    i += 1
    
    row = list(row)
    id = row.pop(0)
    active = [c[0] for c in zip(id_prod.columns[1:], row) if c[1] > 0]
    already_active[id] = active

(0, array([ 657640.,       0.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,       0.]))
(50000, array([  6.87397000e+05,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00]))
(100000, array([  6.00517000e+05,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
 

In [19]:
already_active

{15889.0: ['ind_cco_fin_ult1',
  'ind_ctpp_fin_ult1',
  'ind_tjcr_fin_ult1',
  'ind_valo_fin_ult1'],
 15890.0: ['ind_cno_fin_ult1',
  'ind_ctpp_fin_ult1',
  'ind_ecue_fin_ult1',
  'ind_plan_fin_ult1',
  'ind_tjcr_fin_ult1',
  'ind_nomina_ult1',
  'ind_nom_pens_ult1',
  'ind_recibo_ult1'],
 15892.0: ['ind_cco_fin_ult1',
  'ind_cno_fin_ult1',
  'ind_dela_fin_ult1',
  'ind_ecue_fin_ult1',
  'ind_reca_fin_ult1',
  'ind_tjcr_fin_ult1',
  'ind_valo_fin_ult1',
  'ind_recibo_ult1'],
 15893.0: ['ind_dela_fin_ult1', 'ind_valo_fin_ult1'],
 15894.0: ['ind_cco_fin_ult1',
  'ind_cno_fin_ult1',
  'ind_dela_fin_ult1',
  'ind_ecue_fin_ult1',
  'ind_reca_fin_ult1',
  'ind_tjcr_fin_ult1',
  'ind_valo_fin_ult1',
  'ind_nomina_ult1',
  'ind_nom_pens_ult1',
  'ind_recibo_ult1'],
 15895.0: ['ind_cco_fin_ult1',
  'ind_cno_fin_ult1',
  'ind_dela_fin_ult1',
  'ind_ecue_fin_ult1',
  'ind_plan_fin_ult1',
  'ind_reca_fin_ult1',
  'ind_tjcr_fin_ult1',
  'ind_valo_fin_ult1',
  'ind_recibo_ult1'],
 15896.0: ['ind_cco

In [18]:
train_preds = {}
for id, p in id_preds.items():
    # Here be dragons
    preds = [i[0] for i in sorted([i for i in zip(id_prod.columns[1:], p) if i[0] not in already_active[id]], key=lambda i:i [1], reverse=True)[:7]]
    train_preds[id] = preds
    
test_preds = []
for row in sample.values:
    id = row[0]
    p = train_preds[id]
    test_preds.append(' '.join(p))

NameError: name 'sample' is not defined