In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import gc; gc.enable()

from utilities import DfLowMemory, DfLowMemoryTest
from utilities import CleanData, CleanTestData

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from category_encoders import WOEEncoder, TargetEncoder
from xgboost import XGBClassifier
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = DfLowMemory('train_ver2.csv')

In [3]:
df_train = CleanData(df_train)

In [4]:
df_test = pd.read_csv('test_data.csv')

In [5]:
df_test = CleanTestData(df_test)

In [6]:
df_train_may15 = df_train[df_train['fecha_dato'] == '2015-05-28']
df_train_June15 = df_train[df_train['fecha_dato'] == '2015-06-28']
df_train_may16 = df_train[df_train['fecha_dato'] == '2016-05-28']

In [7]:
df = pd.concat([df_train_may15, df_train_June15, df_train_may16])

In [8]:
df.drop_duplicates(subset='ncodpers',keep='last', inplace=True)

In [9]:
targets = [c for c in df.columns.tolist() if '_ult1' in c]

In [14]:
used_cols = [c for c in df.columns.tolist() if c not in ['ncodpers', 'fecha_dato', 'fecha_alta',
'ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1','ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1','ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1',
 'ind_hip_fin_ult1','ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1',
 'ind_nomina_ult1', 'ind_nom_pens_ult1','ind_recibo_ult1']]

In [17]:
used_test_cols = used_cols = [c for c in df_test.columns.tolist() if c not in ['ncodpers', 'fecha_dato', 'fecha_alta']]

## Model Training

In [22]:
id_preds = defaultdict(list)
ids = df_test['ncodpers'].values

In [24]:
for target in targets:
    X = df[used_cols].values
    y = df[target]

    encoder = TargetEncoder()
    X = encoder.fit_transform(X, y)
    model = imb_xgb(special_objective='focal', focal_gamma=2.0)
    model.fit(X.values, y.values)
    
    X_test = df[used_test_cols].values
    X_test = encoder.transform(X_test)
    
    y_pred = model.predict_sigmoid(X_test.values)
    
    for i, cust_id in enumerate(ids):
        id_preds[cust_id].append(y_pred[i])

it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1
it1


In [178]:
trial = dict(id_preds)

In [179]:
list_of_prods = {}

for customer in trial.keys():
    list_of_prods[customer] = np.argsort(trial[customer])

In [224]:
target_dict = {}
for i, target in enumerate(targets):
    #target_dict[target] = i
     target_dict[i] = target
    

In [226]:
target_dict_names = {}

for i, target in enumerate(target_names):
     target_dict_names[i] = target

In [225]:
target_names = ['Saving Account', 'Guarantees', 'Current Accounts', 'Derivada Account', 'Payroll Account', 'Junior Account',
               'More Popular Account', 'Particular Account', 'Particular Plus Account', 'Short-term deposits',
               'Medium-term deposits', 'Long-term deposits', 'e-account', 'Funds', 'Mortgage', 'Pensions', 'Loans', 'Taxes',
               'Credit Card', 'Securities', 'Home Account', 'Payroll', 'Pensions2', 'Direct Debit']

In [155]:
gc.collect()

7268

In [195]:
user_frame = pd.DataFrame(list(list_of_prods), columns=['user'])
user_frame.head()

Unnamed: 0,user
0,15889
1,1170544
2,1170545
3,1170547
4,1170548


In [196]:
user_frame['prods'] = user_frame['user'].map(list_of_prods)
user_frame.head()

Unnamed: 0,user,prods
0,15889,"[14, 1, 10, 5, 9, 11, 15, 0, 22, 21, 3, 18, 13..."
1,1170544,"[14, 1, 10, 5, 8, 20, 15, 0, 19, 22, 3, 13, 16..."
2,1170545,"[1, 5, 0, 3, 14, 10, 7, 8, 20, 16, 13, 15, 11,..."
3,1170547,"[14, 1, 10, 5, 9, 11, 15, 0, 22, 21, 3, 13, 18..."
4,1170548,"[14, 1, 10, 5, 9, 11, 15, 0, 22, 21, 3, 13, 18..."


In [228]:
trial_frame = user_frame[:100000]

In [229]:
a = trial_frame['prods'].apply(lambda x: pd.Series(x).map(target_dict_names))

In [230]:
a['user_id'] = user_frame['user'][:100000]

In [231]:
cols = list(a)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('user_id')))
a = a.ix[:, cols]

In [233]:
a.head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,15889,Mortgage,Guarantees,Medium-term deposits,Junior Account,Short-term deposits,Long-term deposits,Pensions,Saving Account,Pensions2,...,Securities,Loans,Payroll Account,Home Account,More Popular Account,Taxes,e-account,Particular Plus Account,Particular Account,Current Accounts
1,1170544,Mortgage,Guarantees,Medium-term deposits,Junior Account,Particular Plus Account,Home Account,Pensions,Saving Account,Securities,...,Particular Account,Taxes,Long-term deposits,Payroll,More Popular Account,e-account,Payroll Account,Direct Debit,Short-term deposits,Current Accounts
2,1170545,Guarantees,Junior Account,Saving Account,Derivada Account,Mortgage,Medium-term deposits,Particular Account,Particular Plus Account,Home Account,...,Credit Card,Taxes,Securities,e-account,More Popular Account,Payroll,Pensions2,Payroll Account,Direct Debit,Current Accounts
3,1170547,Mortgage,Guarantees,Medium-term deposits,Junior Account,Short-term deposits,Long-term deposits,Pensions,Saving Account,Pensions2,...,Securities,Loans,Payroll Account,Home Account,More Popular Account,Taxes,e-account,Particular Plus Account,Particular Account,Current Accounts
4,1170548,Mortgage,Guarantees,Medium-term deposits,Junior Account,Short-term deposits,Long-term deposits,Pensions,Saving Account,Pensions2,...,Securities,Loans,Payroll Account,Home Account,More Popular Account,Taxes,Particular Plus Account,e-account,Particular Account,Current Accounts


In [None]:
for customer in list_of_prods.keys():
    for value in customer

In [98]:
list_of_prods = {}

for customer in test.keys():
    for value in customer:
        if value == 0:
            value = 'ind_ahor_fin_ult1'
        elif value == 1:
            value = 'ind_aval_fin_ult1'
        elif value == 2:
            value = 'ind_cco_fin_ult1'
        elif value == 3:
            value = 'ind_cder_fin_ult1'
        elif value == 4:
            value = 'ind_cno_fin_ult1'
        elif value == 5:
            value = 'ind_ctju_fin_ult1'
        elif value == 6:
            value = 'ind_ctma_fin_ult1'
        elif value == 7:
            value = 'ind_ctop_fin_ult1'
        elif value == 8:
            value = 'ind_ctpp_fin_ult1'
        elif value == 9:
            value = 'ind_deco_fin_ult1'
        elif value == 10:
            value = 'ind_deme_fin_ult1'
        elif value == 11:
            value = 'ind_dela_fin_ult1'
        elif value == 12:
            value = 'ind_ecue_fin_ult1'
        elif value == 13:
            value = 'ind_fond_fin_ult1'
        elif value == 14:
            value = 'ind_hip_fin_ult1'
        elif value == 15:
            value = 'ind_plan_fin_ult1'
        elif value == 16:
            value = 'ind_pres_fin_ult1'
        elif value == 17:
            value = 'ind_reca_fin_ult1'
        elif value == 18:
            value = 'ind_tjcr_fin_ult1'
        elif value == 19:
            value = 'ind_valo_fin_ult1'
        elif value == 20:
            value = 'ind_viv_fin_ult1'
        elif value == 21:
            value = 'ind_nomina_ult1'
        elif value == 22:
            value = 'ind_nom_pens_ult1'
        elif value == 23:
            value = 'ind_recibo_ult1' 


TypeError: 'numpy.int64' object is not iterable

In [96]:
newlist = []
for value in list_of_prods[15889]:
    if value == 0:
        value = 'ind_ahor_fin_ult1'
    elif value == 1:
        value = 'ind_aval_fin_ult1'
    elif value == 2:
        value = 'ind_cco_fin_ult1'
    elif value == 3:
        value = 'ind_cder_fin_ult1'
    elif value == 4:
        value = 'ind_cno_fin_ult1'
    elif value == 5:
        value = 'ind_ctju_fin_ult1'
    elif value == 6:
        value = 'ind_ctma_fin_ult1'
    elif value == 7:
        value = 'ind_ctop_fin_ult1'
    elif value == 8:
        value = 'ind_ctpp_fin_ult1'
    elif value == 9:
        value = 'ind_deco_fin_ult1'
    elif value == 10:
        value = 'ind_deme_fin_ult1'
    elif value == 11:
        value = 'ind_dela_fin_ult1'
    elif value == 12:
        value = 'ind_ecue_fin_ult1'
    elif value == 13:
        value = 'ind_fond_fin_ult1'
    elif value == 14:
        value = 'ind_hip_fin_ult1'
    elif value == 15:
        value = 'ind_plan_fin_ult1'
    elif value == 16:
        value = 'ind_pres_fin_ult1'
    elif value == 17:
        value = 'ind_reca_fin_ult1'
    elif value == 18:
        value = 'ind_tjcr_fin_ult1'
    elif value == 19:
        value = 'ind_valo_fin_ult1'
    elif value == 20:
        value = 'ind_viv_fin_ult1'
    elif value == 21:
        value = 'ind_nomina_ult1'
    elif value == 22:
        value = 'ind_nom_pens_ult1'
    elif value == 23:
        value = 'ind_recibo_ult1'   
    newlist.append(value)
        

In [89]:
list_of_prods[15889]

array([14,  1, 10,  5,  9, 11, 15,  0, 22, 21,  3, 18, 13, 23, 19, 16,  4,
       20,  6, 17, 12,  8,  7,  2], dtype=int64)

In [72]:
product_name = {}
for customer in list_of_prods.items():
    for value in list_of_prods[customer]:
        if value == 0:

TypeError: unhashable type: 'numpy.ndarray'

In [None]:
X = df[used_cols].values
y = df['ind_ahor_fin_ult1'].values

encoder = TargetEncoder()
X = encoder.fit_transform(X, y)
model = LogisticRegression()
model.fit(X, y)