In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import gc; gc.enable()

from utilities import DfLowMemory, DfLowMemoryTest
from utilities import CleanData, CleanTestData

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from category_encoders import WOEEncoder, TargetEncoder
from xgboost import XGBClassifier
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training CSV through the project's DfLowMemory helper (imported from utilities).
# NOTE(review): presumably the helper reduces the frame's memory footprint while reading — confirm in utilities.
df_train = DfLowMemory('train_ver2.csv')

In [3]:
# Run the project's CleanData step over the training frame.
# NOTE(review): df_train is rebound to the cleaned result, so re-running only this cell
# feeds already-cleaned data back into CleanData — confirm that is harmless, or reload first.
df_train = CleanData(df_train)

In [4]:
# Read the raw test set with plain pandas (no utilities loader is used for this file).
df_test = pd.read_csv(filepath_or_buffer='test_data.csv')

In [5]:
# Run the project's CleanTestData step over the test frame.
# NOTE(review): presumably this mirrors CleanData for the training frame — confirm in utilities.
df_test = CleanTestData(df_test)

In [6]:
# Pull three monthly snapshots out of the training data, selected by their fecha_dato value.
snapshot_date = df_train['fecha_dato']
df_train_may15 = df_train.loc[snapshot_date == '2015-05-28']
df_train_June15 = df_train.loc[snapshot_date == '2015-06-28']
df_train_may16 = df_train.loc[snapshot_date == '2016-05-28']

In [7]:
# Stack the three snapshots row-wise in this fixed order (May 2015, June 2015, May 2016).
monthly_snapshots = [df_train_may15, df_train_June15, df_train_may16]
df = pd.concat(monthly_snapshots)

In [8]:
# Keep a single row per customer id (ncodpers): the last one in df's current row order.
df = df.drop_duplicates(subset='ncodpers', keep='last')

In [9]:
# The prediction targets are every column whose name contains '_ult1', in df's column order.
targets = list(filter(lambda column: '_ult1' in column, df.columns.tolist()))

In [10]:
# Training feature columns: everything in df except the id/date columns and the
# 24 product indicator columns listed below.
# NOTE(review): used_cols is assigned again by the next cell, which derives it from df_test.
id_and_date_cols = ['ncodpers', 'fecha_dato', 'fecha_alta']
product_flag_cols = [
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1',
    'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1',
    'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
]
excluded_cols = id_and_date_cols + product_flag_cols
used_cols = [name for name in df.columns.tolist() if name not in excluded_cols]

In [11]:
# Feature columns for scoring: every df_test column except the id and date columns.
test_excluded_cols = ['ncodpers', 'fecha_dato', 'fecha_alta']
used_test_cols = [name for name in df_test.columns.tolist() if name not in test_excluded_cols]
# used_cols is rebound to the very same list object, replacing the value computed from df.
# NOTE(review): presumably this keeps train and test feature sets identical — confirm the
# overwrite is intentional; if so the earlier used_cols cell is dead code.
used_cols = used_test_cols

## Model Training

In [15]:
# Customer ids of the test set, in df_test row order.
ids = df_test['ncodpers'].values
# Accumulator: customer id -> list of predicted scores (the training loop appends one per target).
id_preds = defaultdict(list)

In [16]:
# Train one focal-loss XGBoost model per product column and score every test customer with it.
# Each customer's list in id_preds ends up with one score per target, in `targets` order.

# Start from an empty accumulator so re-running this cell does not stack a second
# round of scores on top of the first.
id_preds.clear()

for target in targets:
    # NOTE(review): .values hands the encoder a bare ndarray, so column names (and, for
    # mixed-type frames, per-column dtypes) are gone before TargetEncoder decides which
    # columns to encode — confirm that is intended.
    X = df[used_cols].values
    y = df[target]

    # A fresh encoder is fitted for every product because it is fitted against that product's y.
    encoder = TargetEncoder()
    X = encoder.fit_transform(X, y)
    model = imb_xgb(special_objective='focal', focal_gamma=2.0)
    model.fit(X.values, y.values)

    # Score the TEST customers. This must slice df_test, not df: the scores are paired
    # with `ids` (taken from df_test) purely by row position.
    X_test = df_test[used_test_cols].values
    X_test = encoder.transform(X_test)

    y_pred = model.predict_sigmoid(X_test.values)

    # Positional pairing is only valid when there is exactly one score per test id.
    if len(y_pred) != len(ids):
        raise ValueError(
            'prediction count %d does not match test id count %d for target %s'
            % (len(y_pred), len(ids), target)
        )

    for cust_id, score in zip(ids, y_pred):
        id_preds[cust_id].append(score)

In [33]:
# Plain-dict snapshot of the accumulator (same score lists, but no default-on-missing behaviour).
preds = {cust_id: scores for cust_id, scores in id_preds.items()}

In [34]:
# Per customer, the product positions ordered by predicted score.
# NOTE(review): np.argsort sorts ascending, so position 0 is the LOWEST-scoring product.
# Confirm consumers read the ranking from the end, or reverse it if best-first was intended.
list_of_prods = {
    customer: np.argsort(scores)
    for customer, scores in preds.items()
}

In [35]:
# Human-readable product labels; the list position is what identifies each product.
# NOTE(review): the order presumably mirrors the order of `targets` — confirm before relying on it.
target_names = [
    'Saving Account',
    'Guarantees',
    'Current Accounts',
    'Derivada Account',
    'Payroll Account',
    'Junior Account',
    'More Popular Account',
    'Particular Account',
    'Particular Plus Account',
    'Short-term deposits',
    'Medium-term deposits',
    'Long-term deposits',
    'e-account',
    'Funds',
    'Mortgage',
    'Pensions',
    'Loans',
    'Taxes',
    'Credit Card',
    'Securities',
    'Home Account',
    'Payroll',
    'Pensions2',
    'Direct Debit',
]

In [36]:
# Lookup table: position in target_names -> product label.
target_dict_names = dict(enumerate(target_names))

In [37]:
# One row per customer id that has a ranking, in list_of_prods' key order.
user_frame = pd.DataFrame({'user': list(list_of_prods)})

In [39]:
# Attach each customer's ranked product positions by looking the id up in list_of_prods.
ranked_positions = user_frame['user'].map(list_of_prods)
user_frame['prods'] = ranked_positions

In [40]:
# Work on the first 100000 rows only.
trial_frame = user_frame.iloc[:100000]

In [41]:
def _positions_to_labels(positions):
    """Turn one customer's ranked product positions into a Series of product labels."""
    return pd.Series(positions).map(target_dict_names)


# Expand the rankings into a frame: one row per customer, one column per rank position.
a = trial_frame['prods'].apply(_positions_to_labels)

In [42]:
# Add the customer id next to the labels (the assignment aligns on the row index).
first_users = user_frame['user'].iloc[:100000]
a['user_id'] = first_users

In [43]:
# Reorder the frame so that 'user_id' is the first column, followed by the rank columns.
cols = list(a)
# Move 'user_id' to the head of the list: pop it from its current position, re-insert at 0.
cols.insert(0, cols.pop(cols.index('user_id')))
# Label-based column selection. DataFrame.ix was removed in pandas 1.0; .loc is the
# label-based replacement and selects the same columns here.
a = a.loc[:, cols]