In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import gc; gc.enable()

from utilities import DfLowMemory, DfLowMemoryTest
from utilities import CleanData

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from category_encoders import WOEEncoder, TargetEncoder
from xgboost import XGBClassifier
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [3]:
# reduce the memory of the dataset
df_train = DfLowMemory('train_ver2.csv')

In [4]:
# cleaning the data set so that there are no missing values
df_train = CleanData(df_train)

In [5]:
#Selecting only a few months as proposed by BreakfastPirate
df_train_may15 = df_train[df_train['fecha_dato'] == '2015-05-28']
df_train_June15 = df_train[df_train['fecha_dato'] == '2015-06-28']
df_train_may16 = df_train[df_train['fecha_dato'] == '2016-05-28']

In [6]:
df = pd.concat([df_train_may15, df_train_June15, df_train_may16])

In [7]:
df.drop_duplicates(subset='ncodpers',keep='last', inplace=True)

In [8]:
targets = [c for c in df.columns.tolist() if '_ult1' in c]

In [9]:
# there are the columns that are going to be used to train the model (feature columns)
used_cols = [c for c in df.columns.tolist() if c not in ['ncodpers', 'fecha_dato', 'fecha_alta',
'ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1','ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1','ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1',
 'ind_hip_fin_ult1','ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1',
 'ind_nomina_ult1', 'ind_nom_pens_ult1','ind_recibo_ult1']]

In [10]:
used_cols = [c for c in df.columns.tolist() if c not in ['ncodpers', 'fecha_dato', 'fecha_alta']]

## Model Training

In [11]:
# isntantiating a dictionary to put the model results
id_preds = defaultdict(list)

#getting the ids of the customers in the dataset
ids = df['ncodpers'].values

In [119]:
predictions = []
# This is just showing the training of the first 24 products. In the future I am going to train all 24 products
for target in targets[:7]:
    X = df[used_cols].values
    y = df[target]

    encoder = TargetEncoder()
    X = encoder.fit_transform(X, y)
    
    train_size = int(len(X) * 0.5)
    y_train = int(len(y) * 0.5)
    X_train, X_test = X[0:train_size], X[train_size:]
    y_train, y_test = y[0:train_size], y[train_size:]
    
    model = imb_xgb(special_objective='focal', focal_gamma=2.0, 
                    num_round=50, max_depth=6, eta=0.1)
    model.fit(X_train.values, y_train.values)
    
    #X_test = df[used_cols].values
    X_test = encoder.transform(X_test.values)
    
#     y_pred = model.predict_proba(X_test.values)[:,1]
    y_pred = model.predict_two_class(X_test.values)[:,1] 
    
#     for i, cust_id in enumerate(ids):
#         id_preds[cust_id].append(y_pred[i])

    # for each target in targets, predict for the customer-base
    # and collect the predicted probabilities
    predictions.append(y_pred)

In [None]:
predictions = np.array(predictions).T

In [None]:
predictions

In [87]:
df_test = df[train_size:]

In [88]:
df_test = df_test.iloc[:,24:]

In [61]:
df[train_size:].shape

(281638, 48)

In [112]:
i=0
for target in targets[:7]:
    print(target)
    print((df_test[target], predictions[:,i]))
    i+=1

ind_ahor_fin_ult1


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [116]:
df[df['ind_ahor_fin_ult1'] == 1]

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
3482710,2015-06-28,241755,N,ES,H,44,2001-04-23,0.0,182,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
3586573,2015-06-28,177716,N,ES,V,44,2000-05-31,0.0,182,1.0,...,0,0,0,1,0,0,0,0.0,0.0,0
3711491,2015-06-28,508045,N,ES,V,52,2004-12-23,0.0,127,1.0,...,0,0,0,1,0,0,0,0.0,0.0,1
12742215,2016-05-28,635336,N,ES,V,49,2007-10-02,0.0,117,1.0,...,0,1,0,0,1,0,0,0.0,0.0,0
12777407,2016-05-28,525240,N,ES,V,43,2005-03-28,0.0,134,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
12796394,2016-05-28,508933,N,ES,H,54,2004-12-29,0.0,137,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
12796396,2016-05-28,508930,N,ES,V,57,2004-12-29,0.0,137,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
12796602,2016-05-28,508437,N,ES,V,60,2004-12-27,0.0,137,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
12799767,2016-05-28,500232,N,ES,H,64,2004-11-12,0.0,138,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
12800200,2016-05-28,501762,N,ES,H,54,2004-11-15,0.0,138,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


In [117]:
train_size

657154