In [31]:
!pip install xgboost

import pandas as pd
import numpy as np
import xgboost as xgb

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [32]:
%%time

np.random.seed(2018)

trn = pd.read_csv('train_ver2.csv', low_memory=False)
tst = pd.read_csv('test_ver2.csv', low_memory=False)

CPU times: user 1min 14s, sys: 9.23 s, total: 1min 24s
Wall time: 1min 24s


In [34]:
%%time

prods = trn.columns[24:].tolist()

trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

no_product = trn[prods].sum(axis=1) == 0
trn = trn[~no_product]

for col in trn.columns[24:]:
    tst[col] = 0
    
df = pd.concat([trn, tst], axis=0)

features = []

categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext',
                   'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'nomprov', 'segmento']

for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols


CPU times: user 25.6 s, sys: 10.8 s, total: 36.4 s
Wall time: 36.4 s


In [35]:
%%time

df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)


CPU times: user 3.32 s, sys: 0 ns, total: 3.32 s
Wall time: 3.31 s


In [36]:
%%time
df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)


CPU times: user 4.3 s, sys: 0 ns, total: 4.3 s
Wall time: 4.29 s


In [37]:
%%time
df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)


CPU times: user 1.65 s, sys: 0 ns, total: 1.65 s
Wall time: 1.65 s


In [38]:
%%time

features += ['age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel', 'indrel_1mes', 'ind_actividad_cliente']


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.34 µs


In [39]:
%%time
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x:0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x:0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']


CPU times: user 18.3 s, sys: 96 ms, total: 18.4 s
Wall time: 18.4 s


In [40]:
%%time
df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x:0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x:0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']


CPU times: user 6.22 s, sys: 4 ms, total: 6.22 s
Wall time: 6.22 s


In [41]:
%%time
df.fillna(-99, inplace=True)

def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")]
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date

df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)

df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers','int_date'] else col for col in df.columns ]
df_lag['int_date'] += 1

df_trn = df.merge(df_lag, on=['ncodpers', 'int_date'], how='left')

del df, df_lag
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
    
df_trn.fillna(-99, inplace=True)

features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]



CPU times: user 1min 2s, sys: 30.1 s, total: 1min 32s
Wall time: 1min 33s


In [42]:
%%time
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']

del df_trn

X=[]
Y=[]

for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)
    
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y


vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

## - TESTING


CPU times: user 3.64 s, sys: 828 ms, total: 4.47 s
Wall time: 4.47 s


In [45]:
%%time
## XXGBoost model training


param = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
}

X_trn = XY_trn.as_matrix(columns=features)
Y_trn = XY_trn.as_matrix(columns=['y'])

dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)

CPU times: user 124 ms, sys: 0 ns, total: 124 ms
Wall time: 121 ms


In [46]:
X_vld = XY_vld.as_matrix(columns=features)
Y_vld = XY_vld.as_matrix(columns=['y'])

dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

In [48]:
%%time
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)

import pickle


[0]	train-mlogloss:2.73433	eval-mlogloss:2.74233
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:2.48344	eval-mlogloss:2.49555
[2]	train-mlogloss:2.30469	eval-mlogloss:2.31939
[3]	train-mlogloss:2.15884	eval-mlogloss:2.17532
[4]	train-mlogloss:2.03811	eval-mlogloss:2.05536
[5]	train-mlogloss:1.9436	eval-mlogloss:1.96203
[6]	train-mlogloss:1.86333	eval-mlogloss:1.88254
[7]	train-mlogloss:1.7903	eval-mlogloss:1.81012
[8]	train-mlogloss:1.73062	eval-mlogloss:1.75126
[9]	train-mlogloss:1.67667	eval-mlogloss:1.69764
[10]	train-mlogloss:1.62686	eval-mlogloss:1.64816
[11]	train-mlogloss:1.58347	eval-mlogloss:1.60491
[12]	train-mlogloss:1.54526	eval-mlogloss:1.56676
[13]	train-mlogloss:1.5097	eval-mlogloss:1.53164
[14]	train-mlogloss:1.47852	eval-mlogloss:1.50091
[15]	train-mlogloss:1.4491	eval-mlogloss:1.47187
[16]	train-mlogloss:1.42437	eval-mlogloss:1.44772
[17]	train-ml

[161]	train-mlogloss:1.00085	eval-mlogloss:1.08825
[162]	train-mlogloss:1.00026	eval-mlogloss:1.0882
[163]	train-mlogloss:0.999756	eval-mlogloss:1.08825
[164]	train-mlogloss:0.999207	eval-mlogloss:1.0882
[165]	train-mlogloss:0.998629	eval-mlogloss:1.08815
[166]	train-mlogloss:0.998034	eval-mlogloss:1.08812
[167]	train-mlogloss:0.997484	eval-mlogloss:1.08807
[168]	train-mlogloss:0.996733	eval-mlogloss:1.08804
[169]	train-mlogloss:0.996099	eval-mlogloss:1.08798
[170]	train-mlogloss:0.995525	eval-mlogloss:1.08791
[171]	train-mlogloss:0.995027	eval-mlogloss:1.08789
[172]	train-mlogloss:0.994355	eval-mlogloss:1.08792
[173]	train-mlogloss:0.993913	eval-mlogloss:1.08795
[174]	train-mlogloss:0.993255	eval-mlogloss:1.0879
[175]	train-mlogloss:0.992711	eval-mlogloss:1.08793
[176]	train-mlogloss:0.992209	eval-mlogloss:1.08791
[177]	train-mlogloss:0.991645	eval-mlogloss:1.08794
[178]	train-mlogloss:0.991089	eval-mlogloss:1.08799
[179]	train-mlogloss:0.990435	eval-mlogloss:1.08796
[180]	train-mlogl

NameError: name 'picle' is not defined

In [51]:
pickle.dump(model, open("xgb.baseline.pkl", "wb"))
best_ntree_limit = model.best_ntree_limit

In [52]:
!ls

Kaggle_Chapter2_Code2-15.ipynb				test_ver2.csv
Kaggle_Chapter2_Santander_Product_Recommendation.ipynb	test_ver2.csv.zip
labels.csv						train_ver2.csv
lost+found						train_ver2.csv.zip
notebooks						xgb.baseline.pkl
sample_submission.csv.zip
