In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from collections import defaultdict

# Render matplotlib figures directly in the notebook output cells.
%matplotlib inline
# Passing None removes the column cap so wide frames are displayed in full.
pd.set_option('display.max_columns', None)

In [None]:
# Load the pre-processed training snapshots; skipinitialspace drops blanks
# that follow each delimiter.
train = pd.read_csv(
    '../data/processed/train-processed.csv.gz',
    skipinitialspace=True,
)

In [None]:
# Load the pre-processed test snapshots with the same parsing options as the
# training file.
test = pd.read_csv(
    '../data/processed/test-processed.csv.gz',
    skipinitialspace=True,
)

In [None]:
# Number of distinct customer ids in the training frame, followed by a preview
# of its first rows.
print(len(train['ncodpers'].unique()))
train.head()

In [None]:
# Number of distinct customer ids in the test frame, followed by a preview of
# its first rows.
print(len(test['ncodpers'].unique()))
test.head()

In [None]:
# Columns copied from each row into the model input, in this exact order.
# NOTE(review): the customer id 'ncodpers' is itself part of the feature vector.
features = [
    # plain customer attributes
    'ncodpers', 'sexo', 'age', 'ind_nuevo', 'antiguedad',
    'indrel', 'indresi', 'indext', 'ind_actividad_cliente', 'renta',
    # ind_empleado_* indicator columns (presumably one-hot encoded -- confirm)
    'ind_empleado_A', 'ind_empleado_B', 'ind_empleado_F',
    'ind_empleado_N', 'ind_empleado_S',
    # indrel_1mes_* indicator columns
    'indrel_1mes_1.0', 'indrel_1mes_2.0', 'indrel_1mes_3.0',
    'indrel_1mes_4.0', 'indrel_1mes_5.0', 'indrel_1mes_6.0',
    # tiprel_1mes_* indicator columns
    'tiprel_1mes_A', 'tiprel_1mes_I', 'tiprel_1mes_N',
    'tiprel_1mes_P', 'tiprel_1mes_R',
]

# Product ownership columns. The position of a name in this list is the class
# label used for training and the index used to decode predictions.
products = [
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1',
    'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1',
    'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
]

In [None]:
def process_data(df, cust_dict):
    """Convert dated customer snapshots into model rows and class labels.

    Rows are visited in frame order and handled according to ``fecha_dato``:

    * ``'2015-05-28'`` / ``'2016-05-28'``: the row's product columns are stored
      in ``cust_dict`` under the customer id; no model row is produced.
    * ``'2016-06-28'``: one model row is produced (feature columns followed by
      the stored holdings); no label is produced.
    * ``'2015-06-28'``: one model row and one label are produced for every
      product whose value exceeds the stored value.
    * any other date: the row is ignored.

    Because stored holdings are looked up while a June row is being processed,
    a customer's May row must appear earlier in ``df`` to be taken into
    account. Customers without a stored entry are treated as holding nothing.

    Relies on the module-level ``features`` and ``products`` column lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ncodpers``, ``fecha_dato`` and every column named in
        ``features`` and ``products``.
    cust_dict : dict
        Maps customer id to stored product holdings. Updated in place.

    Returns
    -------
    tuple
        ``(rows, labels, cust_dict)`` where ``rows`` is a list of 1-D arrays,
        ``labels`` is a list of indices into ``products`` and ``cust_dict`` is
        the same dictionary object that was passed in.
    """
    baseline_dates = ('2015-05-28', '2016-05-28')
    empty_holdings = [0] * len(products)
    samples, labels = [], []

    for _, record in df.iterrows():
        customer = int(record['ncodpers'])

        # Baseline months only refresh the per-customer holdings cache.
        if record['fecha_dato'] in baseline_dates:
            cust_dict[customer] = record[products].values[:]
            continue

        profile = record[features].values

        # Prediction month: emit the input row, there is no label to derive.
        if record['fecha_dato'] == '2016-06-28':
            held_before = cust_dict.get(customer, empty_holdings)
            samples.append(np.concatenate((profile, held_before)))
            continue

        if record['fecha_dato'] != '2015-06-28':
            continue

        # Training month: keep only customers who gained at least one product,
        # emitting one (row, label) pair per gained product.
        held_before = cust_dict.get(customer, empty_holdings)
        held_now = record[products].values
        gains = [max(now - before, 0) for now, before in zip(held_now, held_before)]
        if not sum(gains) > 0:
            continue

        for position, gain in enumerate(gains):
            if gain > 0:
                assert len(held_before) == len(products)
                samples.append(np.concatenate((profile, held_before)))
                labels.append(position)

    return samples, labels, cust_dict

In [None]:
# Build the training rows and labels, starting from an empty holdings cache.
X_train, y_train, cust_dict = process_data(df=train, cust_dict={})

In [None]:
# Build the test rows, reusing (and further updating) the holdings cache filled
# while processing the training frame. Labels are only produced for rows dated
# 2015-06-28, so y_test stays empty unless the test frame contains such rows.
X_test, y_test, cust_dict = process_data(df=test, cust_dict=cust_dict)

In [None]:
def runXGB(train_X, train_y, seed_val=123, num_class=None):
    """Train a multi-class XGBoost booster with the notebook's fixed settings.

    Parameters
    ----------
    train_X : array-like
        Feature matrix handed to ``xgb.DMatrix``.
    train_y : array-like
        Integer class labels, one per row of ``train_X``.
    seed_val : int, default 123
        Value stored under the ``seed`` booster parameter.
    num_class : int, optional
        Number of classes the model predicts. When omitted it is inferred from
        the labels as ``max(train_y) + 1``. The value was previously hard-coded
        to 22, which is too small for labels that index a 24-entry product
        list (0..23).

    Returns
    -------
    The trained booster returned by ``xgb.train``. With the
    ``multi:softprob`` objective its predictions have one column per class.

    Raises
    ------
    ValueError
        If ``num_class`` is omitted and ``train_y`` is empty, or if a label is
        not smaller than the explicit ``num_class``.
    """
    labels = np.asarray(train_y)
    if num_class is None:
        if labels.size == 0:
            raise ValueError("cannot infer num_class from an empty train_y")
        num_class = int(labels.max()) + 1
    elif labels.size and int(labels.max()) >= num_class:
        raise ValueError(
            "label %d is out of range for num_class=%d" % (int(labels.max()), num_class)
        )

    param = {
        'objective': 'multi:softprob',
        'eta': 0.05,
        'max_depth': 4,
        # NOTE(review): newer XGBoost releases use `verbosity` instead of
        # `silent` -- confirm against the installed version.
        'silent': 1,
        'num_class': num_class,
        'eval_metric': "mlogloss",
        'min_child_weight': 2,
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'seed': seed_val,
    }
    num_rounds = 190

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    model = xgb.train(plst, xgtrain, num_rounds)
    return model

In [None]:
# Fit the booster on the training rows, then score every test row.
print("Building model..")
model = runXGB(train_X=np.array(X_train), train_y=np.array(y_train), seed_val=0)

print("Predicting..")
xgtest = xgb.DMatrix(data=np.array(X_test))
# One row of per-class scores for each row of X_test.
preds = model.predict(xgtest)

In [None]:
print("Getting the top products..")
# An array (rather than a list) lets a row of class indices select the
# matching product names in a single indexing step.
products = np.array(products)
# NOTE: `preds` is overwritten with class indices here, so this cell must not
# be re-run without re-running the prediction cell first.
# Sort ascending, reverse each row, and keep the 8 highest-scoring classes.
preds = np.argsort(preds, axis=1)
preds = np.fliplr(preds)[:,:8]
# Only rows dated 2016-06-28 contribute a row to X_test (and so to `preds`).
# Take the ids from the same rows so they stay aligned even when the test
# frame also contains rows for other dates.
test_id = np.array(test.loc[test['fecha_dato'] == '2016-06-28', 'ncodpers'])
assert len(test_id) == len(preds), "customer ids and predictions are misaligned"

In [None]:
# One space-separated string of product names per prediction row, ordered as
# the class indices appear in that row.
final_preds = [" ".join(products[top_idx].tolist()) for top_idx in preds]
out_df = pd.DataFrame({
    'ncodpers': test_id,
    'added_products': final_preds,
})

In [None]:
# Write the predictions as a gzip-compressed CSV without the index column.
out_df.to_csv(
    '../data/predicted/pred.csv.gz',
    index=False,
    compression='gzip',
)