In [1]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb

from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
from collections import defaultdict

%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
# Read the pre-processed training snapshots, stripping spaces that follow delimiters.
train = pd.read_csv(
    '../data/processed/train-processed.csv.gz',
    skipinitialspace=True,
)

In [3]:
# Read the pre-processed test snapshot, stripping spaces that follow delimiters.
test = pd.read_csv(
    '../data/processed/test-processed.csv.gz',
    skipinitialspace=True,
)

In [4]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,sexo,age,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,indresi,indext,cod_prov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,2015-05-28,15889,3.0,1.0,4,0.0,245.0,1.0,0.0,0.0,1.0,0.0,39.0,1.0,4.0,0.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0.0,0.0,0
1,2015-06-28,15889,3.0,1.0,4,0.0,245.0,1.0,0.0,0.0,1.0,0.0,39.0,1.0,4.0,0.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0.0,0.0,0
2,2016-05-28,15889,3.0,1.0,4,0.0,255.0,1.0,0.0,0.0,1.0,0.0,39.0,1.0,4.0,0.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0.0,0.0,0
3,2015-05-28,15890,4.0,1.0,4,0.0,246.0,1.0,0.0,0.0,1.0,0.0,39.0,1.0,0.0,1.0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1.0,1.0,1
4,2015-06-28,15890,4.0,1.0,4,0.0,246.0,1.0,0.0,0.0,1.0,0.0,39.0,1.0,0.0,1.0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1.0,1.0,1


In [5]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,sexo,age,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,indresi,indext,cod_prov,ind_actividad_cliente,renta,segmento
0,2016-06-28,15889,3.0,1.0,4,0,256,1,0.0,0.0,1.0,0.0,39.0,1,4.0,0.0
1,2016-06-28,1170544,0.0,0.0,3,0,34,1,0.0,2.0,1.0,0.0,5.0,0,,1.0
2,2016-06-28,1170545,0.0,1.0,1,0,34,1,0.0,0.0,1.0,0.0,48.0,1,,2.0
3,2016-06-28,1170547,0.0,0.0,1,0,34,1,0.0,2.0,1.0,0.0,22.0,0,2.0,2.0
4,2016-06-28,1170548,0.0,0.0,1,0,34,1,0.0,2.0,1.0,0.0,4.0,0,1.0,2.0


In [6]:
# Names of all columns present in the test frame.
features = np.array(test.columns)

# Product-ownership flag columns; a product's position in this array is its class label.
products = np.array([
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
])

In [7]:
def process_data(df, cust_dict, feature_cols=None, product_cols=None):
    """Turn monthly customer snapshots into model rows and labels.

    Rows are handled one snapshot date at a time, in chronological order, so
    the result does not depend on how the rows of ``df`` are sorted:

    * ``2015-05-28`` / ``2016-05-28``: the customer's product flags are stored
      in ``cust_dict`` (previous-month holdings); no model row is emitted.
    * ``2015-06-28``: for every product the customer holds now but did not
      hold in the stored previous month, one training row (features followed
      by the previous-month flags) and one label (the product's position in
      ``product_cols``) are emitted.
    * ``2016-06-28``: one row (features followed by the previous-month flags)
      is emitted per customer; no label is produced.

    Rows with any other ``fecha_dato`` are ignored. Customers without a stored
    previous month are treated as having held no products.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fecha_dato`` and ``ncodpers``, every column in
        ``feature_cols``, and, for the 2015-05/2015-06/2016-05 snapshots,
        every column in ``product_cols``.
    cust_dict : dict
        Maps customer id to previous-month product flags. Updated in place and
        also returned.
    feature_cols : sequence of str, optional
        Columns copied into each model row. Defaults to the notebook-level
        ``features``.
    product_cols : sequence of str, optional
        Product flag columns. Defaults to the notebook-level ``products``.

    Returns
    -------
    (x_vars_list, y_vars_list, cust_dict)
        ``x_vars_list`` is a list of 1-D arrays, ``y_vars_list`` a list of int
        class labels (empty when no 2015-06-28 rows are present).
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if feature_cols is None:
        feature_cols = features
    if product_cols is None:
        product_cols = products
    feature_cols = list(feature_cols)
    product_cols = list(product_cols)

    print("Start processing...")

    x_vars_list = []
    y_vars_list = []

    # Stand-in holdings for customers with no stored previous month. It is only
    # ever read (np.concatenate copies it), so one shared list is enough.
    no_history = [0] * len(product_cols)

    history_dates = ('2015-05-28', '2016-05-28')
    chronological = ('2015-05-28', '2015-06-28', '2016-05-28', '2016-06-28')

    dates = df['fecha_dato']
    for snapshot in chronological:
        # Processing one date at a time guarantees that each May snapshot is
        # stored before the following June snapshot reads it, and that the
        # 2016-05 holdings never overwrite 2015-05 before 2015-06 is handled.
        snapshot_rows = df[dates == snapshot]

        for _, row in snapshot_rows.iterrows():
            cust_id = int(row['ncodpers'])

            if snapshot in history_dates:
                # Store an independent copy of this month's product flags.
                cust_dict[cust_id] = row[product_cols].values.copy()
                continue

            x_vars = row[feature_cols].values
            prev_target_list = cust_dict.get(cust_id, no_history)

            if snapshot == '2016-06-28':
                x_vars_list.append(np.concatenate((x_vars, prev_target_list)))
                continue

            # 2015-06-28: keep only customers who added at least one product
            # compared with the previous month, one training row per added product.
            target_list = row[product_cols].values
            new_products = [max(x1 - x2, 0) for (x1, x2) in zip(target_list, prev_target_list)]
            if sum(new_products) > 0:
                for ind, prod in enumerate(new_products):
                    if prod > 0:
                        assert len(prev_target_list) == len(product_cols)
                        x_vars_list.append(np.concatenate((x_vars, prev_target_list)))
                        y_vars_list.append(ind)

    print("Done!")

    return x_vars_list, y_vars_list, cust_dict

In [8]:
# Build training rows and labels, starting from an empty previous-month lookup.
X_train, y_train, cust_dict = process_data(df=train, cust_dict=dict())

Start processing...
Done!


In [9]:
# Build test rows, reusing the previous-month lookup filled while processing `train`.
X_test, y_test, cust_dict = process_data(df=test, cust_dict=cust_dict)

Start processing...
Done!


In [10]:
# Pickle locations for the processed training rows and labels.
x_train_path, y_train_path = (
    '../data/processed/x_train.pickle',
    '../data/processed/y_train.pickle',
)

# Pickle locations for the processed test rows and labels.
x_test_path, y_test_path = (
    '../data/processed/x_test.pickle',
    '../data/processed/y_test.pickle',
)

In [11]:
# Write each processed dataset to its pickle file.
for target_path, payload in (
    (x_train_path, X_train),
    (y_train_path, y_train),
    (x_test_path, X_test),
    (y_test_path, y_test),
):
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)

In [12]:
# Read the four pickled datasets back and convert each one to a NumPy array.
loaded_arrays = []
for pickle_path in (x_train_path, y_train_path, x_test_path, y_test_path):
    with open(pickle_path, 'rb') as f:
        loaded_arrays.append(np.array(pickle.load(f)))

X_train, y_train, X_test, y_test = loaded_arrays

In [13]:
def runXGB(train_X, train_y, seed_val=123):
    """Train a multi-class XGBoost booster and return it.

    Parameters
    ----------
    train_X : array-like
        Feature matrix passed to ``xgb.DMatrix``.
    train_y : array-like
        Class labels; the number of classes is ``len(products)``.
    seed_val : int, optional
        Value used for the booster's ``seed`` parameter.

    Returns
    -------
    The booster returned by ``xgb.train`` after 190 boosting rounds.
    """
    booster_params = {
        'objective': 'multi:softprob',
        'eta': 0.05,
        'max_depth': 1,
        'silent': 1,
        'num_class': len(products),
        'eval_metric': "mlogloss",
        'min_child_weight': 4,
        'subsample': 0.9,
        'colsample_bytree': 0.8,
        'gamma': 0.2,
        'seed': seed_val,
    }
    boosting_rounds = 190

    dtrain = xgb.DMatrix(train_X, label=train_y)
    # xgb.train receives the parameters as a list of (name, value) pairs.
    return xgb.train(list(booster_params.items()), dtrain, boosting_rounds)

In [14]:
print("Building model..")
# Each row of X_train / X_test is laid out as [*features, *previous-month product flags].
# 'Unnamed: 0' (row index written by to_csv), 'fecha_dato' (a date string) and
# 'ncodpers' (the customer id) are bookkeeping columns, not predictors. Dropping only
# column 0 would leave the date string in the matrix, which cannot be converted to
# float when the DMatrix is built. They are dropped by name so the selection does not
# depend on column position.
non_feature_cols = ('Unnamed: 0', 'fecha_dato', 'ncodpers')
drop_idx = [i for i, col in enumerate(features) if col in non_feature_cols]

# The row arrays have object dtype; cast explicitly to a numeric matrix.
X_train_model = np.delete(X_train, drop_idx, axis=1).astype(np.float32)
X_test_model = np.delete(X_test, drop_idx, axis=1).astype(np.float32)
assert X_train_model.shape[1] == X_test_model.shape[1], "train/test feature count mismatch"

model = runXGB(X_train_model, np.array(y_train), seed_val=0)
print("Predicting..")
xgtest = xgb.DMatrix(X_test_model)
preds = model.predict(xgtest)

print("Done!")

Building model..
Predicting..
Done!


In [15]:
print("Getting the top products..")
product_names = np.asarray(products)

# Class indices ordered by descending predicted probability, keeping the first 8.
# The result goes into a new name so that `preds` keeps holding the probabilities
# and this cell gives the same answer when it is run again.
top_product_idx = np.fliplr(np.argsort(preds, axis=1))[:, :8]
test_id = np.array(test.ncodpers)
assert len(test_id) == len(top_product_idx), "one prediction row is expected per test row"

final_preds = [" ".join(product_names[idx]) for idx in top_product_idx]
# Pin the column order explicitly; it otherwise follows dict ordering, which is
# alphabetical on older pandas/Python versions.
out_df = pd.DataFrame(
    {'ncodpers': test_id, 'added_products': final_preds},
    columns=['ncodpers', 'added_products'],
)

print("Done!")

Getting the top products..
Done!


In [18]:
# Write the predictions as a gzip-compressed CSV without the index column.
out_df.to_csv(
    '../data/predicted/pred.csv.gz',
    index=False,
    compression='gzip',
)