In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, mean_squared_error
import xgboost as xgb

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#preprocess data
def categorical_to_numerical(train, test):    
    train_ = train.copy()
    test_ = test.copy()  
    encoders_train = {}
    encoders_test = {} 
    for column in train.columns[:-1]:
        if train[column].dtype == np.object:
            label = LabelEncoder()
            encoders_train[column] = label.fit_transform(train[column])
            train_[column] = label.transform(train[column])
            
            label = LabelEncoder()
            encoders_test[column] = label.fit_transform(test[column])
            test_[column] = label.transform(test[column])
    train_.fillna(0,inplace=True)
    test_.fillna(0,inplace=True)   
    return train_, test_, encoders_train, encoders_test

In [4]:
#calculate rmse
def RMSE(true_y, preds):
    return np.sqrt(mean_squared_error(true_y, preds))
rmse_scorer = make_scorer(RMSE, greater_is_better=False) #create own scorer

In [5]:
# get file with predictions
def get_file(test,predictions, file_name):
    sample = pd.read_csv('Sample_Submission_Tm9Lura.csv')
    sample['User_ID'] = test.User_ID.values
    sample['Product_ID'] = test.Product_ID.values
    sample['Purchase'] = predictions
    sample.to_csv(file_name, index=False)

In [6]:
#read test and train sets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
#preprocess it
train_, test_, encoder_train, encoder_test = categorical_to_numerical(train, test)
#get own train and test sets
X_train = train_.drop(['Purchase'],axis=1)
y_train = train_.Purchase
train_X, test_X,train_y, test_y = train_test_split(X_train, y_train,random_state=100, test_size = 0.2)

In [9]:
#modelling
xgb_train = xgb.DMatrix(train_X, label=train_y)

In [16]:
params = {'objective': 'reg:linear', 'booster': 'gbtree','max_depth': 8, 'eta': 0.1, 'nthread': -1,
          'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 20,'max_delta_step': 0, 'gamma': 0,
          'silent': 1}

In [17]:
num_trees = 1000

In [18]:
seeds = [1122, 2244, 3366, 4488, 5500,6500,7500,9000,10000,15000,20000,25000,35000,40000,45000,50000]
preds = np.zeros((len(test_X), len(seeds)))

In [19]:
%%time
for i in range(len(seeds)):
    params['seed'] = seeds[i]
    model = xgb.train(params, xgb_train, num_trees)
    dt = xgb.DMatrix(test_X)
    preds[:, i] = model.predict(dt)

preds = np.mean(preds, axis=1) #get mean of predictions on test data

CPU times: user 2h 6min 29s, sys: 1.83 s, total: 2h 6min 31s
Wall time: 2h 6min 31s


In [20]:
#score for the own test set
RMSE(preds,test_y)

2485.9951798447546

In [27]:
preds_test = np.zeros((len(test_), len(seeds)))

In [24]:
xgb_train1 = xgb.DMatrix(X_train, label=y_train)

In [28]:
%%time
for i in range(len(seeds)):
    params['seed'] = seeds[i]
    model = xgb.train(params, xgb_train1, num_trees)
    dt = xgb.DMatrix(test_)
    preds_test[:, i] = model.predict(dt)

preds_test = np.mean(preds_test, axis=1) #get mean of predictions to submit

CPU times: user 2h 31min 40s, sys: 292 ms, total: 2h 31min 40s
Wall time: 2h 31min 41s


In [30]:
get_file(test,preds_test,'xgb.csv') #score 2783.4229883237 on public leaderboard