In [3]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error

In [13]:
train = pd.read_csv("D:/Kaggle/black-friday/BlackFriday_train.csv")
test = pd.read_csv("D:/Kaggle/black-friday/BlackFriday_test.csv")

In [15]:
# Combine test and train to do encoding of categorical variables
frames = [train, test]
input = pd.concat(frames)

print(input.shape)
input.head()

(150000, 12)


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1004016,P00123742,M,51-55,0,B,1,0,11,,,4642
1,1002322,P00102342,F,55+,13,A,2,1,8,14.0,17.0,6085
2,1001726,P00127642,M,26-35,2,B,1,1,1,2.0,15.0,15553
3,1005226,P00339542,M,36-45,0,B,1,0,1,,,11868
4,1001902,P00037142,M,26-35,19,C,1,0,1,2.0,5.0,19378


In [16]:
#Check for data types
input.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [17]:
#Replace misising values with 999

input.fillna(999, inplace = True)

In [18]:
input.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1004016,P00123742,M,51-55,0,B,1,0,11,999.0,999.0,4642
1,1002322,P00102342,F,55+,13,A,2,1,8,14.0,17.0,6085
2,1001726,P00127642,M,26-35,2,B,1,1,1,2.0,15.0,15553
3,1005226,P00339542,M,36-45,0,B,1,0,1,999.0,999.0,11868
4,1001902,P00037142,M,26-35,19,C,1,0,1,2.0,5.0,19378


In [19]:
#Create a target variable
target = input.Purchase
target = np.array(target)

In [20]:
# Drop the purchase column from input
input.drop(["Purchase"], axis=1, inplace=True)

In [21]:
print(input.columns, input.dtypes)

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3'],
      dtype='object') User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
dtype: object


In [22]:
#Convert all columns to string
input  = input.applymap(str)
input.dtypes

User_ID                       object
Product_ID                    object
Gender                        object
Age                           object
Occupation                    object
City_Category                 object
Stay_In_Current_City_Years    object
Marital_Status                object
Product_Category_1            object
Product_Category_2            object
Product_Category_3            object
dtype: object

In [23]:
# Make a copy of the data frame for reference
input_pd = input.copy

In [24]:
#Convert categorical to numeric using the labelEncoder

input = np.array(input)

for i in range(input.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(input[:,i]))
    input[:,i] = lbl.transform(input[:,i])

In [25]:
input = input.astype(int)

## Modeling

In [26]:
#Splitting the dataset into two part, Creating Meta features to feed into the model

first_stage_rows = np.random.randint(train.shape[0], size = np.int(train.shape[0]/2))

In [27]:
train_np   = input[:train.shape[0], :]
target_np  = target[:train.shape[0]]
train_fs   = train_np[first_stage_rows, :]
target_fs  = target_np[first_stage_rows]
train_ss   = train_np[-first_stage_rows, :]
target_ss  = target_np[-first_stage_rows]

In [35]:
print(train_fs.shape, target_fs.shape, train_ss.shape, target_ss.shape)

(52490, 11) (52490,) (52490, 11) (52490,)


In [39]:
# Training XGBoost models

xgtrain = xgb.DMatrix(train_fs, label = target_fs)
watchlist = [(xgtrain, 'train')]

#=========MODEL 1(6/3000)===============

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 6
params["nthread"] = 6
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 3000

model_1 = xgb.train(plst, xgtrain, num_rounds)

#=========MODEL 2(8/1420)===============

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 8
params["nthread"] = 6
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 1420

model_2 = xgb.train(plst, xgtrain, num_rounds)

#=========MODEL 3(10/1200)===============

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 10
params["nthread"] = 6
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 1200

model_3 = xgb.train(plst, xgtrain, num_rounds)

#=========MODEL 4(12/800)===============

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 12
params["nthread"] = 6
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 800

model_4 = xgb.train(plst, xgtrain, num_rounds)

In [42]:
# Training ExtraTrees models

#=================MODEL 5(8/1450)===============

model_5 = ExtraTreesRegressor(n_estimators=1450,
                              max_depth=8,
                             min_samples_split=10,
                             min_samples_leaf=10,
                             oob_score=True,
                             n_jobs=6,
                             random_state=123,
                             verbose=1,
                             bootstrap=True)
model_5.fit(train_fs, target_fs)

#=================MODEL 6(6/3000)===============

model_6 = ExtraTreesRegressor(n_estimators=3000,
                              max_depth=6,
                             min_samples_split=10,
                             min_samples_leaf=10,
                             oob_score=True,
                             n_jobs=6,
                             random_state=123,
                             verbose=1,
                             bootstrap=True)
model_6.fit(train_fs, target_fs)

#=================MODEL 7(12/800)===============

model_7 = ExtraTreesRegressor(n_estimators=800,
                              max_depth=12,
                             min_samples_split=10,
                             min_samples_leaf=10,
                             oob_score=True,
                             n_jobs=6,
                             random_state=123,
                             verbose=1,
                             bootstrap=True)
model_7.fit(train_fs, target_fs)

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    2.9s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    6.8s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:   12.3s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:   19.3s
[Parallel(n_jobs=6)]: Done 1450 out of 1450 | elapsed:   22.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.4s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    2.3s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    9.3s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:   14.0s
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:   19.8s
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:   25.7s
[Parallel(n_jobs=6)]: Done 3000 out of 3000 | elapsed:   30.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.5s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    2.6

ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=12,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=10, min_samples_split=10,
          min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=6,
          oob_score=True, random_state=123, verbose=1, warm_start=False)

In [45]:
# Training Random forest models

#==================MODEL 8(6/3000)=======================
model_8 = RandomForestRegressor(n_estimators = 3000,
                               max_depth = 6,
                               oob_score = True,
                               n_jobs = 6,
                               random_state = 123,
                               min_samples_split = 10,
                               min_samples_leaf = 10)
model_8.fit(train_fs,target_fs)

#==================MODEL 9(8/1500)=======================
model_9 = RandomForestRegressor(n_estimators = 1500,
                               max_depth = 8,
                               oob_score = True,
                               n_jobs = 6,
                               random_state = 123,
                               min_samples_split = 10,
                               min_samples_leaf = 10)
model_9.fit(train_fs,target_fs)

#==================MODEL 10(12/800)=======================
model_10 = RandomForestRegressor(n_estimators = 800,
                               max_depth = 12,
                               oob_score = True,
                               n_jobs = 6,
                               random_state = 123,
                               min_samples_split = 10,
                               min_samples_leaf = 10)
model_10.fit(train_fs,target_fs)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=6,
           oob_score=True, random_state=123, verbose=0, warm_start=False)

In [46]:
# Making predictions for each model

model_1_predict = model_1.predict(xgb.DMatrix(train_ss))
model_2_predict = model_2.predict(xgb.DMatrix(train_ss))
model_3_predict = model_3.predict(xgb.DMatrix(train_ss))
model_4_predict = model_4.predict(xgb.DMatrix(train_ss))
model_5_predict = model_5.predict(train_ss)
model_6_predict = model_6.predict(train_ss)
model_7_predict = model_7.predict(train_ss)
model_8_predict = model_8.predict(train_ss)
model_9_predict = model_9.predict(train_ss)
model_10_predict = model_10.predict(train_ss)

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    0.4s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    0.8s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    1.4s
[Parallel(n_jobs=6)]: Done 1450 out of 1450 | elapsed:    1.6s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    0.3s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    0.7s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    1.0s
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:    1.5s
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:    2.0s
[Parallel(n_jobs=6)]: Done 3000 out of 3000 | elapsed:    2.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.4

In [48]:
train_ss_w_meta = np.concatenate((train_ss, np.stack((model_1_predict,
                                                     model_2_predict,
                                                     model_3_predict,
                                                     model_4_predict,
                                                     model_5_predict,
                                                     model_6_predict,
                                                     model_7_predict,
                                                     model_8_predict,
                                                     model_9_predict,
                                                     model_10_predict)).T), axis=1)
pd.DataFrame(train_ss_w_meta)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,2301.0,3123.0,1.0,2.0,15.0,0.0,0.0,1.0,0.0,9.0,...,9211.833984,9321.695312,10905.108398,8877.557617,12823.530336,13013.326976,12449.824778,12776.026992,12981.751519,12591.159139
1,4278.0,2240.0,1.0,2.0,8.0,1.0,2.0,1.0,0.0,12.0,...,15732.916016,15257.666992,15837.817383,15485.412109,13059.628051,13226.917680,12708.897428,12797.613250,12456.015237,12373.051098
2,1634.0,1567.0,1.0,3.0,4.0,1.0,1.0,0.0,16.0,17.0,...,6837.556152,6843.789551,7126.532227,7208.062988,7468.688145,7567.623321,7395.425708,7489.406573,7197.167355,7410.280266
3,3938.0,1675.0,1.0,3.0,14.0,1.0,1.0,0.0,16.0,17.0,...,7208.134766,7569.685547,6757.405762,6616.833008,7424.367674,7552.905887,7237.383349,7489.406573,7190.319143,7259.513314
4,756.0,1010.0,1.0,2.0,15.0,0.0,2.0,0.0,10.0,7.0,...,14029.142578,14896.366211,14205.594727,14883.035156,12008.457093,10908.879807,13346.242529,11417.464446,13328.363074,13756.130007
5,5297.0,1973.0,0.0,1.0,9.0,2.0,4.0,0.0,13.0,4.0,...,7529.747070,7682.332031,7087.564941,7069.992676,6451.968037,6608.003623,6450.035297,6383.546446,6300.488622,6567.448905
6,5481.0,3401.0,0.0,3.0,1.0,0.0,3.0,1.0,13.0,2.0,...,6915.801758,6904.805664,6971.987305,7105.382812,6354.568776,6643.793505,6382.974110,6282.702973,6025.855346,5493.221310
7,675.0,1399.0,1.0,1.0,15.0,0.0,1.0,0.0,13.0,13.0,...,9499.880859,9030.796875,8985.521484,9461.456055,6594.870926,7001.121933,6678.097076,6282.702973,6137.115718,6188.666170
8,4127.0,1871.0,1.0,4.0,3.0,1.0,0.0,0.0,13.0,15.0,...,5616.469727,5422.739746,6192.755371,4981.349609,6375.075065,6589.330919,6347.835266,6283.064291,6141.467449,6406.304457
9,402.0,3308.0,1.0,3.0,7.0,0.0,1.0,1.0,13.0,17.0,...,9005.103516,9756.942383,10733.188477,9754.517578,6195.089346,6462.196344,6076.933506,6282.702973,6138.604695,6023.047592


In [50]:
# Second stage model with meta features
kfolds = KFold(train_ss_w_meta.shape[0], n_folds=5)

In [51]:
params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 8
params["nthread"] = 6
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 1400

In [53]:
for train_index, validation_index in kfolds:
    
    train_X, validation_X = train_ss_w_meta[train_index, :], train_ss_w_meta[validation_index, :]
    train_y, validation_y = target_ss[train_index], target_ss[validation_index]
    
    xgtrain = xgb.DMatrix(train_X, label = train_y)
    watchlist = [(xgtrain, 'train')]
    model_cv_xgboost = xgb.train(plst, xgtrain, num_rounds)
    model_cv_predict = model_cv_xgboost.predict(xgb.DMatrix(validation_X))
    print(np.sqrt(mean_squared_error(validation_y, model_cv_predict)))

1976.3454516207073
1915.3466697829156
1950.1230766336696
1942.3694469653037
1896.778532696168


In [54]:
# Training second model on all the second stage data

xgtrain = xgb.DMatrix(train_ss_w_meta, label=target_ss)
watchlist = [(xgtrain, 'train')]
model_ss_xgboost = xgb.train(plst, xgtrain, num_rounds)

## Creating final prediction on test dataset

In [55]:
model_1_predict = model_1.predict(xgb.DMatrix(input[train.shape[0]:,:]))
model_2_predict = model_2.predict(xgb.DMatrix(input[train.shape[0]:,:]))
model_3_predict = model_3.predict(xgb.DMatrix(input[train.shape[0]:,:]))
model_4_predict = model_4.predict(xgb.DMatrix(input[train.shape[0]:,:]))

model_5_predict = model_5.predict(input[train.shape[0]:, :])
model_6_predict = model_6.predict(input[train.shape[0]:, :])
model_7_predict = model_7.predict(input[train.shape[0]:, :])
model_8_predict = model_8.predict(input[train.shape[0]:, :])
model_9_predict = model_9.predict(input[train.shape[0]:, :])
model_10_predict = model_10.predict(input[train.shape[0]:, :])

test_ss_w_meta = np.concatenate((input[train.shape[0]:, :], 
                                 np.vstack((model_1_predict, 
                                            model_2_predict, 
                                            model_3_predict, 
                                            model_4_predict, 
                                            model_5_predict,
                                            model_6_predict, 
                                            model_7_predict, 
                                            model_8_predict, 
                                            model_9_predict, 
                                            model_10_predict)).T), axis=1)

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    0.3s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 1450 out of 1450 | elapsed:    0.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    0.4s
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:    0.9s
[Parallel(n_jobs=6)]: Done 3000 out of 3000 | elapsed:    1.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.1

In [56]:
# Training second model for predictions on test meta data
model_ss_predict = model_ss_xgboost.predict(xgb.DMatrix(test_ss_w_meta))

In [57]:
np.max(model_ss_predict), np.min(model_ss_predict)

(24863.785, -80.53052)

## Cross-validation for XGBoost

In [58]:
kfolds = KFold(train_np.shape[0], n_folds=5)

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 8
params["nthread"] = 6
#params["gamma"] = 1
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 1400

for train_index, validation_index in kfolds:
    
    train_X, validation_X = train_np[train_index, :], train_np[validation_index, :]
    train_y, validation_y = target_np[train_index], target_np[validation_index]
    
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    watchlist = [(xgtrain, 'train')]
    model_cv_xgboost = xgb.train(plst, xgtrain, num_rounds)
    model_cv_predict = model_cv_xgboost.predict(xgb.DMatrix(validation_X))
    print(np.sqrt(mean_squared_error(validation_y, model_cv_predict)))

2680.5890613622487
2672.2119993600118
2696.1701088553245
2665.230881753485
2677.1519593946678


## Checking with tf-idf (Ridge and Lasso)

In [59]:
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(train_np)
train_tfidf = tfidf.toarray()

In [61]:
# For Ridge

kfolds = KFold(train_np.shape[0], n_folds=5)

for train_index, validation_index in kfolds:
    
    train_X, validation_X = train_tfidf[train_index, :], train_tfidf[validation_index, :]
    train_y, validation_y = target_np[train_index], target_np[validation_index]
    model_ridge = linear_model.Ridge(alpha=0.01)
    model_ridge.fit(train_X, train_y)
    predict_ridge = model_ridge.predict(validation_X)
    print(np.sqrt(mean_squared_error(validation_y, predict_ridge)))

4833.71148192883
4806.073200289471
4788.27855535358
4802.316665189875
4766.456613205812


In [62]:
# For Lasso

kfolds = KFold(train_np.shape[0], n_folds=5)

for train_index, validation_index in kfolds:
    
    train_X, validation_X = train_tfidf[train_index, :], train_tfidf[validation_index, :]
    train_y, validation_y = target_np[train_index], target_np[validation_index]
    model_lasso = linear_model.Lasso(alpha=0.01, max_iter=10000)
    model_lasso.fit(train_X, train_y)
    predict_lasso = model_lasso.predict(validation_X)
    predict_lasso = predict_lasso
    print(np.sqrt(mean_squared_error(validation_y, predict_lasso)))

4833.463218011147
4805.286890183258
4785.492524229396
4799.246539848489
4768.959524827607
