In [35]:
import pandas as pd
import re
import gc
import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.externals import joblib
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [2]:
df_bids = pd.read_csv('../PROJECT/bids.csv')
df_bids.head(n=5)

Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country,ip,url
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us,69.166.231.58,vasstdc27m7nks3
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in,50.201.125.84,jmqlhflrzwuay9c
2,2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,wa00e,home goods,phone2,9759243157894736,py,112.54.208.157,vasstdc27m7nks3
3,3,3939ac3ef7d472a59a9c5f893dd3e39fh9ofi,jefix,jewelry,phone4,9759243157894736,in,18.99.175.133,vasstdc27m7nks3
4,4,8393c48eaf4b8fa96886edc7cf27b372dsibi,jefix,jewelry,phone5,9759243157894736,in,145.138.5.37,vasstdc27m7nks3


In [3]:
df_bids = df_bids.replace({' ': ''}, regex = True) #remove spaces
df_bids.head(n=5)

Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country,ip,url
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us,69.166.231.58,vasstdc27m7nks3
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in,50.201.125.84,jmqlhflrzwuay9c
2,2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,wa00e,homegoods,phone2,9759243157894736,py,112.54.208.157,vasstdc27m7nks3
3,3,3939ac3ef7d472a59a9c5f893dd3e39fh9ofi,jefix,jewelry,phone4,9759243157894736,in,18.99.175.133,vasstdc27m7nks3
4,4,8393c48eaf4b8fa96886edc7cf27b372dsibi,jefix,jewelry,phone5,9759243157894736,in,145.138.5.37,vasstdc27m7nks3


In [4]:
df_bids_sorted = df_bids.sort_values(['bidder_id', 'time'], ascending = [True, True])

In [5]:
#dataframe for aggregated bid data
bids = pd.DataFrame(data = df_bids_sorted['bidder_id'].unique(), columns = ['bidder_id'],
                    index = df_bids_sorted['bidder_id'].unique())

In [6]:
#auction counts                  
counts = df_bids_sorted.groupby('bidder_id')['bidder_id'].agg('count')
bids['auction_count_num'] = counts

In [7]:
timediff = df_bids_sorted.groupby('bidder_id')['time'].diff()
timediff_str = timediff.astype(str).fillna('')
df_bids_sorted['timediff_num'] = timediff
df_bids_sorted['timediff'] = timediff_str

In [8]:
#turn feature sequences into text
text_cols = ['auction', 'merchandise', 'device', 'timediff', 'country', 'ip', 'url']
for var in text_cols:
    df_bids_sorted[var] = var + "_" + df_bids_sorted[var].fillna("")
    text_str = var + '_text'
    count_str = var + '_nunique_num'
    bids[text_str] = df_bids_sorted.groupby('bidder_id')[var].apply(lambda x: "%s" % ' '.join(x))
    bids[count_str]  = df_bids_sorted.groupby('bidder_id')[var].nunique()

In [9]:
max_time = df_bids_sorted.groupby('bidder_id')['time'].max()
bids['maxtime_num'] = max_time
min_time = df_bids_sorted.groupby('bidder_id')['time'].min()
bids['mintime_num'] = min_time
max_diff = df_bids_sorted.groupby('bidder_id')['timediff_num'].max()
max_diff = max_diff.fillna(max_diff.mean())
bids['maxdiff_num'] = max_diff
min_diff = df_bids_sorted.groupby('bidder_id')['timediff_num'].max()
min_diff = min_diff.fillna(min_diff.mean())
bids['mindiff_num'] = min_diff
range_diff = max_diff - min_diff
bids['rangediff_num'] = range_diff
mean_diff = df_bids_sorted.groupby('bidder_id')['timediff_num'].mean()
mean_diff = mean_diff.fillna(mean_diff.mean())
bids['meandiff_num'] = mean_diff
median_diff = df_bids_sorted.groupby('bidder_id')['timediff_num'].median()
median_diff = median_diff.fillna(median_diff.mean())
bids['mediandiff_num'] = median_diff
for q in [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]:
    q_string = 'diff_quantile_num_' + str(q).replace('.', '_')
    q_temp = df_bids_sorted.groupby('bidder_id')['timediff_num'].quantile(q)
    q_temp = q_temp.fillna(q_temp.mean())
    bids[q_string] = q_temp

In [10]:
df_train = pd.read_csv('../PROJECT/train.csv')
df_test = pd.read_csv('../PROJECT/test.csv')
df_combo = df_train.append(df_test)
df_combo['address_text'] = 'address_' + df_combo['address'].fillna('')
df_combo['account_text'] = 'account_' + df_combo['payment_account'].fillna('')
df_combo = df_combo.merge(bids, how = 'left', left_on = ['bidder_id'], right_on = ['bidder_id'])

In [11]:
del df_train
del df_test
del df_bids
del df_bids_sorted
del bids
gc.collect();

In [12]:
num_cols = filter(re.compile('num').search, df_combo.columns)
text_cols = filter(re.compile('text').search, df_combo.columns)
for col in num_cols:
    print(col)
    df_combo[col] = df_combo[col].fillna(df_combo[col].mean())
for col in text_cols:
    print(col)
    df_combo[col] = df_combo[col].fillna('')

auction_count_num
auction_nunique_num
merchandise_nunique_num
device_nunique_num
timediff_nunique_num
country_nunique_num
ip_nunique_num
url_nunique_num
maxtime_num
mintime_num
maxdiff_num
mindiff_num
rangediff_num
meandiff_num
mediandiff_num
diff_quantile_num_0_1
diff_quantile_num_0_2
diff_quantile_num_0_3
diff_quantile_num_0_4
diff_quantile_num_0_6
diff_quantile_num_0_7
diff_quantile_num_0_8
diff_quantile_num_0_9
address_text
account_text
auction_text
merchandise_text
device_text
timediff_text
country_text
ip_text
url_text


In [13]:
sample = pd.read_csv('../PROJECT/sampleSubmission.csv')
test_dat = df_combo[df_combo.bidder_id.isin(sample.bidder_id)]
#test
print(sample.bidder_id.values==test_dat['bidder_id'].values)

[ True  True  True ...,  True  True  True]


In [30]:
train_dat

Unnamed: 0,address,bidder_id,outcome,payment_account,address_text,account_text,auction_count_num,auction_text,auction_nunique_num,merchandise_text,...,meandiff_num,mediandiff_num,diff_quantile_num_0_1,diff_quantile_num_0_2,diff_quantile_num_0_3,diff_quantile_num_0_4,diff_quantile_num_0_6,diff_quantile_num_0_7,diff_quantile_num_0_8,diff_quantile_num_0_9
0,a3d2de7675556553a5f08e4c88d2c228vt0u4,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,a3d2de7675556553a5f08e4c88d2c228754av,address_a3d2de7675556553a5f08e4c88d2c228vt0u4,account_a3d2de7675556553a5f08e4c88d2c228754av,24.0,auction_yitr4 auction_btpyy auction_kj2ko auct...,18.0,merchandise_homegoods merchandise_homegoods me...,...,5.711121e+11,3.458421e+11,8.351579e+10,1.008947e+11,1.520105e+11,2.575368e+11,3.701789e+11,5.227789e+11,8.321368e+11,1.299937e+12
1,ae87054e5a97a8f840a3991d12611fdcrfbq3,624f258b49e77713fc34034560f93fb3hu3jo,0.0,a3d2de7675556553a5f08e4c88d2c228v1sga,address_ae87054e5a97a8f840a3991d12611fdcrfbq3,account_a3d2de7675556553a5f08e4c88d2c228v1sga,3.0,auction_jefix auction_jefix auction_jefix,1.0,merchandise_officeequipment merchandise_office...,...,3.233579e+12,3.233579e+12,2.238168e+12,2.487021e+12,2.735874e+12,2.984726e+12,3.482432e+12,3.731284e+12,3.980137e+12,4.228989e+12
2,92520288b50f03907041887884ba49c0cl0pd,1c5f4fc669099bfbfac515cd26997bd12ruaj,0.0,a3d2de7675556553a5f08e4c88d2c2280cybl,address_92520288b50f03907041887884ba49c0cl0pd,account_a3d2de7675556553a5f08e4c88d2c2280cybl,4.0,auction_udb7l auction_0s731 auction_q47xb auct...,4.0,merchandise_sportinggoods merchandise_sporting...,...,2.379000e+12,2.532053e+12,1.667084e+12,1.883326e+12,2.099568e+12,2.315811e+12,2.656463e+12,2.780874e+12,2.905284e+12,3.029695e+12
3,4cb9717c8ad7e88a9a284989dd79b98dbevyi,4bee9aba2abda51bf43d639013d6efe12iycd,0.0,51d80e233f7b6a7dfdee484a3c120f3b2ita8,address_4cb9717c8ad7e88a9a284989dd79b98dbevyi,account_51d80e233f7b6a7dfdee484a3c120f3b2ita8,1.0,auction_1kbfl,1.0,merchandise_booksandmusic,...,3.488293e+12,2.212457e+12,1.527371e+12,1.673775e+12,1.832968e+12,2.011435e+12,2.616042e+12,3.119719e+12,3.923941e+12,5.600710e+12
4,2a96c3ce94b3be921e0296097b88b56a7x1ji,4ab12bc61c82ddd9c2d65e60555808acqgos1,0.0,a3d2de7675556553a5f08e4c88d2c22857ddh,address_2a96c3ce94b3be921e0296097b88b56a7x1ji,account_a3d2de7675556553a5f08e4c88d2c22857ddh,155.0,auction_gjknq auction_gjknq auction_gjknq auct...,23.0,merchandise_officeequipment merchandise_office...,...,7.777888e+10,1.368421e+10,1.368421e+09,3.452632e+09,5.836842e+09,9.042105e+09,1.941053e+10,3.305789e+10,5.542105e+10,1.581211e+11
5,5a1d8f28bc31aa6d72bef2d8fbf48b967hra3,7eaefc97fbf6af12e930528151f86eb91bafh,0.0,a3d2de7675556553a5f08e4c88d2c228yory1,address_5a1d8f28bc31aa6d72bef2d8fbf48b967hra3,account_a3d2de7675556553a5f08e4c88d2c228yory1,1.0,auction_jefix,1.0,merchandise_mobile,...,3.488293e+12,2.212457e+12,1.527371e+12,1.673775e+12,1.832968e+12,2.011435e+12,2.616042e+12,3.119719e+12,3.923941e+12,5.600710e+12
6,9a6d81115b9b653ba326eb510e9163b47drqj,25558d24bca82beef0f9db4ba1fe2045ynnvq,0.0,81580585d4dedd473da11aabf37fe9d4e2s2n,address_9a6d81115b9b653ba326eb510e9163b47drqj,account_81580585d4dedd473da11aabf37fe9d4e2s2n,8.0,auction_udb7l auction_hu49k auction_rir9y auct...,8.0,merchandise_mobile merchandise_mobile merchand...,...,1.598902e+12,5.842105e+10,1.090526e+10,2.336842e+10,4.642105e+10,5.583158e+10,4.916211e+11,1.306063e+12,2.882989e+12,4.793705e+12
7,3a7e6a32b24aeab0688e91a41f3188e22iuec,88ae7a35e374a6fddd079ebb28c822eeohwse,0.0,a3d2de7675556553a5f08e4c88d2c2289zref,address_3a7e6a32b24aeab0688e91a41f3188e22iuec,account_a3d2de7675556553a5f08e4c88d2c2289zref,34.0,auction_t79cc auction_im3yk auction_gawb5 auct...,10.0,merchandise_homegoods merchandise_homegoods me...,...,1.696077e+11,3.300000e+10,2.536842e+09,8.252632e+09,1.774737e+10,2.227368e+10,4.756842e+10,6.692632e+10,1.074421e+11,5.606526e+11
8,31b95425d178b89fd7306762bb48bfb5n04sj,57db69e32163f3e486dc6ef7d615aa12usje6,0.0,bf1c3151cc309308077ad0ccb99779ad12apw,address_31b95425d178b89fd7306762bb48bfb5n04sj,account_bf1c3151cc309308077ad0ccb99779ad12apw,2.0,auction_cf3it auction_rf9qs,2.0,merchandise_jewelry merchandise_jewelry,...,6.821989e+13,6.821989e+13,6.821989e+13,6.821989e+13,6.821989e+13,6.821989e+13,6.821989e+13,6.821989e+13,6.821989e+13,6.821989e+13
9,5b1f6e97a1cc27cd7fa9a3fe17eccd2a6mpdv,d1be739798ba0745a1fd72ac918a9f1929hei,0.0,f49162ea9903fc00e4721d2f7972df9d6az4s,address_5b1f6e97a1cc27cd7fa9a3fe17eccd2a6mpdv,account_f49162ea9903fc00e4721d2f7972df9d6az4s,14.0,auction_7tvta auction_1ly3m auction_9ul86 auct...,10.0,merchandise_mobile merchandise_mobile merchand...,...,5.313915e+12,7.534211e+11,4.155789e+10,2.446737e+11,3.423474e+11,3.588316e+11,1.000758e+12,1.412211e+12,1.604621e+12,3.789274e+12


In [17]:
test_dat

Unnamed: 0,address,bidder_id,outcome,payment_account,address_text,account_text,auction_count_num,auction_text,auction_nunique_num,merchandise_text,...,meandiff_num,mediandiff_num,diff_quantile_num_0_1,diff_quantile_num_0_2,diff_quantile_num_0_3,diff_quantile_num_0_4,diff_quantile_num_0_6,diff_quantile_num_0_7,diff_quantile_num_0_8,diff_quantile_num_0_9
2013,5d9fa1b71f992e7c7a106ce4b07a0a754le7c,49bb5a3c944b8fc337981cc7a9ccae41u31d7,,a3d2de7675556553a5f08e4c88d2c228htx90,address_5d9fa1b71f992e7c7a106ce4b07a0a754le7c,account_a3d2de7675556553a5f08e4c88d2c228htx90,4.000000,auction_cl3cf auction_edggv auction_jqx39 auct...,3.000000,merchandise_homegoods merchandise_homegoods me...,...,2.340789e+13,5.781053e+12,4.792547e+12,5.039674e+12,5.286800e+12,5.533926e+12,1.660428e+13,2.742752e+13,3.825075e+13,4.907398e+13
2014,a3d2de7675556553a5f08e4c88d2c228klidn,a921612b85a1494456e74c09393ccb65ylp4y,,a3d2de7675556553a5f08e4c88d2c228rs17i,address_a3d2de7675556553a5f08e4c88d2c228klidn,account_a3d2de7675556553a5f08e4c88d2c228rs17i,3.000000,auction_h2nr3 auction_h2nr3 auction_zqdip,2.000000,merchandise_sportinggoods merchandise_sporting...,...,3.800103e+13,3.800103e+13,7.600289e+12,1.520047e+13,2.280066e+13,3.040084e+13,4.560121e+13,5.320139e+13,6.080158e+13,6.840176e+13
2015,a3d2de7675556553a5f08e4c88d2c228aght0,6b601e72a4d264dab9ace9d7b229b47479v6i,,925381cce086b8cc9594eee1c77edf665zjpl,address_a3d2de7675556553a5f08e4c88d2c228aght0,account_925381cce086b8cc9594eee1c77edf665zjpl,17.000000,auction_xe9nl auction_jr6l5 auction_z51fk auct...,14.000000,merchandise_mobile merchandise_mobile merchand...,...,1.819079e+10,2.315789e+09,4.210526e+08,7.894737e+08,1.078947e+09,1.315789e+09,2.736842e+09,3.894737e+09,6.210526e+09,7.868421e+09
2016,b5714de1fd69d4a0d2e39d59e53fe9e15vwat,eaf0ed0afc9689779417274b4791726cn5udi,,a3d2de7675556553a5f08e4c88d2c228nclv5,address_b5714de1fd69d4a0d2e39d59e53fe9e15vwat,account_a3d2de7675556553a5f08e4c88d2c228nclv5,148.000000,auction_ojbfm auction_xscbv auction_w91wy auct...,90.000000,merchandise_furniture merchandise_furniture me...,...,5.205553e+11,1.046842e+11,8.221053e+09,1.928421e+10,3.690526e+10,6.965263e+10,1.461895e+11,2.167053e+11,3.142947e+11,4.884421e+11
2017,c3b363a3c3b838d58c85acf0fc9964cb4pnfa,cdecd8d02ed8c6037e38042c7745f688mx5sf,,a3d2de7675556553a5f08e4c88d2c228dtdkd,address_c3b363a3c3b838d58c85acf0fc9964cb4pnfa,account_a3d2de7675556553a5f08e4c88d2c228dtdkd,23.000000,auction_faxz5 auction_37emg auction_8i4hi auct...,20.000000,merchandise_jewelry merchandise_jewelry mercha...,...,2.988541e+11,8.131579e+09,7.105263e+09,7.157895e+09,7.489474e+09,7.663158e+09,1.000000e+10,2.361053e+10,3.814737e+10,1.229063e+12
2018,913a23ad701018bedd9d558f236f878267nrk,d4aed439bdc854a56fc6cc3bdb986775w7hxw,,a3d2de7675556553a5f08e4c88d2c228v4x1f,address_913a23ad701018bedd9d558f236f878267nrk,account_a3d2de7675556553a5f08e4c88d2c228v4x1f,232.000000,auction_tep1m auction_mccke auction_mccke auct...,96.000000,merchandise_jewelry merchandise_jewelry mercha...,...,5.800205e+10,2.868421e+10,5.526316e+09,1.010526e+10,1.405263e+10,1.778947e+10,3.831579e+10,5.463158e+10,7.342105e+10,1.264737e+11
2019,a3d2de7675556553a5f08e4c88d2c228i8yzg,ed591299b162a19ff77f0479495831b31hl1q,,a3d2de7675556553a5f08e4c88d2c228dpzl7,address_a3d2de7675556553a5f08e4c88d2c228i8yzg,account_a3d2de7675556553a5f08e4c88d2c228dpzl7,4.000000,auction_csbuq auction_92evm auction_7nedy auct...,4.000000,merchandise_homegoods merchandise_homegoods me...,...,2.237649e+12,1.372000e+12,7.012211e+11,8.689158e+11,1.036611e+12,1.204305e+12,2.059084e+12,2.746168e+12,3.433253e+12,4.120337e+12
2020,7161605c31a8bbf7fc3fb0d77a26c163vdgvg,eebdee08b0f67283126ef60307f49680sb9va,,4eb7c53cbfbd4befdedb5854526d1907ct8xt,address_7161605c31a8bbf7fc3fb0d77a26c163vdgvg,account_4eb7c53cbfbd4befdedb5854526d1907ct8xt,383.000000,auction_165y9 auction_0cwrz auction_fnetm auct...,158.000000,merchandise_mobile merchandise_mobile merchand...,...,3.553293e+10,1.776316e+10,2.315789e+09,5.389474e+09,8.136842e+09,1.249474e+10,2.608421e+10,3.763684e+10,5.736842e+10,9.323684e+10
2021,1a2569f191c87f8bdae1bd23506f0ff5tnfvs,6887f0abc4eb4c79eb0e23c48ceea186vjfih,,a3d2de7675556553a5f08e4c88d2c228vufvy,address_1a2569f191c87f8bdae1bd23506f0ff5tnfvs,account_a3d2de7675556553a5f08e4c88d2c228vufvy,1.000000,auction_jqx39,1.000000,merchandise_mobile,...,3.488293e+12,2.212457e+12,1.527371e+12,1.673775e+12,1.832968e+12,2.011435e+12,2.616042e+12,3.119719e+12,3.923941e+12,5.600710e+12
2022,29ba44254742e88a665e8fced9fc82b7947da,37eb6e2979e66d4ce29a74ac1c8bc6a5lqs6t,,a3d2de7675556553a5f08e4c88d2c22838x5m,address_29ba44254742e88a665e8fced9fc82b7947da,account_a3d2de7675556553a5f08e4c88d2c22838x5m,127.000000,auction_h4jls auction_jefix auction_jefix auct...,17.000000,merchandise_jewelry merchandise_jewelry mercha...,...,1.056374e+11,6.189474e+10,7.605263e+09,1.573684e+10,3.073684e+10,4.663158e+10,9.647368e+10,1.266842e+11,1.547895e+11,2.531316e+11


In [20]:
train_dat = df_combo[~pd.isnull(df_combo.outcome)]
y = train_dat.outcome.values
# xtrain = train_dat[num_cols].values
xtrainfinal = train_dat.drop('outcome', axis=1)
xtrain = xtrainfinal.select_dtypes(include=['float64']).values
# xtest = test_dat[num_cols].values
xtestfinal = test_dat.drop('outcome', axis=1)
xtest = xtestfinal.select_dtypes(include=['float64']).values
# col_names = num_cols

In [21]:
xtrain

array([[  2.40000000e+01,   1.80000000e+01,   1.00000000e+00, ...,
          5.22778947e+11,   8.32136842e+11,   1.29993684e+12],
       [  3.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          3.73128421e+12,   3.98013684e+12,   4.22898947e+12],
       [  4.00000000e+00,   4.00000000e+00,   1.00000000e+00, ...,
          2.78087368e+12,   2.90528421e+12,   3.02969474e+12],
       ..., 
       [  2.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          9.79847368e+12,   9.79847368e+12,   9.79847368e+12],
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          3.11971913e+12,   3.92394139e+12,   5.60071049e+12],
       [  2.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          1.78947368e+09,   1.78947368e+09,   1.78947368e+09]])

In [22]:
sc = MinMaxScaler().fit(np.vstack((xtrain, xtest)))
xtrain = sc.transform(xtrain)
xtest = sc.transform(xtest)

In [23]:
xtrain = sparse.csr_matrix(xtrain)
xtest = sparse.csr_matrix(xtest)

In [24]:
def tokens(x):
    return x.split(' ')

In [25]:
text_params = {}
text_params['address_text'] = {'include':False}
text_params['account_text'] = {'include':False}
text_params['auction_text'] = {'include':True, 'mindf':5, 'ngram':(1,3), 'token':'tokens'}
text_params['merchandise_text'] = {'include':True, 'mindf':5, 'ngram':(1,3), 'token':'tokens'}
text_params['device_text'] = {'include':True, 'mindf':5, 'ngram':(1,3), 'token':'tokens'}
text_params['timediff_text'] = {'include':True, 'mindf':1, 'ngram':(1,1), 'token':'tokens'}
text_params['country_text'] = {'include':True, 'mindf':5, 'ngram':(1,3), 'token':'tokens'}
text_params['ip_text'] = {'include':True, 'mindf':1, 'ngram':(1,1), 'token':'nottokens'}
text_params['url_text'] = {'include':True, 'mindf':5, 'ngram':(1,3), 'token':'tokens'}

In [26]:
for col in text_cols:
    if not text_params[col]['include']:
        continue
    else:
        if text_params[col]['token'] == 'tokens':
            vect = TfidfVectorizer(tokenizer = tokens, min_df = text_params[col]['mindf'],
                                   ngram_range = text_params[col]['ngram'])
        else:
            vect = TfidfVectorizer(min_df = text_params[col]['mindf'],
                                   ngram_range = text_params[col]['ngram'])
        documents = df_combo[col].values
        vect.fit(documents)
        col_names = col_names + vect.get_feature_names()
        xtr_tmp = vect.transform(train_dat[col].values)
        xte_tmp = vect.transform(test_dat[col].values)
        xtrain = sparse.hstack((xtrain, xtr_tmp))
        xtest = sparse.hstack((xtest, xte_tmp))

In [28]:
joblib.dump(xtrain, '../PROJECT/xtrain.pkl', compress = 3);
joblib.dump(y, '../PROJECT/y.pkl', compress = 3);

In [29]:
#feature selection
feats_25 = SelectPercentile(chi2, 25).fit(xtrain, y)
xtrain = feats_25.transform(xtrain)
xtest = feats_25.transform(xtest)

clf = xgb.XGBClassifier(objective = 'binary:logistic',
                            learning_rate = 0.05,
                            max_depth = 5,
                            nthread = 8,
                            seed = 42,
                            subsample = 0.4,
                            colsample_bytree = 0.7,
                            min_child_weight = 1,
                            n_estimators = 100,
                            gamma = 0.15, silent = True)

#bag of 15 models
rounds = 15
preds_mat = np.zeros((len(sample.index), rounds))
for i in range(rounds):
    clf.set_params(seed = i + 1)
    clf.fit(xtrain, y)
    preds_tmp = clf.predict_proba(xtest)[:, 1]
    preds_mat[:, i] = preds_tmp
bagged_preds = preds_mat.mean(axis = 1)
sample.prediction = bagged_preds
sample.to_csv('submissions/facebook_submission.csv', index = False)

NameError: name 'xgb' is not defined

In [31]:
col_names = np.array(col_names)[feats_25.get_support()]

NameError: name 'col_names' is not defined

In [32]:
xgb_params = {'objective': 'binary:logistic',
              'eta': 0.05,
              'max_depth': 5,
              'seed': 42,
              'subsample': 0.4,
              'colsample_bytree': 0.7,
              'min_child_weight': 1,
              'gamma': 0.15}
num_round = 100
dtrain = xgb.DMatrix(xtrain, label = y)
booster = xgb.train(xgb_params, dtrain, num_round)

NameError: name 'xgb' is not defined

In [33]:
importance = booster.get_fscore()
df_imp = pd.DataFrame(columns = ('feature', 'importance'))
sum_imp = 0
for imp in importance:
    row = col_names[int(imp[1:])], importance[imp]
    df_imp.loc[len(df_imp.index)] = row
    sum_imp += importance[imp]
df_imp['relative_importance'] = df_imp.importance/float(sum_imp)
df_imp.drop(['importance'], 1, inplace = True)
df_imp.sort(columns=['relative_importance'], ascending = False, inplace = True)
df_imp.index = df_imp.feature

NameError: name 'booster' is not defined

In [None]:
pd.options.mode.chained_assignment=None
df_plot = df_imp[:20]
names = {'country':'Country', 'num': 'Numeric', 'timediff': 'Time difference',
         'device':'Device', 'url':'URL', 'ip':'IP address'}
for pattern in sorted(names.keys()):
    df_plot[names[pattern]] = 0
    if pattern == 'ip':
        rows = df_plot.feature.str.isnumeric()
    elif pattern == 'timediff':
        rows = df_plot.feature.str.contains(pattern) & ~df_plot.feature.str.contains('num')
    else:
        rows = df_plot.feature.str.contains(pattern)
    df_plot.loc[rows, names[pattern]] = df_plot.loc[rows, 'relative_importance']
df_plot = df_plot.drop(['feature', 'relative_importance'], 1)

In [None]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
matplotlib.style.use('ggplot')
matplotlib.rcParams.update({'font.size': 13})
matplotlib.rcParams['figure.figsize'] = 9, 6
ax = df_plot.plot(kind='barh', stacked = True)
ax.invert_yaxis()
plt.xlabel('Relative importance')
plt.ylabel('Feature');

In [None]:
import pandas as pd
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import os

def score(params):
    global df_scores
    params['n_estimators'] = int(params['n_estimators'])
    print "Training with params : "
    print params
    sel_pct = int(params['sel_pct'])
    del params['sel_pct']
    clf = xgb.XGBClassifier()
    clf.set_params(**params)
    pipeline = Pipeline([('selector', SelectPercentile(chi2, sel_pct)),
                         ('clf', clf)])
    scores = cross_val_score(pipeline, xtrain, y, scoring = 'roc_auc',cv = kf)
    score = scores.mean()
    print "\tScore {0}\n\n".format(score)
    row = [score, params['n_estimators'], params['learning_rate'],
           params['max_depth'], params['min_child_weight'],
           params['subsample'], params['gamma'],
           params['colsample_bytree'], sel_pct]
    df_scores.loc[len(df_scores.index)] = row
    df_scores.sort(columns = 'score', ascending = False, inplace = True)
    df_scores.to_csv(fname, index = False)
    return {'loss': score, 'status': STATUS_OK}

def optimize(trials):
    space = {
             'n_estimators' : hp.quniform('n_estimators', 5, 1000, 1),
             'learning_rate' : hp.quniform('learning_rate', 0.001, 0.5, 0.001),
             'max_depth' : hp.quniform('max_depth', 1, 13, 1),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'subsample' : hp.quniform('subsample', 0.4, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.4, 1, 0.05),
             'sel_pct' : hp.quniform('sel_pct', 1, 100, 1),
             'objective' : 'binary:logistic',
             'silent' : 1
             }
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=500)
    print best

xtrain = joblib.load('data/xtrain.pkl')
y = joblib.load('data/y.pkl')
nf = 4
kf = StratifiedKFold(y, n_folds = nf, random_state = 42, shuffle = True)
fname = 'hyperopt_xgb.csv'
if os.path.isfile(fname):
    df_scores = pd.read_csv(fname)
else:
    df_scores = pd.DataFrame(columns = ('score', 'n_estimators','learning_rate',
                                     'max_depth', 'min_child_weight',
                                     'subsample', 'gamma',
                                     'colsample_bytree', 'sel_pct'))
trials = Trials()
optimize(trials)