In [1]:
import numpy as np
from scipy import sparse
import pandas as pd
import xgboost as xgb
import re
import string
import time

from sklearn import preprocessing, pipeline, metrics, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import feature_selection
from itertools import product
%matplotlib inline 

In [2]:
train_data = pd.read_json('/Users/Stephanie_Zhang/Desktop/桌面/3.DS模板/代码模板/kaggle_twosigma/train.json')
test_data = pd.read_json('/Users/Stephanie_Zhang/Desktop/桌面/3.DS模板/代码模板/kaggle_twosigma/test.json')

In [3]:
train_size = train_data.shape[0]

### Create target variables

In [4]:
train_data['target'] = train_data['interest_level'].apply(lambda x: 0 if x=='low' else 1 if x=='medium' else 2)
train_data['low'] = train_data['interest_level'].apply(lambda x: 1 if x=='low' else 0)
train_data['medium'] = train_data['interest_level'].apply(lambda x: 1 if x=='medium' else 0)
train_data['high'] = train_data['interest_level'].apply(lambda x: 1 if x=='high' else 0)

## Merge training and testing data

In [5]:
full_data=pd.concat([train_data
                       ,test_data])

### Group Variables

In [6]:
num_vars = ['bathrooms','bedrooms','latitude','longitude','price']
cat_vars = ['building_id','manager_id','display_address','street_address']
text_vars = ['description','features']
date_var = 'created'
image_var = 'photos'
id_var = 'listing_id'

In [7]:
full_data.head(5)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,high,interest_level,latitude,listing_id,longitude,low,manager_id,medium,photos,price,street_address,target
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],0.0,medium,40.7145,7211212,-73.9425,0.0,5ba989232d0489da1b5f2c45f6688adc,1.0,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,1.0
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",0.0,low,40.7947,7150865,-73.9667,1.0,7533621a882f71e25173b27e3139d83d,0.0,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,0.0
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",1.0,high,40.7388,6887163,-74.0018,0.0,d9039c43983f6e564b1482b273bd7b01,0.0,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,2.0
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",0.0,low,40.7539,6888711,-73.9677,1.0,1067e078446a7897d2da493d2f741316,0.0,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,0.0
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],0.0,low,40.8241,6934781,-73.9493,1.0,98e13ad4b495b9613cef886d79a6291f,0.0,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,0.0


### Date variable

In [8]:
full_data['created_datetime'] = pd.to_datetime(full_data['created'], format="%Y-%m-%d %H:%M:%S")

In [9]:
# full_data['created_year']=full_data['created_datetime'].apply(lambda x:x.year) ## low variant
full_data['created_month']=full_data['created_datetime'].apply(lambda x:x.month)
full_data['created_day']=full_data['created_datetime'].apply(lambda x:x.day)
full_data['created_dayofweek']=full_data['created_datetime'].apply(lambda x:x.dayofweek)
full_data['created_dayofyear']=full_data['created_datetime'].apply(lambda x:x.dayofyear)
full_data['created_weekofyear']=full_data['created_datetime'].apply(lambda x:x.weekofyear)
full_data['created_hour']=full_data['created_datetime'].apply(lambda x:x.hour)
full_data['created_epoch']=full_data['created_datetime'].apply(lambda x:x.value//10**9)

date_num_vars = ['created_month','created_dayofweek','created_dayofyear'
                 ,'created_weekofyear','created_hour','created_epoch']

## Additional Numeric Variables

In [10]:
# full_data['price']=full_data['price'].apply(np.log)

In [11]:
full_data['rooms'] = full_data['bedrooms'] + full_data['bathrooms'] 
full_data['num_of_photos'] = full_data['photos'].apply(lambda x:len(x))
full_data['num_of_features'] = full_data['features'].apply(lambda x:len(x))
full_data['len_of_desc'] = full_data['description'].apply(lambda x:len(x))
full_data['words_of_desc'] = full_data['description'].apply(lambda x:len(re.sub('['+string.punctuation+']', '', x).split()))


full_data['nums_of_desc'] = full_data['description']\
        .apply(lambda x:re.sub('['+string.punctuation+']', '', x).split())\
        .apply(lambda x: len([s for s in x if s.isdigit()]))
        
full_data['has_phone'] = full_data['description'].apply(lambda x:re.sub('['+string.punctuation+']', '', x).split())\
        .apply(lambda x: [s for s in x if s.isdigit()])\
        .apply(lambda x: len([s for s in x if len(str(s))==10]))\
        .apply(lambda x: 1 if x>0 else 0)
full_data['has_email'] = full_data['description'].apply(lambda x: 1 if '@renthop.com' in x else 0)

additional_num_vars = ['rooms','num_of_photos','num_of_features','len_of_desc',
                    'words_of_desc','has_phone','has_email']

### Numeric interactions

In [12]:
full_data['avg_word_len'] = full_data[['len_of_desc','words_of_desc']]\
                                    .apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis=1)
    
full_data['price_per_room'] = full_data[['price','rooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis=1)
full_data['price_per_bedroom'] = full_data[['price','bedrooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis=1)
full_data['price_per_bathroom'] = full_data[['price','bathrooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis=1)
full_data['price_per_photo'] = full_data[['price','num_of_photos']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis=1)


full_data['photos_per_room'] = full_data[['num_of_photos','rooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis=1)

interactive_num_vars = ['avg_word_len','price_per_room','price_per_bedroom','price_per_bathroom','price_per_photo',
                        'photos_per_room']

### Categorical Features

#### 1. Label Encoding

In [13]:
LBL = preprocessing.LabelEncoder()

LE_vars=[]
LE_map=dict()
for cat_var in cat_vars:
    print ("Label Encoding %s" % (cat_var))
    LE_var=cat_var+'_le'
    full_data[LE_var]=LBL.fit_transform(full_data[cat_var])
    LE_vars.append(LE_var)
    LE_map[cat_var]=LBL.classes_
    
print ("Label-encoded feaures: %s" % (LE_vars))

Label Encoding building_id
Label Encoding manager_id
Label Encoding display_address
Label Encoding street_address
Label-encoded feaures: ['building_id_le', 'manager_id_le', 'display_address_le', 'street_address_le']


#### 2. One Hot Encoding

In [14]:
OHE = preprocessing.OneHotEncoder(sparse=True)
start=time.time()
OHE.fit(full_data[LE_vars])
OHE_sparse=OHE.transform(full_data[LE_vars])
                                   
print ('One-hot-encoding finished in %f seconds' % (time.time()-start))


OHE_vars = [var[:-3] + '_' + str(level).replace(' ','_')\
                for var in cat_vars for level in LE_map[var] ]

print ("OHE_sparse size :" ,OHE_sparse.shape)
print ("One-hot encoded catgorical feature samples : %s" % (OHE_vars[:100]))

One-hot-encoding finished in 0.218343 seconds
OHE_sparse size : (124011, 57868)
One-hot encoded catgorical feature samples : ['building_0', 'building_00005cb939f9986300d987652c933e15', 'building_00024d77a43f0606f926e2312513845c', 'building_000ae4b7db298401cdae2b0ba1ea8146', 'building_0012f1955391bca600ec301035b97b65', 'building_0021440c04241281a436ec21accc40b1', 'building_002d1eba40aa0a6610e04ff20543585f', 'building_003d8740e21484dcc2280639b25539a4', 'building_00480e54b53fe77d17964be3f8307a99', 'building_00553d95d22484bcc36831c9248d1dbc', 'building_0055c8662ba19e95f78df97592d2b83e', 'building_0056dbdf2881b76f2a0171eb753ec9e0', 'building_0059ae562b9e338a59eaf962cb3eedd2', 'building_005e0f8d7fb7b92be351cbf1dd985149', 'building_0067f166111490e7af7f1a878a67bb5e', 'building_0070bc94a3f80aa717bb15708e98ba54', 'building_0071cda335745940cdae1dc31abfc701', 'building_0078281cd69f4bfec17e42e5cf5eecd9', 'building_0078c2ab46afba9969637ac83621901e', 'building_007ae1cd90420f18bad7b6892a9a1411', 'buil

#### 3. Leave-one-out Encoding

Based on the paper "A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems"

http://helios.mm.di.uoa.gr/~rouvas/ssi/sigkdd/sigkdd.vol3.1/barreca.ps

** A couple of Kaggle scripts: **

R version: by Braden Murray: https://www.kaggle.com/brandenkmurray/two-sigma-connect-rental-listing-inquiries/it-is-lit/comments

Python Version 1, by Stanislav Ushakov
https://www.kaggle.com/stanislavushakov/two-sigma-connect-rental-listing-inquiries/python-version-of-it-is-lit-by-branden/comments

Python Version 2, by Rakhlin
https://www.kaggle.com/rakhlin/two-sigma-connect-rental-listing-inquiries/another-python-version-of-it-is-lit-by-branden/code


In [17]:
def designate_single_observations(df1, df2, column):
    ps = df1[column].append(df2[column])
    grouped = ps.groupby(ps).size().to_frame().rename(columns={0: "size"})
    df1.loc[df1.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    df2.loc[df2.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    return df1, df2

In [21]:
ps = train_data['building_id'].append(test_data['building_id'])

In [30]:
ps.head(20)

10        53a5b119ba8f7b61d4e010512e0dfc85
10000     c5c8a357cba207596b04d1afd1e4f130
100004    c3ba40552e2120b0acfc3cb5730bb2aa
100007    28d9ad350afeaab8027513a3e52ac8d5
100013                                   0
100014    38a913e46c94a7f46ddf19b756a9640c
100016    3ba49a93260ca5df92fde024cb4ca61f
100020    0372927bcb6a0949613ef5bf893bbac7
100026    a7efbeb58190aa267b4a9121cd0c88c0
100027                                   0
100030                                   0
10004                                    0
100044    67c9b420da4a365bc26a6cd0ef4a5320
100048                                   0
10005                                    0
100051    bfb9405149bfff42a92980b594c28234
100052    642cc2c920512ffe2a74c28122f8b47f
100053                                   0
100055    cc4c6ae9225df6d2395c4e16c235f7ab
100058    dc3cae15729b48fec3394f9295671991
Name: building_id, dtype: object

In [22]:
grouped = ps.groupby(ps).size().to_frame().rename(columns={0: "size"})

In [29]:
grouped.head(20)

Unnamed: 0_level_0,building_id
building_id,Unnamed: 1_level_1
0,20664
00005cb939f9986300d987652c933e15,1
00024d77a43f0606f926e2312513845c,7
000ae4b7db298401cdae2b0ba1ea8146,2
0012f1955391bca600ec301035b97b65,1
0021440c04241281a436ec21accc40b1,20
002d1eba40aa0a6610e04ff20543585f,2
003d8740e21484dcc2280639b25539a4,1
00480e54b53fe77d17964be3f8307a99,3
00553d95d22484bcc36831c9248d1dbc,11


In [15]:
##Create a function to encode high-cardinality cateogrical features

def designate_single_observations(df1, df2, column):
    ps = df1[column].append(df2[column])
    grouped = ps.groupby(ps).size().to_frame().rename(columns={0: "size"})
    df1.loc[df1.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    df2.loc[df2.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    return df1, df2


def hcc_encode(train_df, test_df, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """
    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca
    """
    hcc_name = "_".join(["hcc", variable, target])

    grouped = train_df.groupby(variable)[target].agg({"size": "size", "mean": "mean"})
    grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f))
    grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob

    df = test_df[[variable]].join(grouped, on=variable, how="left")[hcc_name].fillna(prior_prob)
    if r_k: df *= np.random.uniform(1 - r_k, 1 + r_k, len(test_df))     # Add uniform noise. Not mentioned in original paper

    if update_df is None: update_df = test_df
    if hcc_name not in update_df.columns: update_df[hcc_name] = np.nan
    update_df.update(df)
    return

In [31]:
# for col in ('building_id', 'manager_id', 'display_address'):
#     train_data, test_data = designate_single_observations(train_data, test_data, col)
    
prior_low, prior_medium, prior_high = train_data[["low", "medium", "high"]].mean() 

skf = model_selection.StratifiedKFold(5)
attributes = product(("building_id", "manager_id"), zip(("medium", "high"), (prior_medium, prior_high)))
for variable, (target, prior) in attributes:
    hcc_encode(train_data, test_data, variable, target, prior, k=5, r_k=None)
    for train, test in skf.split(np.zeros(len(train_data)), train_data['interest_level']):
        hcc_encode(train_data.iloc[train], train_data.iloc[test], variable, target, prior, k=5, r_k=0.01,
                   update_df=train_data)
        
hcc_data = pd.concat([train_data[['building_id', 'manager_id', 'display_address',
            'hcc_building_id_medium','hcc_building_id_high',
            'hcc_manager_id_medium','hcc_manager_id_high']],
           test_data[['building_id', 'manager_id', 'display_address',
            'hcc_building_id_medium','hcc_building_id_high',
            'hcc_manager_id_medium','hcc_manager_id_high']]
           ]
          )
full_data['building_id'] = hcc_data['building_id']
full_data['manager_id'] = hcc_data['manager_id']
full_data['display_address'] = hcc_data['display_address']
full_data['hcc_building_id_medium'] = hcc_data['hcc_building_id_medium']
full_data['hcc_building_id_high'] = hcc_data['hcc_building_id_high']
full_data['hcc_manager_id_medium'] = hcc_data['hcc_manager_id_medium']
full_data['hcc_manager_id_high'] = hcc_data['hcc_manager_id_high']
hcc_vars = ['hcc_building_id_medium','hcc_building_id_high','hcc_manager_id_medium','hcc_manager_id_high']    

is deprecated and will be removed in a future version


### Text Features

#### 1. Features

In [32]:
full_data["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
cntvec = CountVectorizer(stop_words='english', max_features=200)
feature_sparse =cntvec.fit_transform(full_data["features"]\
                                     .apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x])))

feature_vars = ['feature_' + v for v in cntvec.vocabulary_]

#### 2. Description

##### tf-idf not working, instead, using CountVectorizer, not working either

In [33]:
# tfidf = TfidfVectorizer(stop_words='english', max_features=10)
# desc_sparse = tfidf.fit_transform(full_data["description"])
# desc_vars = ['desc_' + v for v in tfidf.get_feature_names()]

In [34]:
cntvec = CountVectorizer(stop_words='english', max_features=100)
desc_sparse = cntvec.fit_transform(full_data["description"])
desc_vars = ['desc_' + v for v in cntvec.vocabulary_]

##### word2vec - to be added

#### Street Address

In [35]:
cntvec = CountVectorizer(stop_words='english', max_features=10)
st_addr_sparse = cntvec.fit_transform(full_data["street_address"])
st_addr_vars = ['desc_' + v for v in cntvec.vocabulary_]

#### image vars - to be added

### Numberic vs Categorical Interactions

In [36]:
price_by_manager = full_data.groupby('manager_id')['price'].agg([np.min,np.max,np.median,np.mean]).reset_index()
price_by_manager.columns = ['manager_id','min_price_by_manager',
                            'max_price_by_manager','median_price_by_manager','mean_price_by_manager']
full_data = pd.merge(full_data,price_by_manager, how='left',on='manager_id')

price_by_building = full_data.groupby('building_id')['price'].agg([np.min,np.max,np.median,np.mean]).reset_index()
price_by_building.columns = ['building_id','min_price_by_building',
                            'max_price_by_building','median_price_by_building','mean_price_by_building']
full_data = pd.merge(full_data,price_by_building, how='left',on='building_id')

price_by_disp_addr = full_data.groupby('display_address')['price'].agg([np.min,np.max,np.median,np.mean]).reset_index()
price_by_disp_addr.columns = ['display_address','min_price_by_disp_addr',
                            'max_price_by_disp_addr','median_price_by_disp_addr','mean_price_by_disp_addr']
full_data = pd.merge(full_data,price_by_disp_addr, how='left',on='display_address')

num_cat_vars = ['median_price_by_manager','mean_price_by_manager',
                'median_price_by_building','mean_price_by_building',
                'median_price_by_disp_addr','mean_price_by_disp_addr'
               ]

full_data['price_percentile_by_manager']=\
            full_data[['price','min_price_by_manager','max_price_by_manager']]\
            .apply(lambda x:(x[0]-x[1])/(x[2]-x[1]) if (x[2]-x[1])!=0 else 0.5,
                  axis=1)
full_data['price_percentile_by_building']=\
            full_data[['price','min_price_by_building','max_price_by_building']]\
            .apply(lambda x:(x[0]-x[1])/(x[2]-x[1]) if (x[2]-x[1])!=0 else 0.5,
                  axis=1)
full_data['price_percentile_by_disp_addr']=\
            full_data[['price','min_price_by_disp_addr','max_price_by_disp_addr']]\
            .apply(lambda x:(x[0]-x[1])/(x[2]-x[1]) if (x[2]-x[1])!=0 else 0.5,
                  axis=1)

num_cat_vars.append('price_percentile_by_manager')
num_cat_vars.append('price_percentile_by_building')
num_cat_vars.append('price_percentile_by_disp_addr')

### Listing ID matters

In [37]:
min_listing_id = full_data['listing_id'].min()
max_listing_id = full_data['listing_id'].max()
full_data['listing_id_pos']=full_data['listing_id'].apply(lambda x:np.float64((x-min_listing_id+1))/(max_listing_id-min_listing_id+1))
num_vars.append('listing_id')
num_vars.append('listing_id_pos')

### Create training and testing data

In [38]:
%%time
##Baseline with features from "features" and street address

full_vars = num_vars + date_num_vars + additional_num_vars + interactive_num_vars + LE_vars + hcc_vars + num_cat_vars

train_x = sparse.hstack([full_data[full_vars], feature_sparse,st_addr_sparse]).tocsr()[:train_size]
train_y = full_data['target'][:train_size].values

test_x = sparse.hstack([full_data[full_vars], feature_sparse,st_addr_sparse]).tocsr()[train_size:]
test_y = full_data['target'][train_size:].values


full_vars = full_vars + feature_vars + st_addr_vars
print ("training data size: ", train_x.shape,"testing data size: ", test_x.shape)


params = dict()
params['objective'] = 'multi:softprob'
params['num_class'] = 3
params['eta'] = 0.1
params['max_depth'] = 6
params['min_child_weight'] = 1
params['subsample'] = 0.7
params['colsample_bytree'] = 0.7
params['gamma'] = 1
params['seed']=1234

cv_results = xgb.cv(params, xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0],1)),
               num_boost_round=1000000, nfold=5,
       metrics={'mlogloss'},
       seed=1234,
       callbacks=[xgb.callback.early_stop(50)])


training data size:  (49352, 253) testing data size:  (74659, 253)
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[263]	train-mlogloss:0.336701+0.0015079	test-mlogloss:0.533393+0.00585737

CPU times: user 19min 51s, sys: 16.6 s, total: 20min 7s
Wall time: 11min 53s


## Manuanl Tuning

* Greedy-search
* Tune one parameter a time
* The results can be used for further tuning (by Bayesian Optimizer)

In [39]:
%%time
xgb_scores = pd.DataFrame()
scores = []
for max_depth in [3,4,5,6,7,8,9,10]:

    params = dict()
    params['objective'] = 'multi:softprob'
    params['num_class'] = 3
    params['eta'] = 0.1
    params['max_depth'] = max_depth
    params['min_child_weight'] = 1
    params['subsample'] = 1
    params['colsample_bytree'] = 1
    params['gamma'] = 0
    params['seed']=1234

    cv_results = xgb.cv(params, xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0],1)),
                   num_boost_round=1000000,
                   nfold=5,
           metrics={'mlogloss'},
           seed=1234,
           callbacks=[xgb.callback.early_stop(50)])
    best_iteration = len(cv_results)
    best_score = cv_results['test-mlogloss-mean'].min()
    print (max_depth,best_score,best_iteration)
    scores.append([best_score,params['eta'],params['max_depth'],params['min_child_weight'],
                      params['colsample_bytree'],params['subsample'],params['gamma'],best_iteration])
xgb_scores = pd.concat([xgb_scores, pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration'])])    
best_max_depth = int(pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration']).\
                     sort_values(by='score',ascending=True)['max_depth'].values[0])
print ('best max_depth is', best_max_depth)

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[958]	train-mlogloss:0.429892+0.000876112	test-mlogloss:0.536051+0.0064062

3 0.5360507999999999 959
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[635]	train-mlogloss:0.393214+0.000905466	test-mlogloss:0.534002+0.00639002

4 0.5340022 636
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[380]	train-mlogloss:0.377163+0.00245438	test-mlogloss:0.534756+0.00590177

5 0.534756 381
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[266]	t

In [40]:
%%time
scores = []
for min_child_weight in [1,3,10,30,100]:

    params = dict()
    params['objective'] = 'multi:softprob'
    params['num_class'] = 3
    params['eta'] = 0.1
    params['max_depth'] = best_max_depth
    params['min_child_weight'] = min_child_weight
    params['subsample'] = 1
    params['colsample_bytree'] = 1
    params['gamma'] = 0
    params['seed']=1234

    cv_results = xgb.cv(params, xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0],1)),
                   num_boost_round=1000000,
                   nfold=5,
           metrics={'mlogloss'},
           seed=1234,
           callbacks=[xgb.callback.early_stop(50)])
    best_iteration = len(cv_results)
    best_score = cv_results['test-mlogloss-mean'].min()
    print (min_child_weight,best_score,best_iteration)
    scores.append([best_score,params['eta'],params['max_depth'],params['min_child_weight'],
                      params['colsample_bytree'],params['subsample'],params['gamma'],best_iteration])
xgb_scores = pd.concat([xgb_scores, pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration'])])    
best_min_child_weight = int(pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration']).sort_values(by='score',ascending=True)['min_child_weight'].values[0])
print ('best min_child_weight is', best_min_child_weight)

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[635]	train-mlogloss:0.393214+0.000905466	test-mlogloss:0.534002+0.00639002

1 0.5340022 636
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[643]	train-mlogloss:0.393131+0.00151903	test-mlogloss:0.534294+0.00676678

3 0.5342935999999999 644
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[591]	train-mlogloss:0.403538+0.00180947	test-mlogloss:0.534165+0.00532355

10 0.5341646 592
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[573]

In [41]:
%%time
scores = []
for colsample_bytree in [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:

    params = dict()
    params['objective'] = 'multi:softprob'
    params['num_class'] = 3
    params['eta'] = 0.1
    params['max_depth'] = best_max_depth
    params['min_child_weight'] = best_min_child_weight
    params['subsample'] = 1
    params['colsample_bytree'] = colsample_bytree
    params['gamma'] = 0
    params['seed']=1234

    cv_results = xgb.cv(params, xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0],1)),
                   num_boost_round=1000000,
                   nfold=5,
           metrics={'mlogloss'},
           seed=1234,
           callbacks=[xgb.callback.early_stop(50)])
    best_iteration = len(cv_results)
    best_score = cv_results['test-mlogloss-mean'].min()
    print (colsample_bytree,best_score,best_iteration)
    scores.append([best_score,params['eta'],params['max_depth'],params['min_child_weight'],
                      params['colsample_bytree'],params['subsample'],params['gamma'],best_iteration])
xgb_scores = pd.concat([xgb_scores, pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration'])])    
best_colsample_bytree = pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration']).sort_values(by='score',ascending=True)['colsample_bytree'].values[0]


print ('best colsample_bytree is', best_colsample_bytree)

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[743]	train-mlogloss:0.414957+0.00172298	test-mlogloss:0.533292+0.00587709

0.2 0.5332918 744
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[716]	train-mlogloss:0.409833+0.00249919	test-mlogloss:0.53343+0.00661121

0.3 0.5334296000000001 717
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[627]	train-mlogloss:0.416875+0.00243065	test-mlogloss:0.53284+0.00596032

0.4 0.5328396 628
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[79

In [42]:
%%time
scores = []
for subsample in [0.6,0.7,0.8,0.9,1]:

    params = dict()
    params['objective'] = 'multi:softprob'
    params['num_class'] = 3
    params['eta'] = 0.1
    params['max_depth'] = best_max_depth
    params['min_child_weight'] = best_min_child_weight
    params['subsample'] = subsample
    params['colsample_bytree'] = best_colsample_bytree
    params['gamma'] = 0
    params['seed']=1234

    cv_results = xgb.cv(params, xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0],1)),
                   num_boost_round=1000000,
                   nfold=5,
           metrics={'mlogloss'},
           seed=1234,
           callbacks=[xgb.callback.early_stop(50)])
    best_iteration = len(cv_results)
    best_score = cv_results['test-mlogloss-mean'].min()
    print (subsample,best_score,best_iteration)
    scores.append([best_score,params['eta'],params['max_depth'],params['min_child_weight'],
                      params['colsample_bytree'],params['subsample'],params['gamma'],best_iteration])
xgb_scores = pd.concat([xgb_scores, pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration'])])    
best_subsample = pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration']).sort_values(by='score',ascending=True)['subsample'].values[0]

print ('best subsample is', best_subsample)

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[622]	train-mlogloss:0.416074+0.001868	test-mlogloss:0.534226+0.005924

0.6 0.5342256 623
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[572]	train-mlogloss:0.4217+0.001403	test-mlogloss:0.533259+0.00626368

0.7 0.5332586 573
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[686]	train-mlogloss:0.403628+0.00206652	test-mlogloss:0.53219+0.00608093

0.8 0.5321904000000001 687
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[728]	trai

In [43]:
%%time
scores = []
for gamma in [0,0.5,1,1.5,2]:

    params = dict()
    params['objective'] = 'multi:softprob'
    params['num_class'] = 3
    params['eta'] = 0.1
    params['max_depth'] = best_max_depth
    params['min_child_weight'] = best_min_child_weight
    params['subsample'] = best_subsample
    params['colsample_bytree'] = best_colsample_bytree
    params['gamma'] = gamma
    params['seed']=1234

    cv_results = xgb.cv(params, xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0],1)),
                   num_boost_round=1000000,
                   nfold=5,
           metrics={'mlogloss'},
           seed=1234,
           callbacks=[xgb.callback.early_stop(50)])
    best_iteration = len(cv_results)
    best_score = cv_results['test-mlogloss-mean'].min()
    print (gamma,best_score,best_iteration)
    scores.append([best_score,params['eta'],params['max_depth'],params['min_child_weight'],
                      params['colsample_bytree'],params['subsample'],params['gamma'],best_iteration])
xgb_scores = pd.concat([xgb_scores, pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration'])])    
best_gamma = pd.DataFrame(scores,columns=['score','eta','max_depth','min_child_weight',
                                   'colsample_bytree','subsample','gamma','best_iteration']).sort_values(by='score',ascending=True)['gamma'].values[0]

print ('best gamma is', best_gamma)

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[686]	train-mlogloss:0.403628+0.00206652	test-mlogloss:0.53219+0.00608093

0 0.5321904000000001 687
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[652]	train-mlogloss:0.408647+0.00169269	test-mlogloss:0.532065+0.00561747

0.5 0.5320654 653
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[736]	train-mlogloss:0.397305+0.00146662	test-mlogloss:0.532279+0.00555404

1 0.5322785999999999 737
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteratio

## Automated Tuning

* https://github.com/fmfn/BayesianOptimization


In [44]:
from bayes_opt import BayesianOptimization

xgtrain = xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0],1))

def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma):
    params = dict()
    params['objective'] = 'multi:softprob'
    params['num_class'] = 3
    params['eta'] = 0.1
    params['max_depth'] = int(max_depth )   
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = colsample_bytree
    params['subsample'] = subsample
    params['gamma'] = gamma
    params['verbose_eval'] = True    


    cv_result = xgb.cv(params, xgtrain,
                       num_boost_round=100000,
                       nfold=5,
                       metrics={'mlogloss'},
                       seed=1234,
                       callbacks=[xgb.callback.early_stop(50)])

    return -cv_result['test-mlogloss-mean'].min()


xgb_BO = BayesianOptimization(xgb_evaluate, 
                             {'max_depth': (4, 8),
                              'min_child_weight': (0, 100),
                              'colsample_bytree': (0.2, 0.7),
                              'subsample': (0.7, 1),
                              'gamma': (0, 2)
                             }
                            )

xgb_BO.maximize(init_points=5, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[664]	train-mlogloss:0.421339+0.000681717	test-mlogloss:0.532639+0.00573918

    1 | 09m08s | [35m  -0.53264[0m | [32m            0.4514[0m | [32m   1.3906[0m | [32m     4.7931[0m | [32m           51.7031[0m | [32m     0.8260[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[331]	train-mlogloss:0.379085+0.000902274	test-mlogloss:0.53208+0.00475385

    2 | 07m19s | [35m  -0.53208[0m | [32m            0.4619[0m |



[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[243]	train-mlogloss:0.35613+0.00131962	test-mlogloss:0.531627+0.00562743

    6 | 09m33s |   -0.53163 |             0.6821 |    1.9926 |      7.9487 |            17.7515 |      0.7957 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[455]	train-mlogloss:0.385036+0.000847493	test-mlogloss:0.533828+0.00565615

    7 | 06m28s |   -0.53383 |             0.2253 |    1.8849 |      7.9444 |            61.1723 |      0.7245 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[732]	train-mlogloss:0.380285+0.00116659	test-mlogloss:0.534154+0.00570804

    8 | 06m42s |   -0.53415 |             0.2481 |    0.0315 |      4.1202 |             0.0117 |      0.7285 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[771]	train-mlogloss:0.420303+0.00179888	test-mlogloss:0.534284+0.00581202

    9 | 14m15s |   -0.53428 |             0.6651 |    1.9934 |      4.1291 |            83.3612 |      0.7008 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[751]	train-mlogloss:0.395428+0.00167569	test-mlogloss:0.531833+0.00587994

   10 | 14m37s |   -0.53183 |             0.6875 |    1.9625 |      4.1302 |            18.6184 |      0.9508 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[850]	train-mlogloss:0.423143+0.000620016	test-mlogloss:0.53361+0.00591265

   11 | 07m39s |   -0.53361 |             0.2571 |    0.0310 |      4.0126 |            81.3784 |      0.9965 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[249]	train-mlogloss:0.317407+0.00205499	test-mlogloss:0.531648+0.00611509

   12 | 10m08s |   -0.53165 |             0.6963 |    0.0857 |      7.7857 |             7.4061 |      0.9980 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[297]	train-mlogloss:0.3626+0.00109793	test-mlogloss:0.530679+0.00569081

   13 | 11m19s | [35m  -0.53068[0m | [32m            0.6965[0m | [32m   1.7892[0m | [32m     7.8753[0m | [32m           39.1605[0m | [32m     0.9618[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[318]	train-mlogloss:0.392079+0.0013169

  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[306]	train-mlogloss:0.306799+0.00222005	test-mlogloss:0.530631+0.00650793

   16 | 09m52s | [35m  -0.53063[0m | [32m            0.5405[0m | [32m   1.9057[0m | [32m     7.9225[0m | [32m            0.2469[0m | [32m     0.9394[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[272]	train-mlogloss:0.374891+0.00191893	test-mlogloss:0.531085+0.00536618

   17 | 10m07s |   -0.53108 |             0.6754 |    0.0773 |      7.9590 |            47.4133 |      0.9948 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[400]	train-mlogloss:0.335806+0.00063

  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[725]	train-mlogloss:0.411064+0.00172843	test-mlogloss:0.533173+0.00570503

   19 | 08m37s |   -0.53317 |             0.3743 |    0.0345 |      4.3025 |            39.7207 |      0.9970 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[757]	train-mlogloss:0.383803+0.00157551	test-mlogloss:0.531284+0.00544502

   20 | 14m32s |   -0.53128 |             0.6942 |    1.9944 |      4.4958 |             5.2424 |      0.9152 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[295]	train-mlogloss:0.359631+0.00222183	test-mlogloss:0.530929+0.00595874

   21 | 09m59s |   -0.53093 |             0.6183 |    1.9930 |      7.9508 |            28.1155 |      0.9971 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[476]	train-mlogloss:0.375193+0.000551209	test-mlogloss:0.535225+0.00610695

   22 | 06m37s |   -0.53523 |             0.2225 |    0.4305 |      7.9226 |            77.7119 |      0.7058 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[871]	train-mlogloss:0.417301+0.00126282	test-mlogloss:0.53337+0.00595648

   23 | 15m39s |   -0.53337 |             0.6550 |    1.2942 |      4.0978 |            99.9709 |      0.9784 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1014]	train-mlogloss:0.431956+0.00367901	test-mlogloss:0.532465+0.00565556

   24 | 18m26s |   -0.53247 |             0.6665 |    1.9084 |      4.1526 |            67.4150 |      0.9997 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[426]	train-mlogloss:0.381627+0.00219874	test-mlogloss:0.531337+0.00625846

   25 | 14m58s |   -0.53134 |             0.6942 |    1.9010 |      7.5631 |            92.2117 |      0.9938 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[380]	train-mlogloss:0.379907+0.00196816	test-mlogloss:0.530658+0.00540276

   26 | 13m30s |   -0.53066 |             0.6796 |    1.9261 |      7.4059 |            73.1149 |      0.9947 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[237]	train-mlogloss:0.332661+0.00198966	test-mlogloss:0.532112+0.00523573

   27 | 09m34s 

  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[411]	train-mlogloss:0.376898+0.00111545	test-mlogloss:0.531383+0.00598973

   28 | 12m45s |   -0.53138 |             0.6831 |    0.1204 |      6.4247 |            56.0197 |      0.9971 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[319]	train-mlogloss:0.331205+0.0025693	test-mlogloss:0.530266+0.00614409

   29 | 06m27s | [35m  -0.53027[0m | [32m            0.3013[0m | [32m   0.0072[0m | [32m     7.4086[0m | [32m           20.9770[0m | [32m     0.9886[0m | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[297]	train-mlogloss:0.385024+0.00227885	test-mlogloss:0.531139+0.00533732

   30 | 11m03s |   -0.53114 |             0.6648 |    0.3351 |      7.8645 |            66.8106 |      0.9934 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[382]	train-mlogloss:0.386835+0.00153788	test-mlogloss:0.531992+0.00592885

   31 | 13m44s |   -0.53199 |             0.6824 |    0.0176 |      7.6006 |            98.9509 |      0.9977 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[360]	train-mlogloss:0.366949+0.0015266	test-mlogloss:0.529938+0.0053791

   32 | 12m19s | [35m  -0.52994[0m | [32m            0.6176[0m | [32m   1.9456[0m | [32m     7.6936[0m | [32m           51.3186[0m | [32m     0.9972[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[746]	train-mlogloss:0.402632+0.00173635	test-mlogloss:0.533307+0.00612593

   33 | 06m38s |   -0.53331 |             0.2256 |    0.1606 |      4.8848 |            13.8207 |      0.9940 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[805]	train-mlogloss:0.393449+0.00178109	test-mlogloss:0.53184+0.00539254

   34 | 11m33s |   -0.53184 |             0.5071 |    1.9999 |      4.7618 |             0.1091 |      0.9626 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[419]	train-mlogloss:0.320418+0.00184309	test-mlogloss:0.53114+0.00598321

   35 | 04m19s |   -0.53114 |             0.2031 |    1.8755 |      7.9402 |             3.5698 |      0.9687 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[372]	train-mlogloss:0.342411+0.00148167	test-mlogloss:0.531564+0.00581618

   36 | 04m05s |   -0.53156 |             0.2305 |    0.1302 |      7.8416 |            34.0013 |      0.9881 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[584]	train-mlogloss:0.398661+0.00179732	test-mlogloss:0.531492+0.00593788

   37 | 04m36s |   -0.53149 |             0.2160 |    1.7859 |      5.8837 |            24.4480 |      0.9995 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[325]	train-mlogloss:0.368698+0.00219931	test-mlogloss:0.530882+0.00555294

   38 | 07m17s |   -0.53088 |             0.6792 |    0.0168 |      6.9453 |            25.5323 |      0.9705 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[371]	train-mlogloss:0.386683+0.000985384	test-mlogloss:0.531046+0.0058204

   39 | 07m02s |   -0.53105 |             0.6597 |    1.9690 |      6.0117 |            45.5006 |      0.9748 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[575]	train-mlogloss:0.426814+0.00231161	test-mlogloss:0.533574+0.005126

   40 | 07m56s |   -0.53357 |             0.6972 |    1.7498 |      4.0241 |            33.2748 |      0.9919 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[187]	train-mlogloss:0.314608+0.00128621	test-mlogloss:0.532862+0.00630859

   41 | 06m04s | 

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1093]	train-mlogloss:0.405483+0.00219932	test-mlogloss:0.532698+0.0060757

   42 | 07m00s |   -0.53270 |             0.2071 |    1.9605 |      5.6062 |            89.8575 |      0.9891 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[529]	train-mlogloss:0.363774+0.00110615	test-mlogloss:0.531196+0.00572672

   43 | 05m00s |   -0.53120 |             0.2130 |    1.9776 |      7.9474 |            46.1771 |      0.9781 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[377]	train-mlogloss:0.364472+0.00130237	test-mlogloss:0.530664+0.00512062

   44 | 08m57s |   -0.53066 |             0.6405 |    1.9702 |      7.9757 |            54.6617 |      0.9847 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[417]	train-mlogloss:0.381525+0.00210438	test-mlogloss:0.531444+0.00610889

   45 | 10m07s |   -0.53144 |             0.6949 |    1.9897 |      7.4946 |            85.0715 |      0.9959 | 


In [45]:
## Show tuning results
BO_scores = pd.DataFrame(xgb_BO.res['all']['params'])
BO_scores['score'] = pd.DataFrame(xgb_BO.res['all']['values'])
BO_scores = BO_scores.sort_values(by='score',ascending=False)
BO_scores.head()

Unnamed: 0,colsample_bytree,gamma,max_depth,min_child_weight,subsample,score
26,0.617577,1.945618,7.693556,51.318581,0.997182,-0.529938
23,0.30133,0.007155,7.408636,20.976959,0.988617,-0.530266
12,0.217855,1.97483,7.270835,8.386108,0.995295,-0.530281
10,0.540468,1.905727,7.922502,0.246914,0.939417,-0.530631
20,0.679584,1.92605,7.405918,73.114926,0.994703,-0.530658


### Re-train models

Now we have optimized parameters, let's decrease the size of learning rate and train the model for better results.

Firstly we'll use xgb.cv again to get optimal n_estimators, then we can use tuned n_esimator to finally train the model.

In [46]:
params = dict()
params['objective'] = 'multi:softprob'
params['num_class'] = 3
params['eta'] = 0.01
params['max_depth'] = int(BO_scores.to_dict()['colsample_bytree'][0])
params['min_child_weight'] = BO_scores.to_dict()['min_child_weight'][0]
params['colsample_bytree'] = BO_scores.to_dict()['colsample_bytree'][0]
params['subsample'] = BO_scores.to_dict()['subsample'][0]
params['gamma'] = BO_scores.to_dict()['gamma'][0]
params['seed']=1234

cv_results = xgb.cv(params, xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0],1)),
               num_boost_round=1000000, nfold=5,
       metrics={'mlogloss'},
       seed=1234,
       callbacks=[xgb.callback.early_stop(500)])

best_iteration = len(cv_results)


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 500 rounds.
Stopping. Best iteration:
[1974]	train-mlogloss:0.788571+0.00200216	test-mlogloss:0.788627+0.00801569



In [47]:
clf = xgb.XGBClassifier(n_estimators = best_iteration,
                              learning_rate=0.01,
                              max_depth=int(BO_scores.to_dict()['max_depth'][0]),
                              min_child_weight=BO_scores.to_dict()['min_child_weight'][0],
                              colsample_bytree=BO_scores.to_dict()['colsample_bytree'][0],
                              subsample=BO_scores.to_dict()['subsample'][0],
                              gamma=BO_scores.to_dict()['gamma'][0],
                              seed=1234,
                              nthread=-1)

clf.fit(train_x, train_y)

feature_importance = pd.DataFrame(sorted(zip(full_vars,clf.feature_importances_)
                          , key=lambda x: x[1], reverse = True),columns=['feature_name','importance']) 

print (feature_importance.query('importance>0'))


preds = clf.predict_proba(test_x)
sub_df = pd.DataFrame(preds,columns = ["low", "medium", "high"])
sub_df["listing_id"] = test_data.listing_id.values
sub_df.to_csv("../output/sub_xgb_tuned.csv", index=False)

                           feature_name  importance
0                   hcc_manager_id_high    0.047156
1                              latitude    0.045967
2                             longitude    0.044390
3                 hcc_manager_id_medium    0.043440
4                     price_per_bedroom    0.042527
5                        price_per_room    0.040639
6                hcc_building_id_medium    0.038407
7                                 price    0.038387
8                  hcc_building_id_high    0.037708
9           price_percentile_by_manager    0.036636
10                      price_per_photo    0.033802
11         price_percentile_by_building    0.033345
12        price_percentile_by_disp_addr    0.032642
13                         avg_word_len    0.030629
14                           listing_id    0.028094
15                   display_address_le    0.027944
16                       building_id_le    0.027123
17                    street_address_le    0.025959
18          

FileNotFoundError: [Errno 2] No such file or directory: '../output/sub_xgb_tuned.csv'

In [None]:
sub_df.to_csv("sub_xgb_tuned.csv", index=False)