In [6]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split

In [2]:
train_df = pd.read_pickle('dataframe')

In [7]:
features = ["unigrams_common_count", "unigrams_common_ratio",
            "bigrams_common_count", "bigrams_common_ratio",
            "trigrams_common_count", "trigrams_common_ratio",
            "nouns_common_ratio",
            "word_match", "tfidf_train_word_match"]

x_train = train_df[features]
y_train = train_df['is_duplicate'].values

In [8]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

0.19124366100096607


In [9]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

In [11]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.1
params['max_depth'] = 4

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=10)

[0]	train-logloss:0.644321	valid-logloss:0.64449
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.432023	valid-logloss:0.433097
[20]	train-logloss:0.37802	valid-logloss:0.379532
[30]	train-logloss:0.361233	valid-logloss:0.362866
[40]	train-logloss:0.355289	valid-logloss:0.356974
[50]	train-logloss:0.352743	valid-logloss:0.354466
[60]	train-logloss:0.351296	valid-logloss:0.353107
[70]	train-logloss:0.350149	valid-logloss:0.352081
[80]	train-logloss:0.349152	valid-logloss:0.351213
[90]	train-logloss:0.348224	valid-logloss:0.350381
[100]	train-logloss:0.347553	valid-logloss:0.349785
[110]	train-logloss:0.346749	valid-logloss:0.349103
[120]	train-logloss:0.346362	valid-logloss:0.348787
[130]	train-logloss:0.345839	valid-logloss:0.348366
[140]	train-logloss:0.345356	valid-logloss:0.34795
[150]	train-logloss:0.3449	valid-logloss:0.347576
[160]	train-logloss:0.344519	vali

In [14]:
bst.save_model('valid-logloss:0.340239.model')