In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('../input/train_features.csv')

In [3]:
y_train = df_train['is_duplicate'].values

In [5]:
x_train = df_train.drop(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], axis=1)

In [6]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

0.19124366100096607


In [7]:
from sklearn.cross_validation import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2)



In [16]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.03
params['max_depth'] = 9

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=10)

[0]	train-logloss:0.67711	valid-logloss:0.677159
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.557498	valid-logloss:0.558054
[20]	train-logloss:0.483751	valid-logloss:0.484822
[30]	train-logloss:0.434813	valid-logloss:0.436485
[40]	train-logloss:0.401283	valid-logloss:0.403618
[50]	train-logloss:0.377593	valid-logloss:0.380626
[60]	train-logloss:0.360541	valid-logloss:0.364197
[70]	train-logloss:0.347859	valid-logloss:0.352121
[80]	train-logloss:0.338894	valid-logloss:0.343727
[90]	train-logloss:0.332009	valid-logloss:0.337374
[100]	train-logloss:0.326268	valid-logloss:0.332267
[110]	train-logloss:0.322022	valid-logloss:0.328578
[120]	train-logloss:0.31857	valid-logloss:0.325635
[130]	train-logloss:0.315742	valid-logloss:0.32327
[140]	train-logloss:0.31348	valid-logloss:0.321438
[150]	train-logloss:0.3115	valid-logloss:0.319858
[160]	train-logloss:0.309841	valid

In [9]:
df_test = pd.read_csv('../input/test_features.csv')

In [11]:
test = df_test.drop(['test_id', 'question1', 'question2'], axis=1)

In [17]:
d_test = xgb.DMatrix(test)
p_test = bst.predict(d_test)

In [18]:
probs=np.where(df_test.question1.str.lower() == df_test.question2.str.lower, 1, p_test)

In [19]:
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = probs
sub.to_csv('../submissions/simple_xgb_v3.csv', index=False)