In [21]:
from __future__ import print_function, division

import sys
import logging
stdout = sys.stdout

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

# Python 2-only hack: reload(sys) makes sys.setdefaultencoding reachable again
# so the default str<->unicode codec can be switched to latin-1. The handle
# saved in `stdout` before the imports is put back afterwards, in case the
# reload replaced sys.stdout. Skipped on Python 3, where `reload` is not a
# builtin and the default encoding cannot be changed.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('latin')
    sys.stdout = stdout
# basicConfig takes `level`; an unknown keyword such as `loglevel` is ignored
# on Python 2 (level never set) and rejected with ValueError on Python 3.
logging.basicConfig(level=logging.ERROR)

In [22]:
# Read the train / test CSVs and replace every missing value with "" so the
# question columns can be lower-cased and split without NaN handling.
train_data = pd.read_csv("../datasets/train.csv").fillna("")
test_data = pd.read_csv("../datasets/test_unique.csv").fillna("")

print("train data shape:", train_data.shape)
print("test data shape:", test_data.shape)

train data shape: (404290, 6)
test data shape: (2345796, 3)


In [None]:
# nltk.download('stopwords')

In [23]:
# English stopwords from the NLTK corpus, kept as a set for membership tests
# in the tokenizer. Requires the corpus to be available locally
# (see the commented-out nltk.download('stopwords') cell).
eng_stopwords = set(stopwords.words("english"))

In [24]:
def base_score(w1, w2):
    """
    Overlap score: distinct tokens common to both lists, divided by the
    combined length of the two lists (duplicates counted in the lengths).

    :param w1: list. Tokens of the first text.
    :param w2: list. Tokens of the second text.
    :return: 0 when both lists are empty, otherwise
        ``len(set(w1) & set(w2)) / (len(w1) + len(w2))``.
    """
    total_tokens = len(w1) + len(w2)
    if not total_tokens:
        # Nothing to compare; also avoids dividing by zero.
        return 0
    common_tokens = set(w1).intersection(w2)
    return len(common_tokens) / total_tokens

In [25]:
def tfidf_score(w1, w2):
    """
    Placeholder for the "tfidf" scoring mode; not implemented yet.

    :param w1: list. Currently ignored.
    :param w2: list. Currently ignored.
    :return: always 0.
    """
    return 0

In [31]:
def words_shared_score(row, scoring="base"):
    """
    Score how many non-stopword tokens two questions have in common.

    :param row: mapping-like object indexable by 'question1' and 'question2'
        (e.g. a dict or a DataFrame row), or None.
    :param scoring: "base" to use base_score, "tfidf" to use tfidf_score.
    :return: 0 when row is None, otherwise the value of the chosen scorer.
    :raises ValueError: if scoring is neither "base" nor "tfidf".
    """
    if row is None:
        return 0

    def _content_words(text):
        # Lower-case, split on whitespace and drop English stopwords.
        # Punctuation stays attached to tokens ("water?" != "water");
        # nltk's word_tokenize would be the alternative here.
        lowered = text.lower()
        return [tok for tok in lowered.split() if tok not in eng_stopwords]

    first_text, second_text = row['question1'], row['question2']
    first_tokens = _content_words(first_text)
    second_tokens = _content_words(second_text)

    if scoring == "base":
        scorer = base_score
    elif scoring == "tfidf":
        scorer = tfidf_score
    else:
        raise ValueError("scoring must be base or tfidf")
    return scorer(first_tokens, second_tokens)

In [32]:
# Quick sanity check of the default ("base") scoring on a hand-written pair.
words_shared_score({"question1": "I'm ok", "question2": "not ok"})

0.3333333333333333

In [33]:
# Build the single-feature design matrices: one match_words score per row.
# The scorer indexes each row by column label (row['question1']), so rows must
# arrive as labelled Series. raw=True would pass bare ndarrays instead, which
# cannot be indexed by label, hence it is not used here.
X_train = pd.DataFrame()
X_test = pd.DataFrame()
X_train["match_words"] = train_data.apply(words_shared_score, axis=1)
X_test["match_words"] = test_data.apply(words_shared_score, axis=1)

In [34]:
# Oversample negative rows (with replacement) so that the share of positives
# in the training features drops to the target rate below.
# NOTE: this cell rebinds X_train, so it must be run exactly once after the
# feature-building cell; re-running it alone would misalign neg_idx.
train_pos_rate = train_data.is_duplicate.mean()
# Target positive rate to match (hard-coded estimate -- TODO confirm its source).
test_pos_rate = 0.165
neg_idx = train_data.is_duplicate == 0
neg_data = X_train[neg_idx]
n_total = train_data.shape[0]
n_pos = n_total - neg_data.shape[0]
# Solve n_pos / (n_total + add_n) == test_pos_rate for add_n. Clamp at 0 so a
# training set that is already at or below the target rate is left unchanged
# instead of asking sample() for a negative number of rows.
add_n = max(int(n_pos / test_pos_rate - n_total), 0)
print("add n:", add_n)
# Fixed seed so the oversampled rows (and everything trained on them) are
# reproducible, consistent with the seeded train/validation split.
X_train = pd.concat([X_train, neg_data.sample(add_n, replace=True, random_state=40)], axis=0)
print(n_pos / X_train.shape[0])
# Labels: the original labels followed by one 0 per added negative row.
y_train = np.array(list(train_data.is_duplicate.values) + [0] * add_n)

add n: 500334
0.165000044217


In [9]:
# X_train = pd.DataFrame()
# X_test = pd.DataFrame()
# X_train["match_words"] = train_data_sampling.apply(words_shared_score, axis=1, raw=True)
# X_test["match_words"] = test_data.apply(words_shared_score, axis=1, raw=True)

In [20]:
# Show the computed feature next to the questions it was derived from.
# train_data is used because no live cell defines train_data_sampling; the
# first five rows share index labels 0..4 with X_train, so they line up.
pd.concat([X_train.head(), train_data.head()], axis=1)

Unnamed: 0,match_words,id,qid1,qid2,question1,question2,is_duplicate
0,0.4,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,0.315789,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,0.230769,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,0.05,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,0.157895,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [12]:
# Check whether "which" is in the stopword set (and is therefore dropped by the tokenizer).
'which' in eng_stopwords

True

In [35]:
# Hold out 10% of the (oversampled) rows for validation, stratified on the label.
# NOTE(review): negatives were oversampled with replacement before this split,
# so copies of the same row can end up on both sides -- confirm this is acceptable.
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=40,
    stratify=y_train
)

In [36]:
# Binary logistic booster, evaluated with log loss on both splits.
params = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.02,
    max_depth=4,
    silent=1
)
d_train = xgb.DMatrix(X_train_sub, label=y_train_sub)
d_val = xgb.DMatrix(X_val_sub, label=y_val_sub)
# Both sets are reported every 10 rounds (see recorded output).
watch_list = [(d_train, 'train'), (d_val, 'validate')]
print("train on", X_train_sub.shape[0], "samples, validate on", X_val_sub.shape[0], "samples.")
# Up to 400 rounds, stopping early after 50 rounds without improvement.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watch_list,
    early_stopping_rounds=50,
    verbose_eval=10
)

train on 814161 samples, validate on 90463 samples.
[0]	train-logloss:0.682391	validate-logloss:0.682364
Multiple eval metrics have been passed: 'validate-logloss' will be used for early stopping.

Will train until validate-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.594753	validate-logloss:0.594521
[20]	train-logloss:0.533052	validate-logloss:0.532689
[30]	train-logloss:0.488157	validate-logloss:0.487701
[40]	train-logloss:0.454756	validate-logloss:0.45422
[50]	train-logloss:0.429493	validate-logloss:0.428879
[60]	train-logloss:0.410155	validate-logloss:0.409479
[70]	train-logloss:0.395194	validate-logloss:0.39447
[80]	train-logloss:0.383554	validate-logloss:0.382791
[90]	train-logloss:0.374476	validate-logloss:0.373683
[100]	train-logloss:0.36738	validate-logloss:0.366562
[110]	train-logloss:0.361785	validate-logloss:0.360949
[120]	train-logloss:0.357365	validate-logloss:0.356513
[130]	train-logloss:0.353863	validate-logloss:0.353
[140]	train-logloss:0.351082	validate-

In [54]:
# Row count and number of positives, for the full label vector and then for
# the training split (one value per line).
print(len(y_train), y_train.sum(), len(y_train_sub), y_train_sub.sum(), sep="\n")

904624
149263
814161
134337


In [37]:
# Score the test features with the trained booster.
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test)

# Submission frame: test ids alongside the predicted probabilities, written
# as a gzip-compressed CSV without the index column.
sub = pd.DataFrame(
    {'test_id': test_data['test_id'], 'is_duplicate': p_test},
    columns=['test_id', 'is_duplicate']
)
sub.to_csv("../xgb_shared_words_with_sampling_just_split.csv.gz", index=False, compression="gzip")

In [38]:
# Submission dimensions: one row per row of test_data, two columns (test_id, is_duplicate).
sub.shape

(2345796, 2)

In [39]:
# Peek at the first rows of the submission frame.
sub.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.033063
1,1,0.324146
2,2,0.33176
3,3,0.000723
4,4,0.280286


In [18]:
# Shapes of the four split outputs, one per line.
shape_data = [X_train_sub, X_val_sub, y_train_sub, y_val_sub]
print(*(part.shape for part in shape_data), sep="\n")

(814161, 1)
(90463, 1)
(814161,)
(90463,)


In [28]:
# Debug the score on one real training pair: take the row at position 4 as a
# plain dict, print it, then score it.
# NOTE(review): the recorded output also shows token lists and an empty set,
# which the function as written in this notebook does not print -- presumably
# left over from an earlier debugging version; confirm and re-run.
qs = train_data[4:5].to_dict('records')[0]
print(qs)
words_shared_score(qs)

{'qid2': 10, 'qid1': 9, 'is_duplicate': 0, 'question1': 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', 'question2': 'Which fish would survive in salt water?', 'id': 4}
['one', 'dissolve', 'water', 'quikly', 'sugar,', 'salt,', 'methane', 'carbon', 'di', 'oxide?']
['fish', 'would', 'survive', 'salt', 'water?']
set([])


0.0

In [19]:
# Write the current X_train feature frame to CSV without the index column.
# NOTE(review): X_train is rebound by the oversampling cell, so the file's
# contents depend on which cells ran before this one -- confirm the intended state.
X_train.to_csv('../datasets/ubuntu_shared_word_score.csv', index=False)