In [1]:
import numpy as np
import pandas as pd

## Read Data ##

In [2]:
df_train = pd.read_csv('ptrain.csv', encoding='utf-8')
df_train['id'] = df_train['id'].apply(str)

In [3]:
df_test = pd.read_csv('ptest.csv', encoding='utf-8')
df_test['test_id'] = df_test['test_id'].apply(str)

In [4]:
df_all = pd.concat((df_train, df_test))
df_all['question1'].fillna('', inplace=True)
df_all['question2'].fillna('', inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [5]:
df_all.to_csv("concat.csv")

## Create Vocab ##

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import itertools

In [7]:
counts_vectorizer = CountVectorizer(max_features=10000-1).fit( \
                    itertools.chain(df_all['question1'], df_all['question2']))
other_index = len(counts_vectorizer.vocabulary_)

##Prep Data##

In [8]:
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [9]:
words_tokenizer = re.compile(counts_vectorizer.token_pattern)

In [10]:
def create_padded_seqs(texts, max_len=10):
    seqs = texts.apply(lambda s: 
        [counts_vectorizer.vocabulary_[w] if w in counts_vectorizer.vocabulary_ else other_index
         for w in words_tokenizer.findall(s.lower())])
    return pad_sequences(seqs, maxlen=max_len)

In [11]:
X1_train, X1_val, X2_train, X2_val, y_train, y_val = \
    train_test_split(create_padded_seqs(df_all[df_all['id'].notnull()]['question1']), 
                     create_padded_seqs(df_all[df_all['id'].notnull()]['question2']),
                     df_all[df_all['id'].notnull()]['is_duplicate'].values,
                     stratify=df_all[df_all['id'].notnull()]['is_duplicate'].values,
                     test_size=0.3, random_state=1989)

##Training##

In [12]:
import keras.layers as lyr
from keras.models import Model

In [13]:
input1_tensor = lyr.Input(X1_train.shape[1:])
input2_tensor = lyr.Input(X2_train.shape[1:])

words_embedding_layer = lyr.Embedding(X1_train.max() + 1, 100)
seq_embedding_layer = lyr.LSTM(256, activation='tanh')

seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor))

merge_layer = lyr.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])

dense1_layer = lyr.Dense(16, activation='sigmoid')(merge_layer)
ouput_layer = lyr.Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)

model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 100)      1000000     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None

In [14]:
model.fit([X1_train, X2_train], y_train, 
          validation_data=([X1_val, X2_val], y_val), 
          batch_size=128, epochs=1, verbose=2)

Instructions for updating:
Use tf.cast instead.
Train on 283000 samples, validate on 121287 samples
Epoch 1/1
 - 294s - loss: 0.4984 - val_loss: 0.4605


<keras.callbacks.History at 0x7f8e797ef7f0>

##Extract Features From Model##

In [15]:
features_model = Model([input1_tensor, input2_tensor], merge_layer)
features_model.compile(loss='mse', optimizer='adam')

In [16]:
F_train = features_model.predict([X1_train, X2_train], batch_size=128)
F_val = features_model.predict([X1_val, X2_val], batch_size=128)

##Train XGBoost##

In [17]:
import xgboost as xgb

In [18]:
dTrain = xgb.DMatrix(F_train, label=y_train)
dVal = xgb.DMatrix(F_val, label=y_val)

In [19]:
xgb_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'logloss',
    'eta': 0.1, 
    'max_depth': 9,
    'subsample': 0.9,
    'colsample_bytree': 1 / F_train.shape[1]**0.5,
    'min_child_weight': 5,
    'silent': 1
}
bst = xgb.train(xgb_params, dTrain, 1000,  [(dTrain,'train'), (dVal,'val')], 
                verbose_eval=10, early_stopping_rounds=10)

[0]	train-logloss:0.653118	val-logloss:0.656381
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 10 rounds.
[10]	train-logloss:0.469503	val-logloss:0.496619
[20]	train-logloss:0.418184	val-logloss:0.460799
[30]	train-logloss:0.398081	val-logloss:0.452042
[40]	train-logloss:0.387269	val-logloss:0.449608
[50]	train-logloss:0.37948	val-logloss:0.448558
[60]	train-logloss:0.373128	val-logloss:0.447942
[70]	train-logloss:0.367407	val-logloss:0.447252
[80]	train-logloss:0.36202	val-logloss:0.44676
[90]	train-logloss:0.357376	val-logloss:0.446459
[100]	train-logloss:0.352835	val-logloss:0.446073
[110]	train-logloss:0.349039	val-logloss:0.44574
[120]	train-logloss:0.345726	val-logloss:0.445452
[130]	train-logloss:0.341884	val-logloss:0.445249
[140]	train-logloss:0.33845	val-logloss:0.445006
[150]	train-logloss:0.335255	val-logloss:0.444818
[160]	train-logloss:0.332124	val-logloss:0.44464
[170]	train-logloss:

##Predict Test##

In [20]:
X1_test = create_padded_seqs(df_all[df_all['test_id'].notnull()]['question1'])
X2_test = create_padded_seqs(df_all[df_all['test_id'].notnull()]['question2'])

In [21]:
F_test = features_model.predict([X1_test, X2_test], batch_size=128)

In [22]:
dTest = xgb.DMatrix(F_test)

In [23]:
df_sub = pd.DataFrame({
        'test_id': df_all[df_all['test_id'].notnull()]['test_id'].values,
        'is_duplicate': bst.predict(dTest, ntree_limit=bst.best_ntree_limit)
    }).set_index('test_id')

In [24]:
df_sub.head()

Unnamed: 0_level_0,is_duplicate
test_id,Unnamed: 1_level_1
0,0.119859
1,0.044689
2,0.667651
3,0.571104
4,0.423248


In [26]:
df1 = pd.read_csv("submissions.csv")
df1 = df1.iloc[:200000,:]
df1 = df1.rename(index=str, columns={"test_id": "test_id", "is_duplicate": "target"})
df1.head()

Unnamed: 0,test_id,target
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [29]:
predicted = [0 if x < 0.05 else 1 for x in df_sub["is_duplicate"].values]
actual = df1["target"].values
res = [1 for i in range(len(predicted)) if predicted[i] == actual[i]]
print(np.sum(res)/len(predicted))

0.693535


In [31]:
def logloss(ptest):
    s = 0
    for res in ptest:
        s+=np.log(res)
    return -s

print(logloss(df_sub["is_duplicate"])/len(df_sub["is_duplicate"]))

2.208124678164954
