# Tune a combination of word-similarity features on Dev
Features: number of overlapping unigrams and edit distance.

In [55]:
import editdistance
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from joblib import dump, load
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [4]:
def load_dev_devtest(path):
    """
    Load a labelled, tab-separated sentence-pair file (dev / devtest).

    Every non-blank line must hold at least three tab-separated columns:
    first sentence, second sentence, integer label. Both sentences are
    tokenized with word_tokenize.

    Blank (whitespace-only) lines, such as an empty line at the end of the
    file, are skipped instead of raising IndexError.

    returns first_lines, second_lines, labels (three parallel lists)

    Raises IndexError for a non-blank line with fewer than three columns and
    ValueError when the label column is not an integer.
    """
    first_lines = []
    second_lines = []
    labels = []
    with open(path, 'rt') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on this line; keep the three lists aligned
                # by skipping the whole row.
                continue
            chunks = stripped.split('\t')
            # Parse the full row before appending so a malformed row never
            # leaves the output lists with different lengths.
            first = word_tokenize(chunks[0])
            second = word_tokenize(chunks[1])
            label = int(chunks[2])
            first_lines.append(first)
            second_lines.append(second)
            labels.append(label)
    return first_lines, second_lines, labels

def load_test(path):
    """
    Load an unlabelled, tab-separated test file.

    Every non-blank line must hold at least three tab-separated columns:
    id, first sentence, second sentence. The id column is skipped and both
    sentences are tokenized with word_tokenize.

    Blank (whitespace-only) lines, such as an empty line at the end of the
    file, are skipped instead of raising IndexError.

    returns first_lines, second_lines (two parallel lists)

    Raises IndexError for a non-blank line with fewer than three columns.
    """
    first_lines, second_lines = [], []
    with open(path, 'rt') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on this line.
                continue
            chunks = stripped.split('\t')
            # chunks[0] is the id column, which is not used here.
            first = word_tokenize(chunks[1])
            second = word_tokenize(chunks[2])
            first_lines.append(first)
            second_lines.append(second)
    return first_lines, second_lines

In [31]:
def construct_features(first_lines, second_lines):
    """
    Build the feature matrix for tokenized sentence pairs.

    Column 0: number of distinct tokens that occur in both sentences
              (set intersection, so a repeated token counts once).
    Column 1: editdistance.eval of the two token sequences.

    first_lines, second_lines: equally long sequences of token lists; the
    i-th entries of both form one pair.

    Returns a float array of shape (len(first_lines), 2).

    Raises ValueError when the two inputs differ in length. Without this
    check zip() would silently stop at the shorter input and any remaining
    rows of the np.empty buffer would keep uninitialized values.
    """
    num_pairs = len(first_lines)
    if len(second_lines) != num_pairs:
        raise ValueError(
            'first_lines and second_lines must have the same length, got '
            '{} and {}'.format(num_pairs, len(second_lines))
        )

    # np.empty is safe here: the length check guarantees every row is written.
    features = np.empty((num_pairs, 2))
    for idx, (first, second) in enumerate(zip(first_lines, second_lines)):
        features[idx, 0] = len(set(first).intersection(second))
        features[idx, 1] = editdistance.eval(first, second)
    return features

In [5]:
# Load the dev split; dev is the (first_lines, second_lines, labels) tuple returned by the loader.
dev = load_dev_devtest('data/dev+devtest/dev.tsv')

In [9]:
# Peek at the first tokenized sentence pair.
dev[0][0], dev[1][0]

(['I', 'do', "n't", 'know', 'what', 'you', "'re", 'sayin', "'", '.'],
 ['I', 'do', "n't", 'know', 'what', 'you', "'re", 'talking', 'about', '.'])

In [11]:
# Edit distance for that pair; the inputs are token lists, not raw strings.
editdistance.eval(dev[0][0], dev[1][0])

2

In [14]:
# Distinct tokens that appear in both sentences of the first pair.
set(dev[0][0]).intersection(set(dev[1][0]))

{"'re", '.', 'I', 'do', 'know', "n't", 'what', 'you'}

In [12]:
# Count the overlapping unigrams: the number of distinct tokens shared by
# both sentences (set intersection, so a repeated token counts once).
len(set(dev[0][0]).intersection(set(dev[1][0])))

8

# MLP

In [32]:
# Training data: two features per dev pair (unigram overlap, edit distance) and the dev labels.
X = construct_features(dev[0], dev[1])
y = dev[2]

In [39]:
# Fit an MLP on the dev features; only max_iter is changed from the scikit-learn defaults.
# NOTE(review): no random_state is passed, so the scores recorded below may not reproduce exactly on a re-run.
model = MLPClassifier(max_iter=1000)
model.fit(X, y)

MLPClassifier(max_iter=1000)

In [40]:
# Accuracy on the same dev data the model was fitted on (training accuracy, not a held-out score).
y_preds = model.predict(X)
accuracy_score(y, y_preds)

0.8202933985330073

In [41]:
# Load the devtest split, which the model has not been fitted on.
devtest = load_dev_devtest('data/dev+devtest/devtest.tsv')

In [42]:
# Same two features as for dev, computed on the devtest pairs, plus the devtest labels.
X_devtest = construct_features(devtest[0], devtest[1])
y_devtest = devtest[2]

In [43]:
# Predict devtest labels; this rebinds y_preds, replacing the dev predictions.
y_preds = model.predict(X_devtest)

In [44]:
# Accuracy on devtest. Arguments are passed in scikit-learn's documented
# (y_true, y_pred) order; accuracy is symmetric, so the score is unchanged.
accuracy_score(y_devtest, y_preds)

0.8

In [52]:
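# Persist the fitted model with joblib. The call is commented out; the output
# shown below is presumably from an earlier run in which it was executed.
# dump(model, 'output/mlp.joblib')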

['output/mlp.joblib']

In [45]:
# Load the unlabelled test pairs; load_test skips the id column, so test is (first_lines, second_lines).
test = load_test('data/test_no_labels.tsv')

In [48]:
# Peek at the first tokenized test pair.
test[0][0], test[1][0]

(['I', 'love', 'everybody', '.'],
 ['I', 'love', 'you', 'and', 'I', 'love', 'you', '.'])

In [49]:
# Same two features as used for fitting, computed on the test pairs.
X_test = construct_features(test[0], test[1])

In [53]:
# Predict test labels; this rebinds y_preds, replacing the devtest predictions.
y_preds = model.predict(X_test)

In [56]:
# Build the submission table: one row per test prediction.
# NOTE(review): ids are generated as 0..n-1 rather than read from the id
# column that load_test skips; confirm they match the ids in the test file.
submission = pd.DataFrame({
    'id': range(len(y_preds)),
    'label': y_preds
})

In [58]:
# Preview the first rows of the submission.
submission.head()

Unnamed: 0,id,label
0,0,0
1,1,1
2,2,0
3,3,0
4,4,1


In [59]:
# Write the submission to disk; index=False leaves the DataFrame index out of the CSV.
submission.to_csv('output/submission.csv', index=False)