# Tune a combination of word similar features on Dev
number of overlapping unigrams, edit distance

In [55]:
import editdistance
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from joblib import dump, load
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [71]:
def load_dev_devtest(path):
    """
    calls word_tokenize
    returns first_lines, second_lines, labels
    """
    first_lines = []
    second_lines = []
    labels = []
    with open(path, 'rt') as f:
        for line in f:
            chunks = line.strip().split('\t')
            first = word_tokenize(chunks[0])
            second = word_tokenize(chunks[1])
            label = int(chunks[2])
            first_lines.append(first)
            second_lines.append(second)
            labels.append(label)
    return first_lines, second_lines, labels

def load_test(path):
    """
    skip id column, calls word_tokenize
    """
    first_lines, second_lines = [], []
    with open(path, 'rt') as f:
        for line in f:
            chunks = line.strip().split('\t')
            first = word_tokenize(chunks[1])
            second = word_tokenize(chunks[2])
            first_lines.append(first)
            second_lines.append(second)
    return first_lines, second_lines

def load_test_hard(path):
    """
    skip id column, calls word_tokenize
    """
    first_lines, second_lines = [], []
    with open(path, 'rt') as f:
        for line in f:
            chunks = line.strip().split('\t')
            first = word_tokenize(chunks[0])
            second = word_tokenize(chunks[1])
            first_lines.append(first)
            second_lines.append(second)
    return first_lines, second_lines

In [31]:
def construct_features(first_lines, second_lines):
    # number of overlapping unigrams, edit distance
    features = np.empty((len(first_lines), 2))
    for idx, tup in enumerate(zip(first_lines, second_lines)):
        first, second = tup
        num_unigrams = len(set(first).intersection(second))
        edit_dist = editdistance.eval(first, second)
        features[idx, 0] = num_unigrams
        features[idx, 1] = edit_dist
    return features

In [5]:
dev = load_dev_devtest('data/dev+devtest/dev.tsv')

In [9]:
dev[0][0], dev[1][0]

(['I', 'do', "n't", 'know', 'what', 'you', "'re", 'sayin', "'", '.'],
 ['I', 'do', "n't", 'know', 'what', 'you', "'re", 'talking', 'about', '.'])

In [11]:
editdistance.eval(dev[0][0], dev[1][0])

2

In [14]:
set(dev[0][0]).intersection(set(dev[1][0]))

{"'re", '.', 'I', 'do', 'know', "n't", 'what', 'you'}

In [12]:
# count number of overlapping unigrams
len(set(dev[0][0]).intersection(set(dev[1][0])))

8

# MLP

In [32]:
X = construct_features(dev[0], dev[1])
y = dev[2]

In [39]:
model = MLPClassifier(max_iter=1000)
model.fit(X, y)

MLPClassifier(max_iter=1000)

In [40]:
y_preds = model.predict(X)
accuracy_score(y, y_preds)

0.8202933985330073

In [41]:
devtest = load_dev_devtest('data/dev+devtest/devtest.tsv')

In [42]:
X_devtest = construct_features(devtest[0], devtest[1])
y_devtest = devtest[2]

In [43]:
y_preds = model.predict(X_devtest)

In [44]:
accuracy_score(y_preds, y_devtest)

0.8

In [52]:
# dump(model, 'output/mlp.joblib') 

['output/mlp.joblib']

In [45]:
test = load_test('data/test_no_labels.tsv')

In [48]:
test[0][0], test[1][0]

(['I', 'love', 'everybody', '.'],
 ['I', 'love', 'you', 'and', 'I', 'love', 'you', '.'])

In [49]:
X_test = construct_features(test[0], test[1])

In [53]:
y_preds = model.predict(X_test)

In [60]:
submission = pd.DataFrame({
    'id': range(len(y_preds)),
    'Category': y_preds
})

In [61]:
submission.head()

Unnamed: 0,id,Category
0,0,0
1,1,1
2,2,0
3,3,0
4,4,1


In [62]:
# submission.to_csv('output/submission.csv', index=False)

In [63]:
devhard = load_dev_devtest('data/heldout-hard/dev.hard.tsv')

In [65]:
X = construct_features(devhard[0], devhard[1])
y = devhard[2]
model = MLPClassifier(max_iter=1000)
model.fit(X, y)
y_preds = model.predict(X)
accuracy_score(y, y_preds)

0.671

In [66]:
devtest = load_dev_devtest('data/heldout-hard/devtest.hard.tsv')
X_devtest = construct_features(devtest[0], devtest[1])
y_devtest = devtest[2]
y_preds = model.predict(X_devtest)
accuracy_score(y_preds, y_devtest)

0.655

In [69]:
!head data/heldout-hard/test.hard.tsv

The film is about Rafael , a Malayalam singer in new film industry .	This film is about Rafael , a new singer in the Malayalam film industry .
Dave Tomar is the pseudonym of Ed Dante , a graduate of Rutgers , now a freelance author in Philadelphia .	Ed Dante is the pseudonym of Dave Tomar , a graduate of Rutgers , who now lives as a freelance writer in Philadelphia .
A UK tour started in May 1983 , featuring live guitarist Robin George for additional performances .	In May 1983 , a UK tour with the additional guitarist Robin George started for live performances .
Although the Rugby - Union in Slovenia was the main centre for sport in the former Yugoslavia - a lot of rugby was still being played in Croatia .	Although rugby union in Slovenia was the main centre for the sport in the former Yugoslavia , there was still quite a bit of rugby played in Croatia .
For several generations , the Barons de Longueuil have not lived in Canada , but in Scotland and France , and in the 1970s in Luz

In [70]:
with open('data/heldout-hard/test.hard.tsv', 'rt') as f:
    for line in f:
        assert(len(line.strip().split('\t')) == 2)

In [72]:
test = load_test_hard('data/heldout-hard/test.hard.tsv')
X_test = construct_features(test[0], test[1])
y_preds = model.predict(X_test)
submission = pd.DataFrame({
    'id': range(len(y_preds)),
    'Category': y_preds
})

In [73]:
submission.head()

Unnamed: 0,id,Category
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [74]:
submission.Category.value_counts()

0    2273
1     727
Name: Category, dtype: int64

In [75]:
submission.to_csv('output/submission-hard.csv', index=False)

In [76]:
dump(model, 'output/mlp-hard.joblib') 

['output/mlp-hard.joblib']