In [2]:
import gzip
import json

def reader():
    with gzip.open('../text_items.jl.gz', 'rt') as f:
        for line in f:
            item = json.loads(line)
            if item['status'] in {200, 404}:
                yield item

item = next(reader())
item.keys()

dict_keys(['url', 'text', 'status'])

In [3]:
import tldextract

def get_domain(url):
    return tldextract.extract(url).registered_domain.lower()

import numpy as np
import itertools
import random
from collections import Counter

# data = list(itertools.islice(reader(), 3000))
data = list(reader())

domain_counts = Counter(get_domain(item['url']) for item in data)
exclude_most_common = 5
most_common_domains = {domain for domain, _ in domain_counts.most_common(exclude_most_common)}
data = [item for item in data if get_domain(item['url']) not in most_common_domains]

random.shuffle(data)
len(data)

14615

In [4]:
from sklearn.cross_validation import LabelKFold

urls = [item['url'] for item in data]
lkf = LabelKFold(list(map(get_domain, urls)), n_folds=10)

train_idx, test_idx = next(iter(lkf))

In [13]:
def flt_by_indices(items, indices):
    indices = set(indices)
    return [item for idx, item in enumerate(items) if idx in indices]

def write_dataset(filename, data, indices):
    indices = set(indices)
    with open(filename, 'w') as f:
        for item in flt_by_indices(data, indices):
            f.write('__label__{} '.format(item['status']))
            f.write(item['text'])
            f.write('\n')

In [5]:
write_dataset('train.txt', data, train_idx)
write_dataset('test.txt', data, test_idx)

In [12]:
import fasttext
clf = fasttext.supervised('train.txt', 'model')

In [14]:
result = clf.test('test.txt')
print('P@1: {:.2f}'.format(result.precision))
print('R@1: {:.2f}'.format(result.recall))
print('Number of examples: {}'.format(result.nexamples))

P@1: 0.94
R@1: 0.94
Number of examples: 1420


In [29]:
from sklearn import metrics
test_data = flt_by_indices(data, test_idx)
pred_Y = np.array([
        label == ['404'] for label in clf.predict([
            item['text'] for item in test_data])])
test_Y = np.array([item['status'] == 404 for item in test_data])
print(metrics.classification_report(test_Y, pred_Y, target_names=['200', '404']))

             precision    recall  f1-score   support

        200       0.94      1.00      0.97      1237
        404       0.99      0.64      0.78       225

avg / total       0.95      0.94      0.94      1462

