In [1]:
import gzip
import json

def reader():
    with gzip.open('../text_items.jl.gz', 'rt') as f:
        for line in f:
            item = json.loads(line)
            if item['status'] in {200, 404}:
                yield item

item = next(reader())
item.keys()

dict_keys(['url', 'text', 'status'])

In [2]:
import tldextract

def get_domain(url):
    return tldextract.extract(url).registered_domain.lower()

In [3]:
import numpy as np
import itertools
import random
from collections import Counter

# data = list(itertools.islice(reader(), 3000))
data = list(reader())

domain_counts = Counter(get_domain(item['url']) for item in data)
exclude_most_common = 5
most_common_domains = {domain for domain, _ in domain_counts.most_common(exclude_most_common)}
data = [item for item in data if get_domain(item['url']) not in most_common_domains]

random.shuffle(data)
len(data)

14615

In [4]:
def get_xy(items, indices=None):
    if indices is not None:
        indices = set(indices)
        items = [item for idx, item in enumerate(items) if idx in indices]
    return (np.array([item['text'] for item in items]),
            np.array([item['status'] == 404 for item in items]))

In [5]:
from sklearn.cross_validation import LabelKFold

urls = [item['url'] for item in data]
lkf = LabelKFold(list(map(get_domain, urls)), n_folds=10)

train_idx, test_idx = next(iter(lkf))
train_idx_set = set(train_idx)
test_X, test_Y = get_xy(data, test_idx)
len(train_idx), len(test_idx)

(13153, 1462)

In [7]:
from sklearn.utils.class_weight import compute_class_weight

classes = [False, True]
print('Train class weight:', compute_class_weight('balanced', classes, test_Y))
np.random.shuffle(train_idx)
class_weight = compute_class_weight('balanced', classes, get_xy(data, train_idx[:1000])[1])
print('Test class weight: ', class_weight)
class_weight = dict(zip(classes, class_weight))

Train class weight: [ 0.59094584  3.24888889]
Test class weight:  [ 0.60096154  2.97619048]


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(ngram_range=(1, 1))
vect.fit(item['text'] for idx, item in enumerate(data) if idx in train_idx_set)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
def batches(lst, size):
    for idx in range(0, len(lst), size):
        yield lst[idx:idx + size]

In [15]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='log', class_weight=None, penalty='l1')

n_epochs = 2
batch_size = 1000
for epoch in range(n_epochs):
    print('\nEpoch {} '.format(epoch + 1), end='', flush=True)
    np.random.shuffle(train_idx)
    for indices in batches(train_idx, batch_size):
        print('.', end='', flush=True)
        _x, _y = get_xy(data, indices)
        clf.partial_fit(vect.transform(_x), _y, classes=classes)


Epoch 1 ..............
Epoch 2 ..............

In [16]:
from sklearn import metrics
print(metrics.classification_report(test_Y, clf.predict(vect.transform(test_X)), target_names=['200', '404']))

             precision    recall  f1-score   support

        200       0.96      0.99      0.97      1237
        404       0.92      0.78      0.85       225

avg / total       0.96      0.96      0.95      1462



In [17]:
print('ROC AUC: {:.3f}'.format(metrics.roc_auc_score(test_Y, clf.predict_proba(vect.transform(test_X))[:,1])))

ROC AUC: 0.905


  np.exp(prob, prob)
