In [33]:
import struct
import pickle
        
def reader(indices=None):
    """Yield page items stored in ``../text_items.pkls``.

    The file is read as a sequence of records, each a 4-byte native ``int``
    length (``struct`` format ``'i'``) followed by that many bytes of pickled
    data. Every unpickled item gets an ``'idx'`` key holding its zero-based
    record number, and only items whose ``'status'`` is 200 or 404 are yielded.

    Parameters
    ----------
    indices : iterable of int, optional
        Record numbers to load. Records not listed are skipped without being
        unpickled. ``None`` (the default) loads every record.

    Yields
    ------
    dict
        The unpickled item with the added ``'idx'`` key.
    """
    last_wanted = None
    if indices is not None:
        indices = set(indices)
        # Nothing past the highest requested record can be yielded, so the
        # scan stops there instead of walking the rest of the file.
        last_wanted = max(indices) if indices else -1
    with open('../text_items.pkls', 'rb') as f:
        idx = 0
        while last_wanted is None or idx <= last_wanted:
            size_data = f.read(4)
            if not size_data:
                break
            size, = struct.unpack('i', size_data)
            if indices is None or idx in indices:
                # NOTE: pickle.loads can execute arbitrary code; only use this
                # reader on a data file from a trusted source.
                item = pickle.loads(f.read(size))
                item['idx'] = idx
                if item['status'] in {200, 404}:
                    yield item
            else:
                # Skip the payload: seek relative to the current position
                # (whence=1).
                f.seek(size, 1)
            idx += 1

# Peek at the first record yielded by reader() (i.e. the first one that passes
# its status filter) to see which fields an item carries.
item = next(reader())
item.keys()

dict_keys(['blocks', 'idx', 'status', 'url', 'title', 'text'])

In [3]:
import tldextract

def get_domain(url):
    """Return the lower-cased ``registered_domain`` that tldextract reports for ``url``."""
    extracted = tldextract.extract(url)
    registered = extracted.registered_domain
    return registered.lower()

In [36]:
import numpy as np
import itertools
import random
from collections import Counter

# Number of pages each registered domain contributes to the dataset.
domain_counts = Counter(get_domain(item['url']) for item in reader())
# TODO - exclude by count, not a fixed number
exclude_most_common = 5
_top_domains = domain_counts.most_common(exclude_most_common)
print(_top_domains)
most_common_domains = {domain for domain, _ in _top_domains}


def data(*args, **kwargs):
    """Iterate over ``reader(*args, **kwargs)`` without the dominant domains.

    All arguments are forwarded unchanged to ``reader``. Items whose URL
    belongs to one of ``most_common_domains`` are dropped.
    """
    return (item for item in reader(*args, **kwargs)
            if get_domain(item['url']) not in most_common_domains)

[('msn.com', 2382), ('microsoftstore.com', 1080), ('jia360.com', 500), ('nuomi.com', 387), ('tradedoubler.com', 362)]


In [19]:
def get_xy(items, only_ys=False):
    """Build classifier inputs and labels from an iterable of items.

    Parameters
    ----------
    items : iterable of dict
        Each item must have a ``'status'`` key, and a ``'text'`` key unless
        ``only_ys`` is true.
    only_ys : bool, optional
        When true, the texts are not collected and only the labels are
        returned.

    Returns
    -------
    ys : numpy.ndarray of bool
        ``True`` where the item's status equals 404. Returned alone when
        ``only_ys`` is true.
    (xs, ys) : tuple
        ``xs`` is the list of item texts, in the same order as ``ys``.
        Returned when ``only_ys`` is false.
    """
    xs = []
    ys = []
    for item in items:
        if not only_ys:
            xs.append(item['text'])
        ys.append(item['status'] == 404)
    # Force a boolean dtype: np.array([]) would otherwise be float64, giving an
    # empty batch a different label dtype from a non-empty one.
    ys = np.array(ys, dtype=bool)
    return ys if only_ys else (xs, ys)

In [39]:
from sklearn.cross_validation import LabelKFold

# (record index in the data file, url) for every item kept by data(); the
# position in this list is what the cross-validation split refers to.
urls = [(item['idx'], item['url']) for item in data()]
def to_data_idx(indices):
    """Translate positions in ``urls`` into the data-file indices stored there.

    The result follows the order of ``urls``. An ``AssertionError`` is raised
    if any requested position has no entry in ``urls``.
    """
    wanted = set(indices)
    selected = []
    for position, (data_idx, _url) in enumerate(urls):
        if position in wanted:
            selected.append(data_idx)
    assert len(selected) == len(wanted)
    return selected

# Each item's registered domain is its label for LabelKFold (per the sklearn
# docs, the same label never appears in two different folds).
lkf = LabelKFold([get_domain(url) for _, url in urls], n_folds=10)
# Only the first of the 10 splits is used.
_train_idx, _test_idx = next(iter(lkf))
# The split yields positions in `urls`; convert them to data-file record
# indices, which is what data() / reader() accept.
train_idx, test_idx = to_data_idx(_train_idx), to_data_idx(_test_idx)
test_X, test_Y = get_xy(data(test_idx))

In [41]:
from sklearn.utils.class_weight import compute_class_weight

# Label order used throughout: False = status 200, True = status 404.
classes = [False, True]
print('Test class weight:', compute_class_weight('balanced', classes, test_Y))
# Shuffles train_idx in place; no seed is set in the visible cells, so the
# 1000-item sample below can differ from run to run.
np.random.shuffle(train_idx)
# Class weights are estimated from the first 1000 shuffled training items only.
class_weight = compute_class_weight('balanced', classes, get_xy(data(train_idx[:1000]), only_ys=True))
print('Train class weight: ', class_weight)
# Rebinds class_weight from the weight array to a {class: weight} dict.
class_weight = dict(zip(classes, class_weight))

Test class weight: [ 0.57600636  3.78919861]
Train class weight:  [ 0.58479532  3.44827586]


In [42]:
from sklearn.feature_extraction.text import CountVectorizer

#vect2 = CountVectorizer(ngram_range=(1, 2), vocabulary={w for f in good_features for w in f.split()})
# Unigram counts; the vectorizer is fitted on the text of the training items only.
vect = CountVectorizer(ngram_range=(1, 1))
vect.fit(item['text'] for item in data(train_idx))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
def batches(lst, size):
    """Yield consecutive slices of ``lst`` holding at most ``size`` elements each."""
    starts = range(0, len(lst), size)
    yield from (lst[start:start + size] for start in starts)

In [43]:
from sklearn.linear_model import SGDClassifier

# Logistic-loss SGD with an L1 penalty. class_weight=None means the
# class_weight dict computed in an earlier cell is not used by this model.
clf = SGDClassifier(loss='log', class_weight=None, penalty='l1')

n_epochs = 2
batch_size = 5000
for epoch in range(n_epochs):
    print('\nEpoch {} '.format(epoch + 1), end='', flush=True)
    # Reshuffle the training indices in place before every pass.
    np.random.shuffle(train_idx)
    for indices in batches(train_idx, batch_size):
        print('.', end='', flush=True)
        # Load just this batch's items; data() passes the indices on to reader().
        _x, _y = get_xy(data(indices))
        # The full label list is passed on every call (sklearn requires it on
        # the first partial_fit call).
        clf.partial_fit(vect.transform(_x), _y, classes=classes)


Epoch 1 ....
Epoch 2 ....

In [44]:
from sklearn import metrics
# Hard predictions for the held-out fold, reported per class with the two
# labels named '200' and '404'.
pred_Y = clf.predict(vect.transform(test_X))
print(metrics.classification_report(test_Y, pred_Y, target_names=['200', '404']))

             precision    recall  f1-score   support

        200       0.95      0.98      0.97      1888
        404       0.83      0.69      0.76       287

avg / total       0.94      0.94      0.94      2175



In [45]:
# ROC AUC on the held-out fold, scored with column 1 of predict_proba
# (presumably the True / 404 class, given classes = [False, True]).
print('ROC AUC: {:.3f}'.format(metrics.roc_auc_score(test_Y, clf.predict_proba(vect.transform(test_X))[:,1])))

ROC AUC: 0.839


  np.exp(prob, prob)


In [14]:
def show_features(clf, vect, limit=20):
    """Print the most positive and most negative feature weights of a model.

    Parameters
    ----------
    clf : object
        Fitted model; the weights are taken from the first row of
        ``clf.coef_``.
    vect : object
        Fitted vectorizer; ``vect.vocabulary_`` is used as a mapping from
        word to feature index.
    limit : int, optional
        How many features to print from each end of the weight-sorted list.
        ``0`` prints no features.

    Returns
    -------
    coef : list of (int, float)
        ``(feature index, weight)`` pairs sorted by weight, highest first.
    inverse : dict
        Mapping from feature index to word.
    """
    coef = list(enumerate(clf.coef_[0]))
    coef.sort(key=lambda x: x[1], reverse=True)
    print('{} non-zero features, {} positive and {} negative:'.format(
            sum(abs(v) > 0 for _, v in coef),
            sum(v > 0 for _, v in coef),
            sum(v < 0 for _, v in coef),
        ))
    inverse = {idx: word for word, idx in vect.vocabulary_.items()}
    print()
    for idx, c in coef[:limit]:
        print('%.3f %s' % (c, inverse[idx]))
    print('...')
    # coef[-0:] is the whole list, so limit == 0 is handled explicitly to
    # avoid printing every feature.
    tail = coef[-limit:] if limit != 0 else []
    for idx, c in tail:
        print('%.3f %s' % (c, inverse[idx]))
    return coef, inverse

In [None]:
# Print the strongest features and keep the sorted (index, weight) list and
# the index -> word map for the cells below.
coef, inverse = show_features(clf, vect)

In [None]:
# Words whose learned weight is above 10 (hard-coded threshold).
good_features = [inverse[idx] for idx, weight in coef if weight > 10]
len(good_features), good_features[:10]

In [None]:
# Boolean masks over the test set for the two kinds of error:
# false_neg - actual 404 predicted as 200; false_pos - predicted 404 but actually 200.
false_neg = (pred_Y != test_Y) & (test_Y == True)
false_pos = (pred_Y != test_Y) & (pred_Y == True)
print('404 classified as 200: {}'.format(false_neg.sum()))
print('200 classified as 404: {}'.format(false_pos.sum()))

In [None]:
# URLs of the test items, read again through data(test_idx) (the same call
# that produced test_X / test_Y) so the error masks can index them.
test_urls = np.array([item['url'] for item in data(test_idx)])

In [None]:
# URLs of 404 pages that the classifier labelled as 200.
for url in test_urls[false_neg]:
    print(url)

In [None]:
# URLs of 200 pages that the classifier labelled as 404.
for url in test_urls[false_pos]:
    print(url)