In [21]:
from soft404.utils import pickle_stream_reader
        
def reader(indices=None):
    with open('../text_items_small.pkls', 'rb') as f:
        for idx, item in pickle_stream_reader(f, indices):
            item['idx'] = idx
            if item['status'] in {200, 404}:
                yield item
        
item = next(reader())
item.keys()

dict_keys(['text', 'blocks', 'lang', 'url', 'idx', 'title', 'status'])

In [22]:
import tldextract

def get_domain(url):
    return tldextract.extract(url).registered_domain.lower()

In [23]:
import numpy as np
import itertools
import random
from collections import Counter

domain_counts = Counter(get_domain(item['url']) for item in reader())
# TODO - exclude by count, not a fixed number
exclude_most_common = 5
exclude_domains = {domain for domain, _ in domain_counts.most_common(exclude_most_common)}
domain_indices = {item['idx'] for item in reader()
                  if get_domain(item['url']) not in exclude_domains}
domain_counts.most_common(10)

[('msn.com', 2382),
 ('microsoftstore.com', 1080),
 ('jia360.com', 500),
 ('nuomi.com', 387),
 ('tradedoubler.com', 362),
 ('stackoverflow.com', 360),
 ('weibo.com', 312),
 ('ebay.com', 293),
 ('fc2.com', 279),
 ('amazon.in', 278)]

In [24]:
langs = [(item['idx'], item['lang']) for item in reader()]
for lang, count in Counter(lang for _, lang in langs).most_common(10):
    print('{:>10} {}'.format(count, lang))
lang_indices = {idx for idx, lang in langs if lang == 'en'}

     14703 en
      5114 zh-cn
      1620 ja
      1289 ko
       558 ru
       450 
       419 es
       411 fr
       399 it
       367 de


In [25]:
flt_indices = lang_indices & domain_indices
def data(indices=None):
    if indices is not None:
        indices = set(indices) & flt_indices
    else:
        indices = flt_indices
    yield from reader(indices)

In [26]:
def get_xy(items, only_ys=False):
    xs = []
    ys = []
    for item in items:
        if not only_ys:
            xs.append(item['text'])
        ys.append(item['status'] == 404)
    ys = np.array(ys)
    return ys if only_ys else (xs, ys)

In [27]:
from sklearn.cross_validation import LabelKFold

urls = [(item['idx'], item['url']) for item in data()]
def to_data_idx(indices):
    indices = set(indices)
    result = [data_idx for idx, (data_idx, _) in enumerate(urls) if idx in indices]
    assert len(result) == len(indices)
    return result

lkf = LabelKFold([get_domain(url) for _, url in urls], n_folds=10)
_train_idx, _test_idx = next(iter(lkf))
train_idx, test_idx = to_data_idx(_train_idx), to_data_idx(_test_idx)
test_X, test_Y = get_xy(data(test_idx))
print(len(train_idx), len(test_idx))

9789 1088


In [28]:
Counter(get_domain(item['url']) for item in data(train_idx)).most_common(10)

[('ebay.com', 293),
 ('amazon.in', 278),
 ('fc2.com', 252),
 ('netflix.com', 196),
 ('goo.gl', 187),
 ('ebay.in', 175),
 ('ebay.co.uk', 164),
 ('ebay.ca', 160),
 ('ebay.com.my', 159),
 ('ebay.com.au', 155)]

In [29]:
Counter(get_domain(item['url']) for item in data(test_idx)).most_common(10)

[('stackoverflow.com', 360),
 ('ebay.com.hk', 76),
 ('ebayinc.com', 60),
 ('wonderwall.com', 49),
 ('amazon.ca', 40),
 ('arstechnica.com', 33),
 ('reuters.com', 29),
 ('businessinsider.com', 26),
 ('delish.com', 24),
 ('mozilla.org', 22)]

In [30]:
from sklearn.utils.class_weight import compute_class_weight

classes = [False, True]
print('Test class weight:', compute_class_weight('balanced', classes, test_Y))
np.random.shuffle(train_idx)
class_weight = compute_class_weight('balanced', classes, get_xy(data(train_idx[:1000]), only_ys=True))
print('Train class weight: ', class_weight)
class_weight = dict(zip(classes, class_weight))

Test class weight: [ 0.57749469  3.7260274 ]
Train class weight:  [ 0.60459492  2.89017341]


In [31]:
from sklearn.feature_extraction.text import CountVectorizer

#vect2 = CountVectorizer(ngram_range=(1, 2), vocabulary={w for f in good_features for w in f.split()})
vect = CountVectorizer(ngram_range=(1, 1))
vect.fit(item['text'] for item in data(train_idx))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [32]:
from sklearn.linear_model import SGDClassifier
from soft404.utils import batches

clf = SGDClassifier(loss='log', class_weight=None, penalty='l1')

n_epochs = 2
batch_size = 5000
for epoch in range(n_epochs):
    print('\nEpoch {} '.format(epoch + 1), end='', flush=True)
    np.random.shuffle(train_idx)
    for indices in batches(train_idx, batch_size):
        print('.', end='', flush=True)
        _x, _y = get_xy(data(indices))
        clf.partial_fit(vect.transform(_x), _y, classes=classes)


Epoch 1 ..
Epoch 2 ..

In [33]:
from sklearn import metrics
pred_Y = clf.predict(vect.transform(test_X))
print(metrics.classification_report(test_Y, pred_Y, target_names=['200', '404']))

             precision    recall  f1-score   support

        200       0.99      0.95      0.97       942
        404       0.75      0.95      0.84       146

avg / total       0.96      0.95      0.95      1088



In [34]:
print('ROC AUC: {:.3f}'.format(metrics.roc_auc_score(test_Y, clf.predict_proba(vect.transform(test_X))[:,1])))

ROC AUC: 0.951


  np.exp(prob, prob)


In [35]:
for lang, count in Counter(test_langs).most_common(5):
    print('\nLanguage "{}" with {} test samples:'.format(lang, count))
    lang_idx = test_langs == lang
    print(metrics.classification_report(test_Y[lang_idx], pred_Y[lang_idx], target_names=['200', '404']))

NameError: name 'test_langs' is not defined

In [None]:
def show_features(clf, vect, limit=20):
    coef = list(enumerate(clf.coef_[0]))
    coef.sort(key=lambda x: x[1], reverse=True)
    print('{} non-zero features, {} positive and {} negative:'.format(
            sum(abs(v) > 0 for _, v in coef),
            sum(v > 0 for _, v in coef),
            sum(v < 0 for _, v in coef),
        ))
    inverse = {idx: word for word, idx in vect.vocabulary_.items()}
    print()
    for idx, c in coef[:limit]:
        print('%.3f %s' % (c, inverse[idx]))
    print('...')
    for idx, c in coef[-limit:]:
        print('%.3f %s' % (c, inverse[idx]))
    return coef, inverse

In [None]:
coef, inverse = show_features(clf, vect)

In [None]:
good_features = [inverse[idx] for idx, weight in coef if weight > 10]
len(good_features), good_features[:10]

In [None]:
false_neg = (pred_Y != test_Y) & (test_Y == True)
false_pos = (pred_Y != test_Y) & (pred_Y == True)
print('404 classified as 200: {}'.format(false_neg.sum()))
print('200 classified as 404: {}'.format(false_pos.sum()))

In [None]:
test_urls = np.array([item['url'] for item in data(test_idx)])

In [None]:
for url in test_urls[false_neg]:
    print(url)

In [None]:
for url in test_urls[false_pos]:
    print(url)