In [25]:
import gzip
import ujson as json

def reader(indices=None):
    if indices is not None:
        indices = set(indices)
    with gzip.open('../text_items.jl.gz', 'rt') as f:
        for idx, line in enumerate(f):
            if indices is None or idx in indices:
                item = json.loads(line)
                item['idx'] = idx
                if item['status'] in {200, 404}:
                    yield item

item = next(reader())
item.keys()

dict_keys(['url', 'blocks', 'status', 'title', 'idx', 'text'])

In [2]:
import tldextract

def get_domain(url):
    return tldextract.extract(url).registered_domain.lower()

In [19]:
import numpy as np
import itertools
import random
from collections import Counter

# data = lambda: itertools.islice(data(), 3000)

domain_counts = Counter(get_domain(item['url']) for item in reader())
# TODO - exclude by count, not a fixed number
exclude_most_common = 5
most_common_domains = {domain for domain, _ in domain_counts.most_common(exclude_most_common)}
data = lambda *args, **kwargs: (item for item in reader(*args, **kwargs)
                                if get_domain(item['url']) not in most_common_domains)

In [24]:
def get_xy(items, only_ys=False):
    xs = []
    ys = []
    for item in items:
        if not only_ys:
            xs.append(item['text'])
        ys.append(item['status'] == 404)
    return np.array(ys) if only_ys else (np.array(xs), np.array(ys))

In [26]:
from sklearn.cross_validation import LabelKFold

urls = [(item['idx'], item['url']) for item in data()]
def to_data_idx(indices):
    indices = set(indices)
    result = [data_idx for idx, (data_idx, _) in enumerate(urls) if idx in indices]
    assert len(result) == len(indices)
    return result

lkf = LabelKFold([get_domain(url) for _, url in urls], n_folds=10)
_train_idx, _test_idx = next(iter(lkf))
train_idx, test_idx = to_data_idx(_train_idx), to_data_idx(_test_idx)

test_X, test_Y = get_xy(data(test_idx))
assert len(test_X) == len(test_idx) == len(_test_idx)

In [27]:
from sklearn.utils.class_weight import compute_class_weight

classes = [False, True]
print('Train class weight:', compute_class_weight('balanced', classes, test_Y))
np.random.shuffle(train_idx)
class_weight = compute_class_weight('balanced', classes, get_xy(data(train_idx[:1000]), only_ys=True))
print('Test class weight: ', class_weight)
class_weight = dict(zip(classes, class_weight))

Train class weight: [ 0.59094584  3.24888889]
Test class weight:  [ 0.58548009  3.42465753]


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

#vect2 = CountVectorizer(ngram_range=(1, 2), vocabulary={w for f in good_features for w in f.split()})
vect = CountVectorizer(ngram_range=(1, 1))
vect.fit(item['text'] for item in data(train_idx))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [16]:
def batches(lst, size):
    for idx in range(0, len(lst), size):
        yield lst[idx:idx + size]

In [31]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='log', class_weight=None, penalty='l1')

n_epochs = 2
batch_size = 1000
for epoch in range(n_epochs):
    print('\nEpoch {} '.format(epoch + 1), end='', flush=True)
    np.random.shuffle(train_idx)
    for indices in batches(train_idx, batch_size):
        print('.', end='', flush=True)
        _x, _y = get_xy(data(indices))
        clf.partial_fit(vect.transform(_x), _y, classes=classes)


Epoch 1 ..............
Epoch 2 ..............

In [32]:
from sklearn import metrics
pred_Y = clf.predict(vect.transform(test_X))
print(metrics.classification_report(test_Y, pred_Y, target_names=['200', '404']))

             precision    recall  f1-score   support

        200       0.94      0.97      0.95      1237
        404       0.80      0.64      0.71       225

avg / total       0.92      0.92      0.92      1462



In [33]:
print('ROC AUC: {:.3f}'.format(metrics.roc_auc_score(test_Y, clf.predict_proba(vect.transform(test_X))[:,1])))

ROC AUC: 0.823


  np.exp(prob, prob)


In [61]:
def show_features(clf, vect, limit=20):
    coef = list(enumerate(clf.coef_[0]))
    coef.sort(key=lambda x: x[1], reverse=True)
    print('{} non-zero features, {} positive and {} negative:'.format(
            sum(abs(v) > 0 for _, v in coef),
            sum(v > 0 for _, v in coef),
            sum(v < 0 for _, v in coef),
        ))
    inverse = {idx: word for word, idx in vect.vocabulary_.items()}
    print()
    for idx, c in coef[:limit]:
        print('%.3f %s' % (c, inverse[idx]))
    print('...')
    for idx, c in coef[-limit:]:
        print('%.3f %s' % (c, inverse[idx]))
    return coef, inverse

In [88]:
coef, inverse = show_features(clf, vect)

71406 non-zero features, 22847 positive and 48559 negative:

636.562 found
602.832 not
452.300 page
366.914 nachrichten
360.505 il
358.560 travel
352.364 404
280.177 adobe
258.355 telebörse
249.072 jours
244.326 min
233.400 looking
215.657 find
215.395 mehr
202.451 pop
187.943 url
185.802 von
185.121 auf
185.099 contact
180.163 account
...
-341.166 01日
-345.654 00
-347.092 予約購入済み
-348.039 vote
-349.115 chicago
-368.331 ios
-370.365 pinterest
-377.602 as
-395.452 aug
-408.616 000원
-414.562 立即下载
-434.544 idea
-442.318 北京
-448.117 tibet
-644.272 english
-726.400 帖数
-726.612 最后发表
-743.136 主题
-1059.492 yahoo
-3664.856 ideas


In [89]:
good_features = [inverse[idx] for idx, weight in coef if weight > 10]
len(good_features), good_features[:10]

(2796,
 ['found',
  'not',
  'page',
  'nachrichten',
  'il',
  'travel',
  '404',
  'adobe',
  'telebörse',
  'jours'])

In [34]:
false_neg = (pred_Y != test_Y) & (test_Y == True)
false_pos = (pred_Y != test_Y) & (pred_Y == True)
print('404 classified as 200: {}'.format(false_neg.sum()))
print('200 classified as 404: {}'.format(false_pos.sum()))

404 classified as 200: 81
200 classified as 404: 37


In [35]:
test_urls = np.array([item['url'] for item in data(test_idx)])

In [40]:
for url in test_urls[false_neg]:
    print(url)

https://www.youtube.com/channel/UCOpNcN46UbXVtpKMrmrU4Abg
https://www.youtube.com/channel/UC7DWJmY_p7qLzIy2-V7m7U5Q
https://www.youtube.com/channel/UCYfdidRxbB8Qhf0Nx7ikoOYw
https://www.youtube.com/channel/UC_8PAD0Qmi6_gpe77S1Atsgg
http://playexam.blog.sohu.com/3u22658591.html
http://guangyuanma.blog.sohu.com/3b22674921.html
http://zglbp.blog.sohu.com/3l22672349.html
http://xiaotao2006.blog.sohu.com/3226n95106.html
http://qiulin2011.blog.sohu.com/32267317o8.html
http://vm.ru/news/2016/09/02/v-rezultate-stolknoveniya-inomarok-na-kutuzovskom-prospekte-pogib-odin-chelovek-331e717.html
https://www.vedomosti.ru/politics/articles/2016/09/02/655492-levada-tsentr-sotsiologiydu
http://metrocosm.com/tangled-web-allfiances/
http://www.cnet.com/products/sony-mdr-1000x/previewe/
http://natalie.mu/music/news/200j345
http://www.walesonline.co.uk/news/wales-news/man-launches-huntk-long-lost-11826651
http://g1.globo.com/jornal-nacional/noticia/2016/09/stf-recebe-acoes-contra-manutencao-dos-direitos-pol

In [41]:
for url in test_urls[false_pos]:
    print(url)

https://www.youtube.com/channel/UCYfdidRxbB8Qhf0Nx7ioOYw
https://www.youtube.com/channel/UC8iNz9uwDGfomRnnKKbOhOQ
http://m.so.com/app
http://video.so.com/app
http://www.so.com/help/lipei.html
http://finance.ifeng.com/a/20160816/14755028_0.shtml
http://finance.ifeng.com/a/20160816/14755028_0r.shtml
http://www.tencent.com/zh-cn/at/pr/2014.shtml
http://funklet.com/
http://www.oricon.co.jp/news/2077789/full/
http://fc2-rentalserver.com/
http://textad.net/
http://g1.globo.com/mundo/noticia/2016/09/explosao-em-plataforma-da-spacex-destruiu-foguete-e-satelite-israelense.html
http://www.purebreak.com.br/noticias/serie-stranger-things-eleven-millie-bobby-brown-pode-nao-voltar-na-2-temporada/40803
http://minimemes.net/
http://www.ktva.com/category/contests/
http://www.tworlddirect.com/handler/Common-PartnerSite?PARTNERID=tshop_partner_02&url=%2Fhandler%2FIndex-Start%3FfSiteCd%3D1111
http://www.ccifc.org/about-us/
http://www.ccifc.org/chambers-activities/
http://www.ccifc.org/membership/
http://w