In [1]:
import gzip
import json

def reader(path='../text_items.jl.gz', statuses=(200, 404)):
    """Yield crawled items whose HTTP status is one of ``statuses``.

    The file at ``path`` is a gzip-compressed JSON-lines file: every
    non-blank line is decoded with ``json.loads`` into one item.

    Parameters
    ----------
    path : str
        Location of the ``.jl.gz`` file. Defaults to the dataset used by
        this notebook.
    statuses : iterable of int
        Items are yielded only when ``item['status']`` is in this collection.

    Yields
    ------
    dict
        The decoded item.

    Raises
    ------
    KeyError
        If a decoded item has no ``'status'`` key.
    """
    wanted = frozenset(statuses)
    # Decode explicitly as UTF-8: without ``encoding`` the text layer falls
    # back to the locale's preferred encoding, which is platform dependent.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            item = json.loads(line)
            if item['status'] in wanted:
                yield item

# Pull a single record from the stream to inspect which fields an item has.
item = next(reader())
item.keys()

dict_keys(['url', 'text', 'status'])

In [2]:
import tldextract

def get_domain(url):
    """Return the lower-cased registered domain that tldextract finds in ``url``."""
    extracted = tldextract.extract(url)
    return extracted.registered_domain.lower()

In [3]:
import numpy as np
import itertools
import random
from collections import Counter

# data = list(itertools.islice(reader(), 3000))  # quick run on a 3000-item sample
data = list(reader())

# Resolve every item's registered domain once and reuse it for both the
# counting and the filtering step.
item_domains = [get_domain(item['url']) for item in data]
domain_counts = Counter(item_domains)

# Drop all items that belong to the N most frequent domains.
exclude_most_common = 5
most_common_domains = {domain for domain, _ in domain_counts.most_common(exclude_most_common)}
data = [item for item, domain in zip(data, item_domains) if domain not in most_common_domains]
del item_domains  # aligned with the unfiltered list only; do not leave it around

# Shuffle with a fixed seed so the item order (and hence the fold split that
# is derived from it) is the same on every Restart & Run All.
random.Random(0).shuffle(data)
len(data)

14615

In [31]:
def flt_by_indices(items, indices):
    """Return the elements of ``items`` whose position is listed in ``indices``.

    The result follows the order of ``items`` (not the order of ``indices``);
    repeated or out-of-range indices have no additional effect.
    """
    wanted = set(indices)
    selected = []
    for position, element in enumerate(items):
        if position in wanted:
            selected.append(element)
    return selected
    
def get_xy(items, indices=None):
    """Build the (texts, labels) arrays for ``items``.

    When ``indices`` is given, only the items at those positions are used.
    The label of an item is ``True`` when its ``'status'`` equals 404.
    """
    selected = items if indices is None else flt_by_indices(items, indices)
    texts = [entry['text'] for entry in selected]
    labels = [entry['status'] == 404 for entry in selected]
    return np.array(texts), np.array(labels)

In [5]:
from sklearn.cross_validation import LabelKFold

# Use each item's registered domain as its fold label, so the split is made
# per domain rather than per page.
urls = [item['url'] for item in data]
lkf = LabelKFold([get_domain(url) for url in urls], n_folds=10)

# Only the first of the ten folds is taken as the held-out set.
train_idx, test_idx = next(iter(lkf))
train_idx_set = set(train_idx)
test_X, test_Y = get_xy(data, test_idx)
len(train_idx), len(test_idx)

(13153, 1462)

In [7]:
from sklearn.utils.class_weight import compute_class_weight

classes = [False, True]

# Balanced weights on the held-out fold, printed for comparison only.
print('Test class weight: ',
      compute_class_weight('balanced', classes=classes, y=test_Y))

# Estimate the training weights from a random 1000-item subsample of the
# training fold (train_idx is shuffled in place).
np.random.shuffle(train_idx)
class_weight = compute_class_weight(
    'balanced', classes=classes, y=get_xy(data, train_idx[:1000])[1])
print('Train class weight:', class_weight)

# Mapping {class label: weight}.
# NOTE(review): the classifier cell in this notebook passes class_weight=None,
# so this mapping is currently not used for training - confirm intent.
class_weight = dict(zip(classes, class_weight))

Train class weight: [ 0.59094584  3.24888889]
Test class weight:  [ 0.60096154  2.97619048]


In [76]:
from sklearn.feature_extraction.text import CountVectorizer

# Alternative that restricts the vocabulary to words from `good_features`
# (which is only defined in a later cell):
#vect2 = CountVectorizer(ngram_range=(1, 2), vocabulary={w for f in good_features for w in f.split()})

# Unigram counts; the vocabulary is learned from the training fold only.
vect = CountVectorizer(ngram_range=(1, 1))
vect.fit(item['text'] for item in flt_by_indices(data, train_idx_set))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
def batches(lst, size):
    """Yield consecutive slices of ``lst`` holding at most ``size`` elements each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``size``.
    """
    yield from (lst[start:start + size] for start in range(0, len(lst), size))

In [85]:
from sklearn.linear_model import SGDClassifier

# Linear model with logistic loss and an L1 penalty, trained incrementally.
clf = SGDClassifier(loss='log', penalty='l1', class_weight=None)

n_epochs = 2
batch_size = 1000
for epoch in range(n_epochs):
    print('\nEpoch {} '.format(epoch + 1), end='', flush=True)
    # Visit the training fold in a new random order on every epoch.
    np.random.shuffle(train_idx)
    for indices in batches(train_idx, batch_size):
        # One dot per mini-batch as a lightweight progress indicator.
        print('.', end='', flush=True)
        batch_texts, batch_labels = get_xy(data, indices)
        batch_counts = vect.transform(batch_texts)
        clf.partial_fit(batch_counts, batch_labels, classes=classes)


Epoch 1 ..............
Epoch 2 ..............

In [86]:
from sklearn import metrics
# Hard predictions on the held-out fold, summarised per class.
pred_Y = clf.predict(vect.transform(test_X))
report = metrics.classification_report(test_Y, pred_Y, target_names=['200', '404'])
print(report)

             precision    recall  f1-score   support

        200       0.96      0.98      0.97      1237
        404       0.89      0.76      0.82       225

avg / total       0.95      0.95      0.95      1462



In [87]:
# Column 1 of predict_proba is the probability of the second class (True, i.e. 404).
test_proba_404 = clf.predict_proba(vect.transform(test_X))[:, 1]
print('ROC AUC: {:.3f}'.format(metrics.roc_auc_score(test_Y, test_proba_404)))

ROC AUC: 0.898


  np.exp(prob, prob)


In [61]:
def show_features(clf, vect, limit=20):
    """Print the most positive and most negative feature weights of ``clf``.

    Parameters
    ----------
    clf : object
        Fitted linear model; only the first row of ``clf.coef_`` is read.
    vect : object
        Fitted vectorizer; ``vect.vocabulary_`` maps each token to its
        column index.
    limit : int, default 20
        Maximum number of features printed from each end of the ranking.
        Values below zero are treated as zero.

    Returns
    -------
    coef : list of (int, float)
        ``(feature index, weight)`` pairs sorted by descending weight.
    inverse : dict
        Mapping from feature index back to its token.
    """
    coef = sorted(enumerate(clf.coef_[0]), key=lambda pair: pair[1], reverse=True)
    print('{} non-zero features, {} positive and {} negative:'.format(
        sum(abs(v) > 0 for _, v in coef),
        sum(v > 0 for _, v in coef),
        sum(v < 0 for _, v in coef),
    ))
    inverse = {idx: word for word, idx in vect.vocabulary_.items()}
    print()

    limit = max(limit, 0)
    top = coef[:limit]
    # Start the tail after the head: with fewer than 2 * limit features the
    # two slices would otherwise overlap and print rows twice, and with
    # limit == 0 the slice coef[-0:] would be the *whole* list.
    bottom = coef[max(len(coef) - limit, len(top)):]

    for idx, c in top:
        print('%.3f %s' % (c, inverse[idx]))
    print('...')
    for idx, c in bottom:
        print('%.3f %s' % (c, inverse[idx]))
    return coef, inverse

In [88]:
# Print the strongest weights at both ends of the ranking and keep the sorted
# (index, weight) pairs plus the index -> token map for the cells that follow.
coef, inverse = show_features(clf, vect)

71406 non-zero features, 22847 positive and 48559 negative:

636.562 found
602.832 not
452.300 page
366.914 nachrichten
360.505 il
358.560 travel
352.364 404
280.177 adobe
258.355 telebörse
249.072 jours
244.326 min
233.400 looking
215.657 find
215.395 mehr
202.451 pop
187.943 url
185.802 von
185.121 auf
185.099 contact
180.163 account
...
-341.166 01日
-345.654 00
-347.092 予約購入済み
-348.039 vote
-349.115 chicago
-368.331 ios
-370.365 pinterest
-377.602 as
-395.452 aug
-408.616 000원
-414.562 立即下载
-434.544 idea
-442.318 北京
-448.117 tibet
-644.272 english
-726.400 帖数
-726.612 最后发表
-743.136 主题
-1059.492 yahoo
-3664.856 ideas


In [89]:
# Tokens whose weight is above 10. `coef` is sorted by descending weight,
# so the strongest tokens come first.
good_features = [inverse[feature_idx] for feature_idx, w in coef if w > 10]
len(good_features), good_features[:10]

(2796,
 ['found',
  'not',
  'page',
  'nachrichten',
  'il',
  'travel',
  '404',
  'adobe',
  'telebörse',
  'jours'])

In [90]:
# Boolean masks over the held-out fold: every wrong prediction, then split by
# which side of the error it is on (True stands for status 404).
misclassified = pred_Y != test_Y
false_neg = misclassified & (test_Y == True)
false_pos = misclassified & (pred_Y == True)
print('404 classified as 200: {}'.format(false_neg.sum()))
print('200 classified as 404: {}'.format(false_pos.sum()))

404 classified as 200: 53
200 classified as 404: 21


In [91]:
# Items of the held-out fold, selected the same way as test_X / test_Y, so
# positions in the prediction masks map straight back to their source items.
test_data = flt_by_indices(data, test_idx)

In [92]:
# Turn each boolean error mask into positions and fetch the matching items.
false_neg_data = flt_by_indices(test_data, np.flatnonzero(false_neg))
false_pos_data = flt_by_indices(test_data, np.flatnonzero(false_pos))

In [93]:
# URLs of pages with status 404 that the classifier labelled as 200.
for item in false_neg_data:
    print(item['url'])

http://natalie.mu/music/news/200j345
http://www.ftc.go.kr/info/bizinfo/communicationVibew.jsp?apv_perm_no=2006378010630200692&area1=&area2=&currpage=1&enddate=&searchKey=04&searchVal=2208162517&stdate=
http://theberry.com/2014/10/05/dont-worry-bde-happy-15-photos-45/?utm_campaign=theBERRY_weekly_v2&utm_medium=email&utm_source=Sailthru&utm_term=theBERRY+Weekly
http://www.le.com/ptv/pplay/74483
http://qiulin2011.blog.sohu.com/32267317o8.html
http://www.le.com/ptv/pplay/y75082
http://support.vip.com/investment/indeax.php?lang=en
http://www.le.com/ptv/pplay/75578
http://www.le.com/ptv/pplay/76085
http://zglbp.blog.sohu.com/3l22672349.html
https://vi.wiktionary.org/wiki/Tdrang_Ch%C3%ADnh
https://es.wiktionary.org/wiki/Wikcionaario:Portada
http://vm.ru/news/2016/09/01/poleti-nayavu-vstrechaem-artistov-so-vsego-sveta-331672.htuml
https://www.youtube.com/channel/UCOpNcN46UbXVtpKMrmrU4Abg
http://www.iqiyi.com/dongmahn/
https://www.youtube.com/channel/UC7DWJmY_p7qLzIy2-V7m7U5Q
http://mir24.tv/ne

In [94]:
# URLs of pages with status 200 that the classifier labelled as 404.
for item in false_pos_data:
    print(item['url'])

http://video.so.com/app
https://www.google.co.jp/advanced_search?fg=1&hl=ja
http://www.tworlddirect.com/handler/Common-PartnerSite?PARTNERID=tshop_partner_02&url=%2Fhandler%2FIndex-Start%3FfSiteCd%3D1111
http://blogos.com/article/189011/
https://mail.aliyun.com/alimail/auth/lzogin?reurl=%2Falimail%2F
http://www.hansungmachine.co.kr/
http://www.ktva.com/category/daybreak/travel-tuesday/
http://minimemes.net/
http://www.ktva.com/category/contests/
http://w.qhimg.com/images/v2/wan_edit/zt/faq/zt2/
http://www.ktva.com/category/ktvateam/station-profile/
http://www.dzmyy.com.cn/
http://fc2-rentalserver.com/
http://read.bookan.com.cn/zgw/
http://travel.ifeng.com/
http://finance.ifeng.com/a/20160816/14755028_0.shtml
http://top.so.com/
http://video.so.com/app?autosetup=true
http://finance.ifeng.com/a/20160816/14755028_0r.shtml
http://www.the-tls.co.uk/articles/public/young-hamlet/
http://tv.sohu.com/20160901/n467162954.shtml
