In [3]:
import struct
import pickle
        
def reader(indices=None, path='../text_items_small.pkls'):
    """Stream items from a length-prefixed pickle file.

    The file is read as a sequence of records, each made of a 4-byte size
    header (``struct`` format ``'i'``, i.e. native byte order) followed by
    that many bytes of pickled payload.

    Args:
        indices: optional iterable of zero-based record positions to load.
            Records at other positions are skipped without being unpickled.
            ``None`` (the default) loads every record.
        path: file to read. Defaults to the dataset path this notebook was
            originally hard-wired to, so existing calls are unaffected.

    Yields:
        The unpickled item with an added ``'idx'`` key holding its record
        position, but only for items whose ``'status'`` is 200 or 404.
    """
    if indices is not None:
        # Set membership keeps the per-record check cheap.
        indices = set(indices)
    with open(path, 'rb') as f:
        idx = 0
        while True:
            size_data = f.read(4)
            if not size_data:
                # Clean end of file: no further size header.
                break
            size, = struct.unpack('i', size_data)
            if indices is None or idx in indices:
                # NOTE(security): pickle.loads can execute arbitrary code;
                # only point this at files you produced yourself.
                item = pickle.loads(f.read(size))
                item['idx'] = idx
                if item['status'] in {200, 404}:
                    yield item
            else:
                # Skip the payload with a relative seek (whence=1 is
                # "from the current position") instead of tell() + seek().
                f.seek(size, 1)
            # idx counts every record, including skipped or filtered ones.
            idx += 1

# Pull the first item the reader yields and display which fields it carries.
item = next(reader())
item.keys()

dict_keys(['url', 'blocks', 'title', 'idx', 'text', 'status'])

In [4]:
import tldextract

def get_domain(url):
    """Return the lower-cased ``registered_domain`` that tldextract reports for `url`."""
    extracted = tldextract.extract(url)
    registered = extracted.registered_domain
    return registered.lower()

In [5]:
import numpy as np
import itertools
import random
from collections import Counter

# data = lambda: itertools.islice(data(), 3000)

# Tally how many items each registered domain contributes.
domain_counts = Counter()
for page in reader():
    domain_counts[get_domain(page['url'])] += 1
# TODO - exclude by count, not a fixed number
print(domain_counts.most_common(5))
exclude_most_common = 5
most_common_domains = set(
    domain for domain, _count in domain_counts.most_common(exclude_most_common)
)


def data(*args, **kwargs):
    """Like ``reader(*args, **kwargs)``, minus items from the most common domains.

    Returns a lazy generator; `most_common_domains` is consulted while iterating.
    """
    return (
        item
        for item in reader(*args, **kwargs)
        if get_domain(item['url']) not in most_common_domains
    )

[('msn.com', 2382), ('microsoftstore.com', 1080), ('jia360.com', 500), ('nuomi.com', 387), ('tradedoubler.com', 362)]


In [6]:
def get_xy(items, only_ys=False):
    """Turn items into texts and boolean "is 404" labels in a single pass.

    Args:
        items: iterable of dicts with a ``'status'`` key (and a ``'text'``
            key unless `only_ys` is set). It is consumed once, so a
            generator is fine.
        only_ys: when true, texts are not collected and only labels are returned.

    Returns:
        ``(texts, labels)``, where `texts` is a list and `labels` a numpy
        array, or just `labels` when `only_ys` is true.
    """
    if only_ys:
        return np.array([entry['status'] == 404 for entry in items])

    texts, labels = [], []
    for entry in items:
        texts.append(entry['text'])
        labels.append(entry['status'] == 404)
    return texts, np.array(labels)

In [7]:
from sklearn.cross_validation import LabelKFold

# (record index in the file, url) for every item that survives the domain filter.
urls = [(entry['idx'], entry['url']) for entry in data()]
def to_data_idx(indices):
    """Translate positions within `urls` into the file record indices stored there.

    The result follows the order of `urls`, not the order of `indices`.
    Asserts that every distinct requested position was found.
    """
    wanted = set(indices)
    found = []
    for position, (data_idx, _url) in enumerate(urls):
        if position in wanted:
            found.append(data_idx)
    assert len(found) == len(wanted)
    return found

# Use each URL's domain as its fold label and keep only the first of the 10 folds.
domain_labels = [get_domain(url) for _, url in urls]
lkf = LabelKFold(domain_labels, n_folds=10)
_train_idx, _test_idx = next(iter(lkf))
# Fold indices are positions in `urls`; convert them to file record indices.
train_idx = to_data_idx(_train_idx)
test_idx = to_data_idx(_test_idx)
test_X, test_Y = get_xy(data(test_idx))

In [8]:
from sklearn.utils.class_weight import compute_class_weight

classes = [False, True]
test_class_weight = compute_class_weight('balanced', classes, test_Y)
print('Test class weight:', test_class_weight)
# Shuffle in place so the first 1000 training indices form a random sample.
np.random.shuffle(train_idx)
train_sample_Y = get_xy(data(train_idx[:1000]), only_ys=True)
class_weight = compute_class_weight('balanced', classes, train_sample_Y)
print('Train class weight: ', class_weight)
# Rebind as a {class: weight} mapping.
class_weight = dict(zip(classes, class_weight))

Test class weight: [ 0.57600636  3.78919861]
Train class weight:  [ 0.58616647  3.40136054]


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

#vect2 = CountVectorizer(ngram_range=(1, 2), vocabulary={w for f in good_features for w in f.split()})
# Unigram vocabulary, fitted on the text of the training items only.
vect = CountVectorizer(ngram_range=(1, 1))
train_texts = (item['text'] for item in data(train_idx))
vect.fit(train_texts)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
def batches(lst, size):
    """Yield consecutive slices of `lst`, each holding at most `size` elements."""
    yield from (lst[start:start + size] for start in range(0, len(lst), size))

In [11]:
from sklearn.linear_model import SGDClassifier

# Logistic-loss linear classifier with an L1 penalty, trained incrementally below.
# NOTE(review): class_weight=None means the `class_weight` dict built in an
# earlier cell is not used here; confirm that is intentional.
clf = SGDClassifier(loss='log', class_weight=None, penalty='l1')

n_epochs = 2
batch_size = 5000
for epoch in range(n_epochs):
    print('\nEpoch {} '.format(epoch + 1), end='', flush=True)
    # Reshuffle in place so each epoch groups the training indices into different batches.
    np.random.shuffle(train_idx)
    for indices in batches(train_idx, batch_size):
        # One dot per batch as a progress indicator.
        print('.', end='', flush=True)
        # Only the current batch's items are loaded.
        _x, _y = get_xy(data(indices))
        # The full label set is passed on every call because a single batch
        # may not contain both classes.
        clf.partial_fit(vect.transform(_x), _y, classes=classes)


Epoch 1 ....
Epoch 2 ....

In [12]:
from sklearn import metrics
# Vectorize the held-out texts, predict, and summarise per-class quality.
test_counts = vect.transform(test_X)
pred_Y = clf.predict(test_counts)
test_report = metrics.classification_report(test_Y, pred_Y, target_names=['200', '404'])
print(test_report)

             precision    recall  f1-score   support

        200       0.97      0.97      0.97      1888
        404       0.78      0.77      0.77       287

avg / total       0.94      0.94      0.94      2175



In [13]:
# Score with the second probability column.
# Presumably that is the True (404) class, given classes = [False, True] - verify via clf.classes_.
test_proba = clf.predict_proba(vect.transform(test_X))[:, 1]
test_roc_auc = metrics.roc_auc_score(test_Y, test_proba)
print('ROC AUC: {:.3f}'.format(test_roc_auc))

ROC AUC: 0.875


  np.exp(prob, prob)


In [28]:
import langdetect

def get_lang(text):
    """Return the language code of the first langdetect candidate for `text`.

    Returns an empty string when langdetect raises ``LangDetectException``.
    """
    try:
        candidates = langdetect.detect_langs(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return ''
    # Presumably candidates are ordered most probable first - verify in the langdetect docs.
    best = candidates[0]
    return best.lang

# Detected language per test document, in the same order as test_X.
test_langs = np.array(list(map(get_lang, test_X)))

In [31]:
# Ten most frequent detected languages in the test set ('' marks detection failures).
Counter(test_langs).most_common(10)

[('en', 1248),
 ('zh-cn', 406),
 ('ja', 127),
 ('ko', 100),
 ('de', 77),
 ('fr', 52),
 ('it', 34),
 ('es', 32),
 ('ru', 30),
 ('', 29)]

In [32]:
# Separate classification report for each of the five most frequent test languages.
top_test_langs = Counter(test_langs).most_common(5)
for lang, count in top_test_langs:
    heading = '\nLanguage "{}" with {} test samples:'.format(lang, count)
    print(heading)
    # Boolean mask selecting the test documents detected as this language.
    lang_idx = (test_langs == lang)
    lang_report = metrics.classification_report(
        test_Y[lang_idx], pred_Y[lang_idx], target_names=['200', '404'])
    print(lang_report)


Language "en" with 1248 test samples:
             precision    recall  f1-score   support

        200       0.97      0.97      0.97      1037
        404       0.85      0.85      0.85       211

avg / total       0.95      0.95      0.95      1248


Language "zh-cn" with 406 test samples:
             precision    recall  f1-score   support

        200       0.99      0.99      0.99       394
        404       0.67      0.83      0.74        12

avg / total       0.99      0.98      0.98       406


Language "ja" with 127 test samples:
             precision    recall  f1-score   support

        200       0.89      0.92      0.90       110
        404       0.31      0.24      0.27        17

avg / total       0.81      0.83      0.82       127


Language "ko" with 100 test samples:
             precision    recall  f1-score   support

        200       0.89      0.95      0.92        85
        404       0.56      0.33      0.42        15

avg / total       0.84      0.86      

  'precision', 'predicted', average, warn_for)


In [14]:
def show_features(clf, vect, limit=20):
    """Print the highest- and lowest-weighted features of a fitted linear model.

    Args:
        clf: object whose ``coef_[0]`` is a sequence of per-feature weights.
        vect: object whose ``vocabulary_`` maps each word to its feature index.
        limit: how many features to list at each end of the ranking.
            ``0`` lists none. If fewer than ``2 * limit`` features exist,
            each feature is still printed at most once.

    Returns:
        ``(coef, inverse)``, where `coef` is a list of ``(feature_index, weight)``
        pairs sorted by descending weight and `inverse` maps feature index
        to word.
    """
    coef = sorted(enumerate(clf.coef_[0]), key=lambda pair: pair[1], reverse=True)
    print('{} non-zero features, {} positive and {} negative:'.format(
            sum(abs(v) > 0 for _, v in coef),
            sum(v > 0 for _, v in coef),
            sum(v < 0 for _, v in coef),
        ))
    inverse = {idx: word for word, idx in vect.vocabulary_.items()}
    print()
    limit = max(limit, 0)
    head = coef[:limit]
    # An explicit start index is needed because coef[-0:] is the whole list,
    # not an empty one. Starting no earlier than `limit` also stops the tail
    # from repeating features already shown in the head.
    tail = coef[max(len(coef) - limit, limit):]
    for idx, c in head:
        print('%.3f %s' % (c, inverse[idx]))
    print('...')
    for idx, c in tail:
        print('%.3f %s' % (c, inverse[idx]))
    return coef, inverse

In [15]:
# Print the extreme features; keep the sorted weights and index->word map for the cells below.
coef, inverse = show_features(clf, vect)

71774 non-zero features, 23243 positive and 48531 negative:

688.888 found
445.767 page
439.469 not
355.587 jersey
278.512 looking
256.676 team
254.375 our
252.155 sports
250.977 404
229.840 day
219.138 link
218.999 privacy
218.659 county
215.893 list
203.594 log
203.054 we
200.588 news
194.744 free
192.996 ago
192.890 6pm
...
-629.734 soccer
-673.459 zł
-681.742 wiktionary
-771.850 und
-776.014 us
-778.663 販売価格
-808.476 download
-818.750 org0000https
-922.508 die
-962.915 sie
-1076.499 wikipedia
-1335.693 der
-1421.728 在庫切れのため
-1421.821 取り扱いが終了したため
-1496.346 out
-1647.605 ｔポイント10倍
-1691.107 sold
-1751.394 数量
-1751.802 カゴへ
-2846.576 現在ご購入いただけません


In [16]:
# Words whose weight exceeds 10, in descending-weight order (coef is already sorted).
good_features = [
    inverse[feature_idx]
    for feature_idx, feature_weight in coef
    if feature_weight > 10
]
len(good_features), good_features[:10]

(2075,
 ['found',
  'page',
  'not',
  'jersey',
  'looking',
  'team',
  'our',
  'sports',
  '404',
  'day'])

In [17]:
# Boolean masks over the test set for each kind of mistake.
misclassified = pred_Y != test_Y
false_neg = misclassified & (test_Y == True)
false_pos = misclassified & (pred_Y == True)
print('404 classified as 200: {}'.format(false_neg.sum()))
print('200 classified as 404: {}'.format(false_pos.sum()))

404 classified as 200: 65
200 classified as 404: 64


In [18]:
# URLs of the test items, read with the same data(test_idx) call that produced test_X.
test_urls = np.array([entry['url'] for entry in data(test_idx)])

In [19]:
# URLs of 404 pages that the classifier labelled as 200.
for url in test_urls[false_neg]:
    print(url)

https://teaser-trailer.com:443/movie/cozmplete-unknown/
http://www.supergoldenbakes.com/2014/03/coconut-oatmeal-with-blueberry-compote.htmlr
http://www.supergoldenbakes.com/2016/06/fully-loadqed-breakfast-frittata-with.html
http://news.mynavi.jp/news/2016/09/12/3w45/
http://news.mynavi.jp/news/2016/09/12/27v5/
http://ascii.jp/elem/000/001/228/1228j057/
http://news.mynavi.jp/articles/2016/09/12/snsg/?google_editors_picks=true
http://eastcoastmommyblog.blogspot.ca/2014/10/pool-ntoodle-witch-legs.html
http://www.behance.net/jewxelryportfolio
http://get.adobe.com/flashqplayer/
http://get.adobe.com/flashplayeqr/
http://www.huffingtonpost.co.uk/entry/toby-young-new-grammar-schools-free-schools-theresa-may-social-mobility_uk_5e7d66bb7e4b0ced6a099837a
http://get.adobe.com/flaashplayer/
http://get.adobe.com/fzlashplayer/
http://get.adobe.com/flashprlayer/
https://yho.com/ffsigqnup
https://yho.com/ffsigenup
http://get.adobe.com/flashplayegr/
http://www.purewow.com/entertainment/things-to-stop-pa

In [20]:
# URLs of 200 pages that the classifier labelled as 404.
for url in test_urls[false_pos]:
    print(url)

https://www.microsoft.com/en-us/store/apps/windows
https://www.microsoft.com/en-us/store/top-free/apps/pc
https://www.microsoft.com/en-us/store/apps/windows-phone
http://www.xiami.com/
http://djbox.dj129.com/index/d2plsbg93.html
http://djbox.dj129.com/index/d2lsbg93.html
http://www.jn.pt/pessoas/interior/morreu-alexis-arquette-atriz-transexual-do-pulp-fiction-5384643.html
http://pin-fitness.com/get-know-chakras/
http://www.europe1.fr/international/en-chine-le-pont-le-plus-haut-du-monde-sera-bientot-acheve-2844394
http://www.lemonde.fr/economie/article/2016/09/12/fin-de-partie-pour-areva-dans-l-eolien-en-mer_4996283_3234.html
http://www.lavoixdunord.fr/france-monde/curiosity-nous-rapporte-de-nouvelles-photos-stupefiantes-ia0b0n3726097
http://www.lavoixdunord.fr/region/cappelle-la-grande-une-secretaire-de-mairie-fait-un-ia17b47582n3723487
https://propellerads.com/it/
https://v2.propellerads.com/
http://vmestoputina.ru/about/
http://www.vdfly.com/
http://www.match.com/errors/customError.a