In [1]:
from functools import partial
from soft404.train import *

# Pre-bind the dataset path so the rest of the notebook can simply call
# reader() (optionally with extra arguments) whenever it needs the items.
reader = partial(file_reader, '../text_items_small.pkls')

# Pull a single item just to inspect which fields are available.
item = next(iter(reader()))
item.keys()

dict_keys(['title', 'text', 'idx', 'blocks', 'url', 'status', 'lang'])

In [2]:
# One pass over the dataset, keeping only the (idx, lang) pair of each item.
langs = [(entry['idx'], entry['lang']) for entry in reader()]

# Show the ten most frequent languages with right-aligned counts.
for lang, count in Counter(code for _, code in langs).most_common(10):
    print('{:>10} {}'.format(count, lang))

# Indices of the items whose language is 'en'; used as the filter further down.
lang_indices = set(idx for idx, code in langs if code == 'en')

     44150 en
     42449 zh-cn
      6137 ko
      3715 ru
      3061 fr
      2654 de
      2586 ja
      2259 it
      2135 pt
      1540 


In [3]:
# The set of item indices every later cell is restricted to.
flt_indices = lang_indices


def data(indices=None):
    """Iterate over the dataset items allowed by ``flt_indices``.

    Args:
        indices: optional iterable of item indices. When given, it is
            intersected with ``flt_indices``; when omitted, ``flt_indices``
            itself is used.

    Yields:
        Whatever ``reader`` yields when called with the resulting set of
        indices (presumably only the items with those indices).
    """
    selected = flt_indices if indices is None else set(indices) & flt_indices
    yield from reader(selected)

In [4]:
from sklearn.cross_validation import LabelKFold

# (idx, url) pairs for every item that passes the language filter.
urls = [(entry['idx'], entry['url']) for entry in data()]

# Use the domain of each URL as the fold label, so that pages sharing a
# domain are kept on the same side of the split.
lkf = LabelKFold(list(map(get_domain, (url for _, url in urls))), n_folds=10)

# Only the first of the ten folds is used as the train/test split.
_train_idx, _test_idx = next(iter(lkf))
train_idx = to_data_idx(_train_idx, urls)
test_idx = to_data_idx(_test_idx, urls)

# The test set is materialised once and reused by the evaluation cells.
test_X, test_Y = get_xy(data(test_idx))
print(len(train_idx), len(test_idx))

39735 4415


In [5]:
# Ten most frequent domains among the training items.
Counter(map(get_domain, (entry['url'] for entry in data(train_idx)))).most_common(10)

[('elegantweddinginvites.com', 68),
 ('baltimoremagazine.net', 68),
 ('commerce.gov', 68),
 ('petsmartcharities.org', 66),
 ('herecomethegirlsblog.com', 66),
 ('kristendukephotography.com', 66),
 ('tradeeasy.com', 64),
 ('lastscience.net', 64),
 ('ana.net', 64),
 ('livingly.com', 62)]

In [6]:
# Ten most frequent domains among the test items.
Counter(map(get_domain, (entry['url'] for entry in data(test_idx)))).most_common(10)

[('szdaily.com', 69),
 ('gamespot.com', 58),
 ('nerdwallet.com', 53),
 ('thejakartapost.com', 52),
 ('watches.com', 48),
 ('rio2016.com', 46),
 ('comixology.co.uk', 45),
 ('football365.com', 42),
 ('w3techs.com', 42),
 ('marketingdive.com', 40)]

In [7]:
from sklearn.utils.class_weight import compute_class_weight

# Label values passed to compute_class_weight and used as the keys of the
# final class_weight dict.
classes = [False, True]
print('Test class weight:', compute_class_weight('balanced', classes, test_Y))

# Shuffle the training indices in place with a fixed seed, so that the sample
# used for the class weights below (and the order in which later cells see the
# training data) is the same on every Restart & Run All. Note that re-running
# this cell alone shuffles the already-shuffled list again.
np.random.RandomState(42).shuffle(train_idx)

# Estimate the training class weights from the first 1000 shuffled items
# instead of reading the labels of the whole training set.
class_weight = compute_class_weight(
    'balanced', classes, get_xy(data(train_idx[:1000]), only_ys=True))
print('Train class weight: ', class_weight)

# Re-bind the name to a {class: weight} mapping.
class_weight = dict(zip(classes, class_weight))

Test class weight: [ 0.69681187  1.7702486 ]
Train class weight:  [ 0.65359477  2.12765957]


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over unigrams and bigrams, with the vocabulary capped at
# 50000 features.
vect = CountVectorizer(
    token_pattern=token_pattern,
    ngram_range=(1, 2),
    max_features=50000,
)
# The vocabulary is learned from the training items only; the texts are
# streamed lazily. The fitted vectorizer is the cell's displayed value.
vect.fit(map(item_to_text, data(train_idx)))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b[_\\w][_\\w]+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
from sklearn.linear_model import SGDClassifier

# Logistic-regression loss with an L1 penalty. random_state is fixed so that
# the estimator's own randomness does not change the fitted model (and the
# metrics reported below) from one run of the notebook to the next.
# NOTE(review): the class_weight dict computed in the earlier cell is not
# passed here (class_weight=None) — confirm that this is intentional.
clf = SGDClassifier(loss='log', class_weight=None, penalty='l1', random_state=42)
# train_clf is not defined in this notebook (presumably it comes from the
# soft404.train star import); it receives the classifier, the fitted
# vectorizer, the data() generator function, the training indices and the
# class labels.
train_clf(clf, vect, data, train_idx, classes)

In [10]:
from sklearn import metrics
# Vectorise the held-out texts and predict a label for each of them.
pred_Y = clf.predict(vect.transform(test_X))

# Per-class precision / recall / F1 on the test split.
print(metrics.classification_report(
    y_true=test_Y,
    y_pred=pred_Y,
    target_names=['200', '404'],
))

             precision    recall  f1-score   support

        200       0.95      0.95      0.95      3168
        404       0.88      0.88      0.88      1247

avg / total       0.93      0.93      0.93      4415



In [11]:
# Score the test split by the second column of predict_proba.
print('ROC AUC: {:.3f}'.format(
    metrics.roc_auc_score(
        y_true=test_Y,
        y_score=clf.predict_proba(vect.transform(test_X))[:, 1],
    )
))

ROC AUC: 0.915


  np.exp(prob, prob)


In [None]:
# show_features is not defined in this notebook (presumably it comes from the
# soft404.train star import). Judging by how the results are used in the next
# cell, `coef` iterates as (feature index, weight) pairs and `inverse` maps a
# feature index back to its feature — TODO confirm against the helper.
coef, inverse = show_features(clf, vect)

In [None]:
# Features whose weight exceeds 10, looked up through `inverse`.
good_features = [
    inverse[feature_idx]
    for feature_idx, feature_weight in coef
    if feature_weight > 10
]
# Display how many there are together with the first ten.
(len(good_features), good_features[:10])

In [12]:
# Boolean masks over the test split.
# false_neg: the true label is True but the prediction differs.
false_neg = (test_Y == True) & (pred_Y != test_Y)
# false_pos: the prediction is True but the true label differs.
false_pos = (pred_Y == True) & (pred_Y != test_Y)

print('404 classified as 200: {}'.format(false_neg.sum()))
print('200 classified as 404: {}'.format(false_pos.sum()))

404 classified as 200: 154
200 classified as 404: 150


In [13]:
# URLs of the test items as a NumPy array, so that they can be indexed with
# the boolean error masks.
test_urls = np.array([entry['url'] for entry in data(test_idx)])

In [14]:
# List the URLs selected by the false_neg mask (reported above as
# "404 classified as 200"). This relies on test_urls being in the same order
# as test_Y; both are built from data(test_idx).
for url in test_urls[false_neg]:
    print(url)

http://poki.com/enw
https://www.npd.com/wps/portal/npd/us/news/press-releases/2015/limited-time-offer-pumpmkin-beverages-spice-up-business-for-foodservice-operators/
http://blog.myfitnesspal.com/pumpkin-spice-uhas-passed-its-peak/
http://t.qq.com/messages/imnbox?pref=qqcom.home.wbinbox
http://fineartamerica.com/featured/fuschia-flower-davidg-campione.html
https://en.wikiquote.org/wiki/Main_Pages
http://www.chipp.cn/node_83951.htm
http://www.chipp.cn/2015-12/30/content_18302510.htm
http://www.chipp.cn/node_83968.htm
http://spaceflight101.com/orbital-actk/
http://spaceflight101.com/sea-launchp/
http://spaceflight101.com/japaon/
http://spaceflight101.com/chqina/
http://www.nasa.gov/mission_pages/station/expeditions/expedition49/isndex.html
http://www.nasa.gov/specials/ocean-worldsx/
http://www.littlethings.com/tiny-mgiracles/
http://www.littlethings.com/category/dfiy-2/
http://www.littlethings.com/freshd-apples/
http://spaceflight101.com/news/feezd/
http://spaceflight101.com/soyuz-lomonom

In [15]:
# List the URLs selected by the false_pos mask (reported above as
# "200 classified as 404"). This relies on test_urls being in the same order
# as pred_Y; both are derived from data(test_idx).
for url in test_urls[false_pos]:
    print(url)

http://www.mrlovenstein.com/404
http://usa.chinadaily.com.cn/node_1086701.htm
https://www.google.de/maps?hl=de&tab=wl
https://www.nerdwallet.com/blog/shopping/daily-deals-ebay-summer-choice-deal-apple-watch-fitbit-charge-hr/
http://www.ipr.gov.cn/404.shtml
http://www.ietehui.com/
http://www.getpaint.net/?thumb=
http://www.getpaint.net/
http://www.fashiontimes.com/articles/27668/20160913/new-york-fashion-week-updates-day-6-live-stream-tory-burch-vera-wang.htm
http://www.fashiontimes.com/articles/27646/20160910/nicole-miller-ss17-runway-show-photos-new-york-fashion-week.htm
http://www.fashiontimes.com/articles/27665/20160912/christian-siriano-collection-inspired-by-the-isle-of-capri-new-york-fashion-week-september-2016.htm
http://www.fashiontimes.com/articles/27160/20160720/lee-lani-makes-miami-swim-week-runway-debut-ss17-collection.htm
http://www.youronlinechoices.com/ie/
https://www.internetalerts.org/
http://www.billboardevents.com/
http://www.livemint.com/Object/yfY9DbaXROhxhhImL9rG1