In [34]:
import json_lines, json, gzip, csv
from itertools import islice
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Load the URL -> topic-path mapping from a gzipped CSV. ``dict()`` over the
# reader requires every row to have exactly two fields (url, topics).
#
# The encoding is pinned to UTF-8 (topic names contain non-ASCII characters)
# instead of depending on the locale default, and ``newline=''`` is passed as
# the csv module requires so quoted fields with embedded newlines parse
# correctly.
# NOTE(review): assumes the dump was written as UTF-8 — confirm.
with gzip.open('../../dmoz/content_url_topics.csv.gz', 'rt',
               encoding='utf-8', newline='') as f:
    topics_by_url = dict(csv.reader(f))

In [77]:
# Topic path of every document yielded by ``gen``, in yield order.
xs_topics = []


def gen(limit=None):
    """Yield the ``'text'`` field of each record in ``../dmoz-text.jl.gz``.

    Side effect: the module-level ``xs_topics`` list is emptied when
    iteration starts (on the first ``next()``, not when ``gen`` is called)
    and then receives, for every record, the topic path looked up in
    ``topics_by_url`` by the record's ``'url'`` (``''`` when the URL is not
    in the mapping), appended just before that record's text is yielded.

    :param limit: maximum number of records to read; ``None`` reads all.
    """
    xs_topics.clear()
    with json_lines.open('../dmoz-text.jl.gz') as reader:
        # islice(..., None) runs to exhaustion, so no special case is needed.
        for record in islice(reader, limit):
            topic_path = topics_by_url.get(record['url'], '')
            xs_topics.append(topic_path)
            yield record['text']


vec = TfidfVectorizer(max_features=200000)
# Consuming the generator here is what fills ``xs_topics``.
all_xs = vec.fit_transform(gen(limit=100000))

In [94]:
# Shuffle the feature rows and their topic paths with one shared permutation,
# then hold out the first ``n_valid`` rows for validation.
N = len(xs_topics)
# The feature matrix and the topic list must describe the same documents;
# fail loudly if they have drifted apart (e.g. ``gen`` was consumed again
# after ``all_xs`` was built).
assert all_xs.shape[0] == N, (all_xs.shape[0], N)

# Fixed seed so the permutation — and therefore the train/validation split —
# is the same on every fresh run. Note the cell still rebinds ``all_xs`` and
# ``xs_topics``: re-running it permutes the already-shuffled data again (rows
# and topics stay aligned, but the split changes).
split_rng = np.random.RandomState(42)
indices = split_rng.permutation(N)
all_xs = all_xs[indices]
xs_topics = [xs_topics[idx] for idx in indices]

# Validation set: 20% of the data, capped at 5000 rows.
n_valid = min(5000, int(N * 0.2))
xs_valid = all_xs[:n_valid]
xs_train = all_xs[n_valid:]

# Split every topic path on '/', drop its first component, and drop
# components that are a single upper-case character
# (presumably alphabetical index buckets — TODO confirm).
all_ys = [[t for t in item_topics.split('/')[1:]
           if not (len(t) == 1 and t.isupper())]
          for item_topics in xs_topics]

# Keep only the ``n_topics`` most frequent components as labels. The counts
# are taken over all documents, validation rows included.
n_topics = 1000
topic_counts = Counter(t for item_topics in all_ys for t in item_topics)
most_common_topics = {t for t, _ in topic_counts.most_common(n_topics)}
all_ys = [[t for t in item_topics if t in most_common_topics]
          for item_topics in all_ys]

# Same boundary as the feature split above.
ys_valid = all_ys[:n_valid]
ys_train = all_ys[n_valid:]

len(ys_valid), len(ys_train)

(5000, 95000)

In [92]:
# Ten most frequent topic components, counted over every loaded document
# (before the label set is restricted to the top ``n_topics``).
topic_counts.most_common(10)

[('World', 46353),
 ('Regional', 34901),
 ('North_America', 19029),
 ('United_States', 18497),
 ('Localities', 14090),
 ('Deutsch', 12404),
 ('Europa', 11644),
 ('Europe', 9831),
 ('Business_and_Economy', 8732),
 ('Français', 6442)]

In [95]:
# The ten least frequent topics that still make the top-``n_topics`` cut,
# i.e. the rarest labels the classifier is trained on.
topic_counts.most_common(n_topics)[-10:]

[('Artes_escénicas', 62),
 ('Zuhause', 62),
 ('Localităţi', 62),
 ('Antwerpen', 62),
 ('Religia_i_duchowość', 61),
 ('Zeitschriften_und_Online-Magazine', 61),
 ('Kultur_und_Unterhaltung', 61),
 ('Winter_Sports', 61),
 ('Zakupy', 61),
 ('Services_aux_entreprises', 61)]

In [63]:
import random

def get_ys(ys):
    """Turn per-item topic lists into one integer class id per item.

    For an item with at least one topic, a single topic is picked with
    ``random.choice``, so repeated calls can return different targets for
    multi-topic items. An item with no topics gets the ``no_topic`` class.
    Names are mapped to ids through the module-level ``label_ids``.

    :param ys: iterable of topic-name sequences, one per item.
    :return: list of class ids, in the same order as ``ys``.
    """
    targets = []
    for topics in ys:
        if topics:
            chosen = random.choice(topics)
        else:
            chosen = no_topic
        targets.append(label_ids[chosen])
    return targets

In [96]:
from sklearn.metrics import log_loss

def get_log_loss(xs, ys):
    """Log loss of the module-level ``clf`` on ``xs`` against ``ys``.

    Targets come from ``get_ys(ys)``; predictions come from
    ``clf.predict_proba(xs)``. ``labels=classes`` is passed so the loss is
    computed over the full class list even if some classes are absent from
    this sample.

    :param xs: feature rows accepted by ``clf.predict_proba``.
    :param ys: per-row topic lists, aligned with ``xs``.
    :return: the value returned by ``sklearn.metrics.log_loss``.
    """
    y_true = get_ys(ys)
    y_prob = clf.predict_proba(xs)
    return log_loss(y_true, y_prob, labels=classes)

In [97]:
from sklearn.linear_model import SGDClassifier

# Label space: the retained topics in sorted order, followed by a catch-all
# class for documents that are left with no topic.
no_topic = 'no_topic'
assert no_topic not in most_common_topics
labels = sorted(most_common_topics)
labels.append(no_topic)
label_ids = {label: idx for idx, label in enumerate(labels)}
classes = sorted(label_ids.values())

# Fixed seeds so a fresh run is repeatable: ``random`` drives the per-item
# label sampling in ``get_ys`` and ``random_state`` drives the classifier's
# own shuffling.
seed = 42
n_epochs = 10
random.seed(seed)

# NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
# 'log' is kept here to match the version this notebook was run with.
clf = SGDClassifier(loss='log', n_jobs=-1, random_state=seed)
for epoch in range(n_epochs):
    # Each pass draws a fresh random topic per multi-topic document, so the
    # targets differ from epoch to epoch. ``classes`` is passed on every call
    # because partial_fit needs the full class list up front.
    clf.partial_fit(xs_train, get_ys(ys_train), classes=classes)
    # Train loss is measured on a slice the same size as the validation set
    # so the two numbers are comparable.
    print('Epoch {epoch}: train log loss: {train:.2f}, valid log loss: {valid:.2f}'.format(
        epoch=epoch + 1,
        train=get_log_loss(xs_train[:n_valid], ys_train[:n_valid]),
        valid=get_log_loss(xs_valid, ys_valid)))

Epoch 1: train log loss: 5.23, valid log loss: 5.25
Epoch 2: train log loss: 5.18, valid log loss: 5.20
Epoch 3: train log loss: 5.16, valid log loss: 5.16
Epoch 4: train log loss: 5.15, valid log loss: 5.16
Epoch 5: train log loss: 5.12, valid log loss: 5.18
Epoch 6: train log loss: 5.14, valid log loss: 5.16
Epoch 7: train log loss: 5.15, valid log loss: 5.14
Epoch 8: train log loss: 5.11, valid log loss: 5.12
Epoch 9: train log loss: 5.11, valid log loss: 5.14
Epoch 10: train log loss: 5.10, valid log loss: 5.11


In [113]:
import pickle
from sklearn.pipeline import make_pipeline

# Bundle the fitted vectorizer and classifier so raw text can be classified
# with a single ``predict_proba`` call, and store the label names alongside
# (index in ``labels`` == class id).
pipeline = make_pipeline(vec, clf)

# A fitted vectorizer built with ``max_features`` keeps every discarded term
# in ``stop_words_``. scikit-learn documents that attribute as
# introspection-only and safe to blank before pickling; dropping it keeps
# those terms out of the file. It is restored afterwards so the in-kernel
# ``vec`` is left as it was.
discarded_terms = getattr(vec, 'stop_words_', None)
vec.stop_words_ = None
try:
    with open('../dmoz_sklearn.pkl', 'wb') as f:
        pickle.dump({'pipeline': pipeline, 'labels': labels}, f,
                    protocol=pickle.HIGHEST_PROTOCOL)
finally:
    vec.stop_words_ = discarded_terms

In [115]:
# Show the on-disk size of the pickled model written above.
! ls -lh ../dmoz_sklearn.pkl

-rw-rw-r-- 1 kostia kostia 1,6G дек  8 14:15 ../dmoz_sklearn.pkl
