In [None]:
import os
import pandas as pd

# Category names are the sub-directories of "text". Both listings are sorted
# because os.listdir returns entries in arbitrary order, and the row order of
# `df` feeds the seeded train/test split later in the notebook.
categories = sorted(f for f in os.listdir("text") if os.path.isdir(os.path.join("text", f)))
print(categories)

# One (category, file path) record per article; the LICENSE.txt found in a
# category directory is not an article and is skipped.
articles = [
    (c, os.path.join("text", c, t))
    for c in categories
    for t in sorted(os.listdir(os.path.join("text", c)))
    if t != "LICENSE.txt"
]
df = pd.DataFrame(articles, columns=["target", "data"])
df

In [None]:
from jubatus.common import Datum

# Build one Datum per article, in the same order as the rows of `df`, holding
# the article's title line under the "title" key.
datum_list = []
for d in df["data"]:
    dt = Datum()
    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(d, encoding="utf-8") as f:
        # Only the third line is used, so read just the first three lines
        # instead of loading the whole article.
        f.readline()
        f.readline()
        title_line = f.readline()
    if not title_line:
        # readline() returns "" only at end of file, i.e. there is no third line.
        raise ValueError("%s has fewer than 3 lines; cannot read the title" % d)
    dt.add_string("title", title_line.rstrip())  # add the text data to the Datum
    datum_list.append(dt)

In [None]:
from embedded_jubatus import Classifier

# Dictionary directory handed to the MeCab splitter via "-d". It can be
# overridden with the MECAB_DIC_DIR environment variable so the notebook is
# not tied to one machine; the default is the originally hard-coded location.
MECAB_DIC_DIR = os.environ.get("MECAB_DIC_DIR", "/home/udagawa/local/lib/mecab/dic/ipadic")

# Classifier configuration: converter (feature extraction) settings,
# learner parameter and learning method.
config = {
    "converter": {
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        # Defines a string type named "mecab", loaded dynamically from
        # libmecab_splitter.so.
        "string_types": {
            "mecab": {
                "method": "dynamic",
                "path": "libmecab_splitter.so",
                "function": "create",
                "arg": "-d " + MECAB_DIC_DIR,
                "ngram": "1",
                "base": "true",
                "include_features": "*",
                "exclude_features": ""
            }
        },
        # Every string key ("*") is processed with the "mecab" type above.
        "string_rules": [
            {"key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "bin"}
        ],
        "num_types": {},
        "num_rules": [
            {"key": "*", "type": "num"}
        ]
    },
    "parameter": {
        "regularization_weight": 1.0
    },
    "method": "AROW"
}
# Instantiate the embedded Jubatus classifier from the `config` dictionary.
# The same `cl` object is cleared and retrained by the evaluation cells below.
cl = Classifier(config)

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
# Split the dataset into a training part and a held-out test part,
# stratified on the category label.
X_train, X_test, y_train, y_test = train_test_split(
    df["data"],
    df["target"],
    stratify=df["target"],
    random_state=42,
)

# Prepare the cross-validation splitter (stratified, shuffled, seeded).
num_splits = 4
kf = StratifiedKFold(
    n_splits=num_splits,
    shuffle=True,
    random_state=42,
)

In [None]:
import random
from sklearn.utils import shuffle

random.seed(42)

# One [true_labels, predicted_labels] pair is collected per fold.
y_cv_results = []
for fold, (train_index, test_index) in enumerate(kf.split(X_train.index, y_train)):
    # Clear the classifier at the start of every fold.
    cl.clear()

    # The splitter yields positions within X_train; translate them to the
    # df row labels that also index datum_list.
    train_rows = [X_train.index[i] for i in train_index]
    test_rows = [X_train.index[i] for i in test_index]

    # Build the list of (label, Datum) pairs and train Jubatus on it.
    training_data = [(df["target"][row], datum_list[row]) for row in train_rows]
    cl.train(training_data)

    # Have Jubatus classify this fold's held-out Datums.
    test_data = [datum_list[row] for row in test_rows]
    result = cl.classify(test_data)

    # The prediction is the label with the highest classification score.
    y_pred = [max(x, key=lambda candidate: candidate.score).label for x in result]

    # Matching ground-truth labels.
    y = [df["target"][row] for row in test_rows]

    y_cv_results.append([y, y_pred])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Flatten the per-fold [true, predicted] pairs into two aligned label lists
# covering every cross-validation fold, then report on them as a whole.
y_sum = [label for fold_truth, _ in y_cv_results for label in fold_truth]
y_pred_sum = [label for _, fold_pred in y_cv_results for label in fold_pred]

print(classification_report(y_sum, y_pred_sum))
print(confusion_matrix(y_sum, y_pred_sum))

In [None]:
# Final evaluation: clear the classifier, train on the whole training split,
# then score the held-out test split.
cl.clear()

training_data = [(df["target"][row], datum_list[row]) for row in X_train.index]
cl.train(training_data)

# Collect the test Datums and their true labels in the same row order.
test_data = []
y_test = []
for row in X_test.index:
    test_data.append(datum_list[row])
    y_test.append(df["target"][row])

r = cl.classify(test_data)

# The prediction is the label with the highest classification score.
y_pred = []
for x in r:
    best = max(x, key=lambda candidate: candidate.score)
    y_pred.append(best.label)

report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Save the trained classifier under the id "livedoor_title".
# NOTE(review): the next cell reads the result from
# /tmp/127.0.0.1_0_classifier_livedoor_title.jubatus -- presumably that is
# where save() writes; confirm on the target machine.
cl.save("livedoor_title")

In [None]:
%%bash
# Run jubadump on the saved model file and redirect its output to
# title_weights.json, which the following cell loads as JSON.
jubadump -i /tmp/127.0.0.1_0_classifier_livedoor_title.jubatus > title_weights.json

In [None]:
import json
# Load the dumped model. The context manager closes the file handle (the bare
# open() call left it open), and the explicit encoding keeps decoding
# independent of the platform locale.
with open("title_weights.json", encoding="utf-8") as f:
    j = json.load(f)
j

In [None]:
import re
# Build a feature-by-category weight table from the dumped model:
# one row per feature, one column per category, 0 where no weight is stored.
weights = {k: [] for k in categories}
index = []
for w, per_label in j["storage"]["storage"]["weight"].items():
    # The row name is the text between the first "$" and the last "@" of the
    # feature key. A capture group is used so that "$" or "@" characters that
    # belong to the token itself are preserved; keys that do not contain a
    # "$...@" part are kept unchanged instead of raising on a failed match.
    m = re.search(r"\$(.+)@", w)
    index.append(m.group(1) if m else w)
    for label in categories:
        # A feature may have no entry (or no "v1" value) for a category.
        weights[label].append(per_label.get(label, {}).get("v1", 0))
d = pd.DataFrame(weights, index=index)

In [None]:
# For every category, show the three features with the largest weights
# followed by the three with the smallest weights.
for category in categories:
    column = d[category]
    strongest = column.sort_values(ascending=False).head(3)
    weakest = column.sort_values().head(3)

    print(category)
    print("positive feature")
    print(strongest)
    print("")
    print("negative feature")
    print(weakest)
    print("")