In [None]:
import pandas as pd
import os
# Every sub-directory of "text" is treated as one category; the directory name
# becomes the class label in the "target" column.
# os.listdir returns entries in arbitrary order, so both listings are sorted to
# keep the row order of df (and the seeded splits derived from it) identical
# from run to run and machine to machine.
categories = sorted(f for f in os.listdir("text") if os.path.isdir(os.path.join("text", f)))
print(categories)
articles = []
for c in categories:
    category_dir = os.path.join("text", c)
    # One (label, file path) pair per file; the per-category LICENSE.txt is skipped.
    articles.extend(
        (c, os.path.join(category_dir, t))
        for t in sorted(os.listdir(category_dir))
        if t != "LICENSE.txt"
    )
df = pd.DataFrame(articles, columns=["target", "data"])

In [None]:
from jubatus.common import Datum

# Build one Datum per article, in the same order as the rows of df; later cells
# look entries up in datum_list using df's index values.
datum_list = []
for d in df["data"]:
    dt = Datum()
    # Decode explicitly instead of relying on the platform's default encoding.
    # NOTE(review): assumes the corpus files are UTF-8 encoded — confirm against the data.
    with open(d, encoding="utf-8") as f:
        lines = f.readlines()
    # Only the third line of each file is used as the document text.
    doc = lines[2].rstrip()
    # Alternative: use everything after the third line instead.
    # doc = "".join(lines[3:]).replace("\n", "")
    dt.add_string("body", doc)
    datum_list.append(dt)

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Cross-validation splitter used on the training portion: shuffled, stratified folds.
num_splits = 4
kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# Stratified hold-out split of file paths ("data") and labels ("target").
# No test_size is passed, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    df["data"],
    df["target"],
    stratify=df["target"],
    random_state=42,
)

In [None]:
from embedded_jubatus import Classifier
# Create the embedded Jubatus classifier from the JSON configuration file on disk.
arow_config_path = "./jubatus_config/arow.json"
cl = Classifier(arow_config_path)

In [None]:
# cross validation
import random
from sklearn.utils import shuffle
random.seed(42)
# One [true_labels, predicted_labels] pair is collected per fold.
y_cv_results = []
for fold, (train_index, test_index) in enumerate(kf.split(X_train.index, y_train)):
    # Clear the classifier before training on this fold's data.
    cl.clear()
    print("{} fold".format(fold))
    print(len(train_index), len(test_index))

    # kf.split yields positions within the training split; translate them to
    # df index values before looking up labels and datums.
    training_data = []
    for i in train_index:
        row = X_train.index[i]
        training_data.append((df["target"][row], datum_list[row]))
    cl.train(training_data)

    held_out_rows = [y_train.index[i] for i in test_index]
    test_data = [datum_list[row] for row in held_out_rows]
    y = [df["target"][row] for row in held_out_rows]

    result = cl.classify(test_data)
    # For each document keep the label of its highest-scoring estimate.
    y_pred = []
    for scores in result:
        best = max(scores, key=lambda s: s.score)
        y_pred.append(best.label)
    y_cv_results.append([y, y_pred])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Pool the true and predicted labels of every fold, then report on the pooled set.
y_sum = [label for fold_true, _ in y_cv_results for label in fold_true]
y_pred_sum = [label for _, fold_pred in y_cv_results for label in fold_pred]
print(classification_report(y_sum, y_pred_sum))
print(confusion_matrix(y_sum, y_pred_sum))

In [None]:
# holdout: clear the classifier, train on the full training split, classify the test split
cl.clear()
targets = df["target"]

training_data = []
for i in X_train.index:
    training_data.append((targets[i], datum_list[i]))
cl.train(training_data)

# y_test is rebuilt here as a plain list of labels, in the same order as test_data.
test_data = []
y_test = []
for i in X_test.index:
    test_data.append(datum_list[i])
    y_test.append(targets[i])
r = cl.classify(test_data)

In [None]:
# For each hold-out document keep the label of its highest-scoring estimate.
y_pred = []
for scores in r:
    top = max(scores, key=lambda s: s.score)
    y_pred.append(top.label)

report = classification_report(y_test, y_pred)
print(report)
print(confusion_matrix(y_test, y_pred))