In [None]:
import os
import pandas as pd
import json
import random
from jubatus.common import Datum
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from embedded_jubatus import Classifier

# Load the corpus and build a DataFrame with one row per article:
# "target" is the category (directory name under text/), "data" is the file path.
# os.listdir returns entries in arbitrary order, so both listings are sorted to
# give every run the same row order; the seeded splits made later in this
# notebook are only reproducible if the rows always come in the same order.
categories = sorted(f for f in os.listdir("text") if os.path.isdir(os.path.join("text", f)))
print(categories)
articles = []
for c in categories:
    category_dir = os.path.join("text", c)
    # LICENSE.txt is skipped; it is not treated as an article
    articles.extend(
        (c, os.path.join(category_dir, t))
        for t in sorted(os.listdir(category_dir))
        if t != "LICENSE.txt"
    )
df = pd.DataFrame(articles, columns=["target", "data"])




In [None]:
# Build the list of Datum objects up front, one per row of df and in the same
# order, so that a row's Datum can be fetched by its position later on.
datum_list = []
for d in df["data"]:
    dt = Datum()
    # Read with an explicit encoding so the result does not depend on the
    # platform's locale default (assumes the corpus files are UTF-8 -- TODO confirm).
    with open(d, encoding="utf-8") as f:
        lines = f.readlines()
    # The third line of each article file is used as its title
    doc = lines[2].rstrip()
    dt.add_string("title", doc)  # add the text to the Datum under the key "title"
    datum_list.append(dt)


In [None]:
# Split the dataset into a training part and a test part, stratified by label
X_train, X_test, y_train, y_test = train_test_split(
    df["data"],
    df["target"],
    stratify=df["target"],
    random_state=42,
)

# Prepare the cross-validation splitter (shuffled, stratified folds)
num_splits = 4
kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

In [None]:
# Jubatus setup
# Root directory of the MeCab dictionaries. The default is the path this
# notebook was written with; set the JUBATUS_MECAB_DIC_ROOT environment
# variable to run it on a machine where the dictionaries live elsewhere.
MECAB_DIC_ROOT = os.environ.get("JUBATUS_MECAB_DIC_ROOT", "/home/TkrUdagawa/local/lib/mecab/dic")
config = {
    "converter": {
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        "string_types": {
            # MeCab splitter plugin, configured with the ipadic dictionary and ngram "1"
            "mecab": {
                "method": "dynamic",
                "path": "libmecab_splitter.so",
                "function": "create",
                "arg": "-d {}/ipadic".format(MECAB_DIC_ROOT),
                "ngram": "1",
                "base": "true",
                "include_features": "*",
                "exclude_features": "",
            },
        },
        # Apply the "mecab" type to every string key
        "string_rules": [
            {"key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "bin"},
        ],
        "num_types": {},
        "num_rules": [
            {"key": "*", "type": "num"},
        ],
    },
    "parameter": {
        "regularization_weight": 1.0,
    },
    "method": "AROW",
}
cl = Classifier(config)


In [None]:
# Helper that runs cross-validation
def do_cv(cl, n=3):
    """Cross-validate ``cl`` on the training split and print pooled metrics.

    For each fold yielded by the module-level ``kf`` over ``X_train.index`` /
    ``y_train``, the classifier is cleared, trained ``n`` times on the fold's
    training rows and then asked to classify the fold's remaining rows. True
    labels and predictions of all folds are concatenated, and a classification
    report and a confusion matrix over them are printed.

    Args:
        cl: classifier object providing ``clear()``, ``train()`` and ``classify()``.
        n: how many times the same training list is passed to ``cl.train``.
    """
    random.seed(42)
    row_labels = X_train.index
    targets = df["target"]
    y_sum = []
    y_pred_sum = []
    for train_index, test_index in kf.split(row_labels, y_train):
        # Each fold starts from an empty model
        cl.clear()

        # Fold indexes are positions within X_train; map them to df row labels
        train_rows = [row_labels[i] for i in train_index]
        test_rows = [row_labels[i] for i in test_index]

        # Train Jubatus on a list of (label, Datum) pairs
        training_data = [(targets[row], datum_list[row]) for row in train_rows]
        for _ in range(n):
            cl.train(training_data)

        # Let Jubatus classify the rows held out of this fold
        result = cl.classify([datum_list[row] for row in test_rows])

        # The prediction is the label carrying the highest classification score
        y_pred_sum.extend(
            max(estimates, key=lambda est: est.score).label for estimates in result
        )

        # Matching ground-truth labels
        y_sum.extend(targets[row] for row in test_rows)
    print(classification_report(y_sum, y_pred_sum, digits=4))
    print(confusion_matrix(y_sum, y_pred_sum))

# Helper that runs hold-out validation
def do_holdout(cl, n):
    """Train ``cl`` on the training split and print metrics on the test split.

    The classifier is cleared first, so a ``cl`` that was already used (for
    example by ``do_cv``) does not carry a previously learned model into the
    hold-out run. It is then trained ``n`` times on all rows of ``X_train`` and
    evaluated on the rows of ``X_test``; a classification report is printed.

    Args:
        cl: classifier object providing ``clear()``, ``train()`` and ``classify()``.
        n: how many times the same training list is passed to ``cl.train``.
    """
    random.seed(42)
    targets = df["target"]
    training_data = [(targets[i], datum_list[i]) for i in X_train.index]
    test_data = [datum_list[i] for i in X_test.index]
    y_true = [targets[i] for i in X_test.index]

    # Start from an empty model, consistent with what do_cv does for each fold
    cl.clear()
    for _ in range(n):
        cl.train(training_data)
    result = cl.classify(test_data)
    # The prediction is the label carrying the highest classification score
    y_pred = [max(x, key=lambda est: est.score).label for x in result]

    print(classification_report(y_true=y_true, y_pred=y_pred, digits=4))


In [None]:
do_cv(cl, 1)
# Reset the model left over from the last CV fold so that the hold-out run
# trains from scratch instead of continuing from that fold's model.
cl.clear()
do_holdout(cl, 1)

In [None]:
# Repeated training: cross-validate with 2, 3, 4 and 5 passes over the training data
for n_passes in range(2, 6):
    do_cv(cl, n_passes)

In [None]:
# Switch the dictionary (mecab-ipadic-neologd instead of ipadic)
# Root directory of the MeCab dictionaries. The default is the path this
# notebook was written with; set the JUBATUS_MECAB_DIC_ROOT environment
# variable to run it on a machine where the dictionaries live elsewhere.
MECAB_DIC_ROOT = os.environ.get("JUBATUS_MECAB_DIC_ROOT", "/home/TkrUdagawa/local/lib/mecab/dic")
config = {
    "converter": {
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        "string_types": {
            # Same splitter plugin as before; only the dictionary passed via "arg" differs
            "mecab": {
                "method": "dynamic",
                "path": "libmecab_splitter.so",
                "function": "create",
                "arg": "-d {}/mecab-ipadic-neologd/".format(MECAB_DIC_ROOT),
                "ngram": "1",
                "base": "true",
                "include_features": "*",
                "exclude_features": "",
            },
        },
        "string_rules": [
            {"key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "bin"},
        ],
        "num_types": {},
        "num_rules": [
            {"key": "*", "type": "num"},
        ],
    },
    "parameter": {
        "regularization_weight": 1.0,
    },
    "method": "AROW",
}
cl = Classifier(config)
do_cv(cl, 3)

In [None]:
# Add bi-grams

# Root directory of the MeCab dictionaries. The default is the path this
# notebook was written with; set the JUBATUS_MECAB_DIC_ROOT environment
# variable to run it on a machine where the dictionaries live elsewhere.
MECAB_DIC_ROOT = os.environ.get("JUBATUS_MECAB_DIC_ROOT", "/home/TkrUdagawa/local/lib/mecab/dic")

# MeCab splitter plugin with the ipadic dictionary, ngram "1"
mecab_unigram = {
    "method": "dynamic",
    "path": "libmecab_splitter.so",
    "function": "create",
    "arg": "-d {}/ipadic".format(MECAB_DIC_ROOT),
    "ngram": "1",
    "base": "true",
    "include_features": "*",
    "exclude_features": "",
}
# Identical settings except for ngram "2" (a separate dict, not an alias)
mecab_bigram = dict(mecab_unigram, ngram="2")

config = {
    "converter": {
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        "string_types": {
            "mecab": mecab_unigram,
            "mecab-bi": mecab_bigram,
        },
        # Both splitter types are applied to every string key
        "string_rules": [
            {"key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "bin"},
            {"key": "*", "type": "mecab-bi", "sample_weight": "bin", "global_weight": "bin"},
        ],
        "num_types": {},
        "num_rules": [
            {"key": "*", "type": "num"},
        ],
    },
    "parameter": {
        "regularization_weight": 1.0,
    },
    "method": "AROW",
}
cl = Classifier(config)
do_cv(cl, 3)

In [None]:
# Use tf-idf weighting
# Root directory of the MeCab dictionaries. The default is the path this
# notebook was written with; set the JUBATUS_MECAB_DIC_ROOT environment
# variable to run it on a machine where the dictionaries live elsewhere.
MECAB_DIC_ROOT = os.environ.get("JUBATUS_MECAB_DIC_ROOT", "/home/TkrUdagawa/local/lib/mecab/dic")

# MeCab splitter plugin with the ipadic dictionary, ngram "1"
mecab_unigram = {
    "method": "dynamic",
    "path": "libmecab_splitter.so",
    "function": "create",
    "arg": "-d {}/ipadic".format(MECAB_DIC_ROOT),
    "ngram": "1",
    "base": "true",
    "include_features": "*",
    "exclude_features": "",
}
# Identical settings except for ngram "2" (a separate dict, not an alias)
mecab_bigram = dict(mecab_unigram, ngram="2")

config = {
    "converter": {
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        "string_types": {
            "mecab": mecab_unigram,
            "mecab-bi": mecab_bigram,
        },
        # sample_weight "tf" with global_weight "idf" replaces the "bin"/"bin" weights
        "string_rules": [
            {"key": "*", "type": "mecab", "sample_weight": "tf", "global_weight": "idf"},
            {"key": "*", "type": "mecab-bi", "sample_weight": "tf", "global_weight": "idf"},
        ],
        "num_types": {},
        "num_rules": [
            {"key": "*", "type": "num"},
        ],
    },
    "parameter": {
        "regularization_weight": 1.0,
    },
    "method": "AROW",
}
cl = Classifier(config)
do_cv(cl, 3)

In [None]:
# Algorithm selection and parameter tuning
# Root directory of the MeCab dictionaries. The default is the path this
# notebook was written with; set the JUBATUS_MECAB_DIC_ROOT environment
# variable to run it on a machine where the dictionaries live elsewhere.
MECAB_DIC_ROOT = os.environ.get("JUBATUS_MECAB_DIC_ROOT", "/home/TkrUdagawa/local/lib/mecab/dic")

# MeCab splitter plugin with the ipadic dictionary, ngram "1"
mecab_unigram = {
    "method": "dynamic",
    "path": "libmecab_splitter.so",
    "function": "create",
    "arg": "-d {}/ipadic".format(MECAB_DIC_ROOT),
    "ngram": "1",
    "base": "true",
    "include_features": "*",
    "exclude_features": "",
}
# Identical settings except for ngram "2" (a separate dict, not an alias)
mecab_bigram = dict(mecab_unigram, ngram="2")

# Base configuration; "method" and "regularization_weight" are the values
# that the tuning loop following this definition overwrites.
config = {
    "converter": {
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        "string_types": {
            "mecab": mecab_unigram,
            "mecab-bi": mecab_bigram,
        },
        "string_rules": [
            {"key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "bin"},
            {"key": "*", "type": "mecab-bi", "sample_weight": "bin", "global_weight": "bin"},
        ],
        "num_types": {},
        "num_rules": [
            {"key": "*", "type": "num"},
        ],
    },
    "parameter": {
        "regularization_weight": 1.0,
    },
    "method": "AROW",
}

reg = [0.01, 0.1, 0.5, 1.0, 10.0]
algorithms = ["CW", "AROW"]
# Try every (algorithm, regularization weight) pair, algorithm by algorithm,
# building a fresh classifier from the updated config for each pair.
for alg, r in [(name, weight) for name in algorithms for weight in reg]:
    print(alg, r)
    config["method"] = alg
    config["parameter"]["regularization_weight"] = r
    cl = Classifier(config)
    do_cv(cl, 3)

In [None]:
# COTOHA API利用のための関数の作成
import requests
import json


# The values below are shown on the COTOHA API Portal after logging in.
# Each one is taken from an environment variable when it is set, so that real
# credentials do not have to be written into (and saved with) the notebook.
# Without the variable, the original placeholder text is kept and must be
# replaced by hand as before.
CLIENT_SECRET = os.environ.get("COTOHA_CLIENT_SECRET", "CLIENT SECRETを入れる")
CLIENT_ID = os.environ.get("COTOHA_CLIENT_ID", "CLIENT IDを入れる")
TOKEN_URL = os.environ.get("COTOHA_TOKEN_URL", "TOKEN_URLを入れる")
API_BASE = os.environ.get("COTOHA_API_BASE", "API_BASEを入れる")

def  get_token():
    """Perform token authentication.

    Posts the client credentials as a JSON body to ``TOKEN_URL`` and returns
    the decoded JSON response.
    """
    payload = json.dumps({
        "grantType": "client_credentials",
        "clientId": CLIENT_ID,
        "clientSecret": CLIENT_SECRET
    })
    request_headers = {"Content-Type": "application/json", "charset": "UTF-8"}
    response = requests.post(TOKEN_URL, headers=request_headers, data=payload)
    return response.json()


def parse(text, token):
    """Run syntactic parsing.

    Posts ``text`` to the ``v1/parse`` endpoint under ``API_BASE`` using
    ``token`` as the bearer token. When the response's ``status`` field is not
    0, the status and the text are printed. The decoded JSON response is
    returned in either case.
    """
    request_headers = {
        "Content-Type": "application/json",
        "charset": "UTF-8",
        "Authorization": "Bearer {}".format(token)
    }
    payload = json.dumps({"sentence": text, "type": "default"})
    response = requests.post(API_BASE + "v1/parse", headers=request_headers, data=payload)
    # Decode the body once and reuse it for the status check and the return value
    body = response.json()
    status = body["status"]
    if status != 0:
        print(status, text)
    return body


def ne(text, token):
    """Run named entity extraction.

    Posts ``text`` to the ``v1/ne`` endpoint under ``API_BASE`` using ``token``
    as the bearer token. When the response's ``status`` field is not 0, the
    status and the text are printed. The decoded JSON response is returned in
    either case.
    """
    request_headers = {
        "Content-Type": "application/json",
        "charset": "UTF-8",
        "Authorization": "Bearer {}".format(token)
    }
    payload = json.dumps({"sentence": text, "type": "default", "dic_type": []})
    response = requests.post(API_BASE + "v1/ne", headers=request_headers, data=payload)
    # Decode the body once and reuse it for the status check and the return value
    body = response.json()
    status = body["status"]
    if status != 0:
        print(status, text)
    return body

TOKEN = get_token()["access_token"]
text = "週末映画まとめ読み】 『モテキ』初登場2位でトップ3を邦画が独占＜10月1日号＞"
# Pretty-print the raw response of the parse call, then of the ne call,
# for the sample title above.
for analyze in (parse, ne):
    print(json.dumps(analyze(text, TOKEN), indent=2, ensure_ascii=False))

In [None]:
# COTOHA APIの解析結果を用いる特徴抽出の準備

def get_tokens(result):
    """Return the ``tokens`` entries of every element of ``result`` as one flat list."""
    return [token for chunk in result for token in chunk["tokens"]]

def _cotoha_result_path(article_path, result_dir):
    """Map an article path under ``text`` to its JSON result file under ``result_dir``.

    Only the first occurrence of ``text`` and the file extension are rewritten,
    so a category or file name that happens to contain "text" or "txt" is left
    untouched.
    """
    stem, _ext = os.path.splitext(article_path.replace("text", result_dir, 1))
    return stem + ".json"


def make_datum_list_with_cotoha(df, add_lemma=False,
                                add_ne_form=False, ne_filter=None):
    """Build one Datum per article from its title and saved COTOHA results.

    For every path in ``df["data"]`` the third line of the file is added as the
    string feature ``"title"``. The parse result and the named-entity result
    for the same article are read from the ``parse_title`` and ``ne_title``
    directories (same relative path, ``.json`` extension) and optionally added
    as numeric features with value 1.0.

    Args:
        df: object whose ``"data"`` column yields article file paths.
        add_lemma: when true, add a ``lemma-<lemma>`` feature for every token
            of the parse result.
        add_ne_form: when true, add an ``ne-<form>`` feature for named entities.
        ne_filter: container of entity classes to keep; when empty or None,
            every entity is kept.

    Returns:
        The list of Datum objects, in the order of ``df["data"]``. Its length
        is also printed.
    """
    datum_list = []
    for d in df["data"]:
        dt = Datum()
        # Explicit encoding so reading does not depend on the locale default
        # (assumes the corpus and result files are UTF-8 -- TODO confirm).
        with open(d, encoding="utf-8") as f:
            lines = f.readlines()
        dt.add_string("title", lines[2].rstrip())  # add the title text to the Datum

        parse_file = _cotoha_result_path(d, "parse_title")
        ne_file = _cotoha_result_path(d, "ne_title")
        with open(parse_file, encoding="utf-8") as f, open(ne_file, encoding="utf-8") as ne:
            j = json.load(f)
            ne_j = json.load(ne)

        # Named entities (added before the lemmas)
        if add_ne_form:
            for r in ne_j["result"]:
                if not ne_filter or r["class"] in ne_filter:
                    dt.add_number("ne-{}".format(r["form"]), 1.0)

        # Lemma of every token in the parse result
        if add_lemma:
            for chunk in j["result"]:
                for t in chunk["tokens"]:
                    dt.add_number("lemma-{}".format(t["lemma"]), 1.0)
        datum_list.append(dt)
    print(len(datum_list))
    return datum_list

In [None]:
# Use lemma features only (no named-entity features)
# Root directory of the MeCab dictionaries. The default is the path this
# notebook was written with; set the JUBATUS_MECAB_DIC_ROOT environment
# variable to run it on a machine where the dictionaries live elsewhere.
MECAB_DIC_ROOT = os.environ.get("JUBATUS_MECAB_DIC_ROOT", "/home/TkrUdagawa/local/lib/mecab/dic")

# MeCab splitter plugin with the ipadic dictionary, ngram "1"
mecab_unigram = {
    "method": "dynamic",
    "path": "libmecab_splitter.so",
    "function": "create",
    "arg": "-d {}/ipadic".format(MECAB_DIC_ROOT),
    "ngram": "1",
    "base": "true",
    "include_features": "*",
    "exclude_features": "",
}
# Identical settings except for ngram "2" (a separate dict, not an alias)
mecab_bigram = dict(mecab_unigram, ngram="2")

config = {
    "converter": {
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        "string_types": {
            "mecab": mecab_unigram,
            "mecab-bi": mecab_bigram,
        },
        "string_rules": [
            {"key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "bin"},
            {"key": "*", "type": "mecab-bi", "sample_weight": "bin", "global_weight": "bin"},
        ],
        "num_types": {},
        # Numeric features (the lemma features added below) are used as they are
        "num_rules": [
            {"key": "*", "type": "num"},
        ],
    },
    "parameter": {
        "regularization_weight": 0.5,
    },
    "method": "CW",
}
cl = Classifier(config)
# Rebuild the module-level datum_list that do_cv reads, now with lemma features
datum_list = make_datum_list_with_cotoha(df, add_lemma=True)
do_cv(cl, 3)


In [None]:
# Add named entities as well (restricted to the four classes listed)
datum_list = make_datum_list_with_cotoha(
    df,
    add_lemma=True,
    add_ne_form=True,
    ne_filter={"ORG", "PSN", "LOC", "ART"},
)

# The existing cl is reused: Jubatus runs with CW:0.5 here
do_cv(cl, 3)

In [None]:
# Algorithm selection and parameter tuning
reg = [0.01, 0.1, 0.5, 1.0, 10.0]
algorithms = ["CW", "AROW"]
# Try every (algorithm, regularization weight) pair, algorithm by algorithm,
# building a fresh classifier from the updated config for each pair.
for alg, r in [(name, weight) for name in algorithms for weight in reg]:
    print(alg, r)
    config["method"] = alg
    config["parameter"]["regularization_weight"] = r
    cl = Classifier(config)
    do_cv(cl, 3)

In [None]:
# Hold-out validation
# Root directory of the MeCab dictionaries. The default is the path this
# notebook was written with; set the JUBATUS_MECAB_DIC_ROOT environment
# variable to run it on a machine where the dictionaries live elsewhere.
MECAB_DIC_ROOT = os.environ.get("JUBATUS_MECAB_DIC_ROOT", "/home/TkrUdagawa/local/lib/mecab/dic")

# MeCab splitter plugin with the ipadic dictionary, ngram "1"
mecab_unigram = {
    "method": "dynamic",
    "path": "libmecab_splitter.so",
    "function": "create",
    "arg": "-d {}/ipadic".format(MECAB_DIC_ROOT),
    "ngram": "1",
    "base": "true",
    "include_features": "*",
    "exclude_features": "",
}
# Identical settings except for ngram "2" (a separate dict, not an alias)
mecab_bigram = dict(mecab_unigram, ngram="2")

config = {
    "converter": {
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        "string_types": {
            "mecab": mecab_unigram,
            "mecab-bi": mecab_bigram,
        },
        "string_rules": [
            {"key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "bin"},
            {"key": "*", "type": "mecab-bi", "sample_weight": "bin", "global_weight": "bin"},
        ],
        "num_types": {},
        "num_rules": [
            {"key": "*", "type": "num"},
        ],
    },
    "parameter": {
        "regularization_weight": 0.5,
    },
    "method": "CW",
}
# A fresh classifier is created so the hold-out run starts from an empty model
cl = Classifier(config)
do_holdout(cl, 7)