# tokenについてMeCabとspaCy/GiNZAの比較

In [2]:
from collections import Counter
import os
import re
from typing import List, Tuple, Dict, Union
import unicodedata as ud

import fire
import pandas as pd
import numpy as np
# Alias for a value that is either an int or a float.
Num = Union[int, float]
# Part-of-speech labels treated as stop categories: the measure_* functions
# drop a content word when the second field of its analysis is in this list.
STOPPOS_JP = ["形容動詞語幹", "副詞可能", "代名詞", "ナイ形容詞語幹", "特殊", "数", "接尾", "非自立"]

In [28]:
# Directory that holds the lexical resource files read below. Set the
# LIMCO_DATA_DIR environment variable to point somewhere else; the default is
# the location this notebook was originally run from.
DATA_DIR = os.path.expanduser(
    os.environ.get(
        "LIMCO_DATA_DIR",
        "/Users/itaru-ot/Documents/ra_provisional/provisional_limco",
    )
)

# Stopword surface forms, one per line. The encoding is stated explicitly so
# the Japanese text is decoded the same way regardless of the platform default.
with open(os.path.join(DATA_DIR, "stopwords_jp.txt"), "r", encoding="utf-8") as f:
    STOPWORDS_JP = [line.strip() for line in f]
# Same value as the definition in the import cell; repeated here so this cell
# defines everything the stop-word filtering needs.
STOPPOS_JP = ["形容動詞語幹", "副詞可能", "代名詞", "ナイ形容詞語幹", "特殊", "数", "接尾", "非自立"]

# Tab-separated file with exactly four columns per line; only the first two
# (word, score) are kept. Scores stay as the strings read from the file.
with open(os.path.join(DATA_DIR, "AWD-J_EX.txt"), "r", encoding="utf-8") as f:
    rows = [line.strip().split("\t") for line in f]
    AWD = {word: score for word, score, _, _ in rows}
# The CSV's second column becomes the index and the "Unnamed: 0" column is dropped.
DF_jiwc = pd.read_csv(os.path.join(DATA_DIR, "2017-11-JIWC.csv"), index_col=1).drop(
    columns="Unnamed: 0"
)

In [3]:
from natto import MeCab
# Tagger built with no options, i.e. whatever dictionary the local MeCab
# installation is configured to use by default.
NM = MeCab()  # NOTE: assume IPADIC
# Second tagger pointed at a mecab-ipadic-neologd dictionary through an
# absolute, machine-specific path.
NMN = MeCab("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

In [89]:
import spacy
import ginza
# spaCy pipeline loaded from the 'ja_ginza' model; the *_ginza functions call
# it on the input text and read token attributes through the ginza helpers.
NLP = spacy.load('ja_ginza')

In [6]:
# Sample input (two short sentences) passed to every tokenizer/measure call below.
text = '吾輩は猫である。名前はまだ無い。'

In [7]:
def tokens(text: str) -> List[Tuple[str, List[str]]]:
    """Tokenize ``text`` with the module-level MeCab tagger ``NM``.

    Args:
        text: Raw text to analyze.

    Returns:
        A list with one ``(surface, features)`` tuple per parsed node, in
        parse order. ``features`` is the node's feature string split on
        ``","``. The end-of-sentence node is left out.
    """
    # The result is returned directly rather than bound to a local called
    # ``tokens``, which would shadow this function inside its own body.
    return [
        (node.surface, node.feature.split(","))
        for node in NM.parse(text, as_nodes=True)
        if not node.is_eos()
    ]

In [8]:
# MeCab analysis of the sample text: one (surface, feature fields) row per token.
tokens(text)

[('吾輩', ['名詞', '代名詞', '一般', '*', '*', '*', '吾輩', 'ワガハイ', 'ワガハイ']),
 ('は', ['助詞', '係助詞', '*', '*', '*', '*', 'は', 'ハ', 'ワ']),
 ('猫', ['名詞', '一般', '*', '*', '*', '*', '猫', 'ネコ', 'ネコ']),
 ('で', ['助動詞', '*', '*', '*', '特殊・ダ', '連用形', 'だ', 'デ', 'デ']),
 ('ある', ['助動詞', '*', '*', '*', '五段・ラ行アル', '基本形', 'ある', 'アル', 'アル']),
 ('。', ['記号', '句点', '*', '*', '*', '*', '。', '。', '。']),
 ('名前', ['名詞', '一般', '*', '*', '*', '*', '名前', 'ナマエ', 'ナマエ']),
 ('は', ['助詞', '係助詞', '*', '*', '*', '*', 'は', 'ハ', 'ワ']),
 ('まだ', ['副詞', '助詞類接続', '*', '*', '*', '*', 'まだ', 'マダ', 'マダ']),
 ('無い', ['形容詞', '自立', '*', '*', '形容詞・アウオ段', '基本形', '無い', 'ナイ', 'ナイ']),
 ('。', ['記号', '句点', '*', '*', '*', '*', '。', '。', '。'])]

In [142]:
def tokens_ginza(text: str) -> List[Tuple[str, List[str]]]:
    """Tokenize ``text`` with the GiNZA pipeline ``NLP``.

    Each token's analysis is flattened into a single list built from, in
    order: ``token.tag_``, ``ginza.inflection(token)``, ``token.lemma_`` and
    ``ginza.reading_form(token)``. The tag, inflection and reading strings are
    split on ``","`` and ``"-"``, so the list length varies from token to
    token.

    Args:
        text: Raw text to analyze.

    Returns:
        A list of ``(surface, analysis)`` tuples in document order, where
        ``surface`` is ``token.orth_``.
    """
    # One compiled splitter instead of repeating the pattern three times.
    split_fields = re.compile('[,-]').split

    # Drop every occurrence of the literal sequence "一", two newlines and a
    # full-width space before parsing.
    # NOTE(review): presumably a chapter heading in the source corpus - confirm.
    text = text.replace('一\n\n　', '')
    doc = NLP(text)

    analyzed = []
    for sent in doc.sents:
        for token in sent:
            analysis = (
                split_fields(token.tag_)
                + split_fields(ginza.inflection(token))
                + [token.lemma_]
                + split_fields(ginza.reading_form(token))
            )
            analyzed.append((token.orth_, analysis))
    return analyzed

In [143]:
# GiNZA analysis of the same text; the analysis lists differ in length between tokens.
tokens_ginza(text)

[('吾輩', ['代名詞', '', '吾輩', 'ワガハイ']),
 ('は', ['助詞', '係助詞', '', 'は', 'ハ']),
 ('猫', ['名詞', '普通名詞', '一般', '', '猫', 'ネコ']),
 ('で', ['助動詞', '助動詞', 'ダ', '連用形', '一般', 'だ', 'デ']),
 ('ある', ['動詞', '非自立可能', '五段', 'ラ行', '終止形', '一般', 'ある', 'アル']),
 ('。', ['補助記号', '句点', '', '。', '。']),
 ('名前', ['名詞', '普通名詞', '一般', '', '名前', 'ナマエ']),
 ('は', ['助詞', '係助詞', '', 'は', 'ハ']),
 ('まだ', ['副詞', '', 'まだ', 'マダ']),
 ('無い', ['形容詞', '非自立可能', '形容詞', '終止形', '一般', '無い', 'ナイ']),
 ('。', ['補助記号', '句点', '', '。', '。'])]

In [75]:
def measure_pos(text: str) -> tuple:
    """Compute part-of-speech based measures for ``text`` using MeCab (``NM``).

    Tokens are ``(surface, features)`` tuples, where ``features`` is the MeCab
    feature string split on ``","``. ``features[0]`` is compared against
    top-level POS labels and ``features[1]`` against subcategory labels.

    Args:
        text: Raw text to analyze.

    Returns:
        An 11-tuple, in this order:

        0. ``verbs``: tokens whose first feature is "動詞".
        1. ``nouns``: tokens whose first feature is "名詞".
        2. ``adjcs``: tokens whose first feature is "形容詞".
        3. ``content_words``: ``verbs + nouns + adjcs``.
        4. ``cwr_simple``: ``len(content_words) / len(tokens)``.
        5. ``cwr_advance``: same ratio, counting only content words whose
           second feature is not in ``STOPPOS_JP`` and whose surface is not
           in ``STOPWORDS_JP``.
        6. ``advbs``: tokens whose first feature is "副詞".
        7. ``mvr``: ``len(adjcs + advbs + padjs) / len(verbs)``.
        8. ``padjs``: tokens whose first feature is "連体詞".
        9. ``nerr``: share of tokens whose second feature is "固有名詞".
        10. ``ttrs``: always ``0`` for now (placeholder).

        A ratio with a zero denominator (no tokens, or no verbs for ``mvr``)
        is ``inf`` when the numerator is positive and ``nan`` when it is zero.
    """
    parsed = [
        (node.surface, node.feature.split(","))
        for node in NM.parse(text, as_nodes=True)
        if not node.is_eos()
    ]

    # VERB RELATED MEASURES
    verbs = [token for token in parsed if token[1][0] == "動詞"]
    # TODO: also count verb + auxiliary-verb (助動詞) collocations?

    # CONTENT WORDS
    nouns = [token for token in parsed if token[1][0] == "名詞"]
    adjcs = [token for token in parsed if token[1][0] == "形容詞"]
    content_words = verbs + nouns + adjcs
    kept_content_words = [
        token
        for token in content_words
        if (token[1][1] not in STOPPOS_JP) and (token[0] not in STOPWORDS_JP)
    ]

    # Modifying words (for MVR) and proper nouns (for NER ratio)
    advbs = [token for token in parsed if token[1][0] == "副詞"]
    padjs = [token for token in parsed if token[1][0] == "連体詞"]
    ners = [token for token in parsed if token[1][1] == "固有名詞"]

    # Zero denominators are an expected input condition (e.g. a text with no
    # verbs). The resulting inf/nan values are kept, but NumPy's
    # RuntimeWarning is silenced so it does not clutter the notebook output.
    with np.errstate(divide="ignore", invalid="ignore"):
        cwr_simple = np.divide(len(content_words), len(parsed))
        cwr_advance = np.divide(len(kept_content_words), len(parsed))
        mvr = np.divide(len(adjcs + advbs + padjs), len(verbs))
        nerr = np.divide(len(ners), len(parsed))

    # TTR is not computed yet; the original left a disabled calc_ttrs(tokens)
    # call here as the intended implementation.
    ttrs = 0

    return verbs, nouns, adjcs, content_words, cwr_simple, cwr_advance, advbs, mvr, padjs, nerr, ttrs

In [144]:
def measure_pos_ginza(text: str) -> tuple:
    """Compute part-of-speech based measures for ``text`` using GiNZA (``NLP``).

    Tokens are ``(surface, analysis)`` tuples. ``analysis`` concatenates, in
    order, ``token.tag_``, ``ginza.inflection(token)``, ``token.lemma_`` and
    ``ginza.reading_form(token)``, with the tag, inflection and reading
    strings split on ``","`` and ``"-"``.

    Args:
        text: Raw text to analyze.

    Returns:
        An 11-tuple, in this order:

        0. ``verbs``: tokens whose first analysis field is "動詞".
        1. ``nouns``: tokens whose first analysis field ends with "名詞"
           (so "代名詞" counts as a noun).
        2. ``adjcs``: tokens whose first analysis field is "形容詞".
        3. ``content_words``: ``verbs + nouns + adjcs``.
        4. ``cwr_simple``: ``len(content_words) / len(tokens)``.
        5. ``cwr_advance``: same ratio, counting only content words whose
           second analysis field is not in ``STOPPOS_JP`` and whose surface
           is not in ``STOPWORDS_JP``.
        6. ``advbs``: tokens whose first analysis field is "副詞".
        7. ``mvr``: ``len(adjcs + advbs + padjs) / len(verbs)``.
        8. ``padjs``: tokens whose first analysis field is "連体詞".
        9. ``nerr``: share of tokens whose second analysis field is
           "固有名詞".
        10. ``ttrs``: always ``0`` for now (placeholder).

        A ratio with a zero denominator (no tokens, or no verbs for ``mvr``)
        is ``inf`` when the numerator is positive and ``nan`` when it is zero.
    """
    split_fields = re.compile('[,-]').split

    # Drop every occurrence of the literal sequence "一", two newlines and a
    # full-width space before parsing.
    # NOTE(review): presumably a chapter heading in the source corpus - confirm.
    doc = NLP(text.replace('一\n\n　', ''))
    analyzed = []
    for sent in doc.sents:
        for token in sent:
            analysis = (
                split_fields(token.tag_)
                + split_fields(ginza.inflection(token))
                + [token.lemma_]
                + split_fields(ginza.reading_form(token))
            )
            analyzed.append((token.orth_, analysis))

    # VERB RELATED MEASURES
    verbs = [entry for entry in analyzed if entry[1][0] == "動詞"]

    # CONTENT WORDS
    nouns = [entry for entry in analyzed if entry[1][0].endswith("名詞")]
    adjcs = [entry for entry in analyzed if entry[1][0] == "形容詞"]
    content_words = verbs + nouns + adjcs
    # Every analysis has at least four fields, so index 1 always exists.
    # NOTE(review): index 1 is a POS subcategory only when the tag has two or
    # more components; for a single-component tag it is the first inflection
    # field. STOPPOS_JP also appears to hold MeCab-style labels, so this
    # filter may match fewer tokens than in measure_pos - confirm intent.
    kept_content_words = [
        entry
        for entry in content_words
        if (entry[1][1] not in STOPPOS_JP) and (entry[0] not in STOPWORDS_JP)
    ]

    # Modifying words (for MVR) and proper nouns (for NER ratio)
    advbs = [entry for entry in analyzed if entry[1][0] == "副詞"]
    padjs = [entry for entry in analyzed if entry[1][0] == "連体詞"]
    ners = [entry for entry in analyzed if entry[1][1] == "固有名詞"]

    # Zero denominators are an expected input condition (e.g. a text with no
    # verbs). The resulting inf/nan values are kept, but NumPy's
    # RuntimeWarning is silenced so it does not clutter the notebook output.
    with np.errstate(divide="ignore", invalid="ignore"):
        cwr_simple = np.divide(len(content_words), len(analyzed))
        cwr_advance = np.divide(len(kept_content_words), len(analyzed))
        mvr = np.divide(len(adjcs + advbs + padjs), len(verbs))
        nerr = np.divide(len(ners), len(analyzed))

    # TTR is not computed yet; the original left a disabled calc_ttrs(tokens)
    # call here as the intended implementation.
    ttrs = 0

    return verbs, nouns, adjcs, content_words, cwr_simple, cwr_advance, advbs, mvr, padjs, nerr, ttrs

In [76]:
# MeCab-based measures. MeCab tags 'ある' as 助動詞 in the token listing above,
# so no 動詞 is found and MVR (a ratio over the verb count) comes out as inf.
measure_pos(text)

  mvr = np.divide(len(adjcs + advbs + padjs), len(verbs))


([],
 [('吾輩', ['名詞', '代名詞', '一般', '*', '*', '*', '吾輩', 'ワガハイ', 'ワガハイ']),
  ('猫', ['名詞', '一般', '*', '*', '*', '*', '猫', 'ネコ', 'ネコ']),
  ('名前', ['名詞', '一般', '*', '*', '*', '*', '名前', 'ナマエ', 'ナマエ'])],
 [('無い', ['形容詞', '自立', '*', '*', '形容詞・アウオ段', '基本形', '無い', 'ナイ', 'ナイ'])],
 [('吾輩', ['名詞', '代名詞', '一般', '*', '*', '*', '吾輩', 'ワガハイ', 'ワガハイ']),
  ('猫', ['名詞', '一般', '*', '*', '*', '*', '猫', 'ネコ', 'ネコ']),
  ('名前', ['名詞', '一般', '*', '*', '*', '*', '名前', 'ナマエ', 'ナマエ']),
  ('無い', ['形容詞', '自立', '*', '*', '形容詞・アウオ段', '基本形', '無い', 'ナイ', 'ナイ'])],
 0.36363636363636365,
 0.18181818181818182,
 [('まだ', ['副詞', '助詞類接続', '*', '*', '*', '*', 'まだ', 'マダ', 'マダ'])],
 inf,
 [],
 0.0,
 0)

In [145]:
# GiNZA-based measures for the same text. GiNZA tags 'ある' as 動詞, so the
# verb list is non-empty and MVR is finite here.
measure_pos_ginza(text)

([('ある', ['動詞', '非自立可能', '五段', 'ラ行', '終止形', '一般', 'ある', 'アル'])],
 [('吾輩', ['代名詞', '', '吾輩', 'ワガハイ']),
  ('猫', ['名詞', '普通名詞', '一般', '', '猫', 'ネコ']),
  ('名前', ['名詞', '普通名詞', '一般', '', '名前', 'ナマエ'])],
 [('無い', ['形容詞', '非自立可能', '形容詞', '終止形', '一般', '無い', 'ナイ'])],
 [('ある', ['動詞', '非自立可能', '五段', 'ラ行', '終止形', '一般', 'ある', 'アル']),
  ('吾輩', ['代名詞', '', '吾輩', 'ワガハイ']),
  ('猫', ['名詞', '普通名詞', '一般', '', '猫', 'ネコ']),
  ('名前', ['名詞', '普通名詞', '一般', '', '名前', 'ナマエ']),
  ('無い', ['形容詞', '非自立可能', '形容詞', '終止形', '一般', '無い', 'ナイ'])],
 0.45454545454545453,
 0.2727272727272727,
 [('まだ', ['副詞', '', 'まだ', 'マダ'])],
 2.0,
 [],
 0.0,
 0)