<a href="https://colab.research.google.com/github/Re14m/training/blob/master/2022-0316_recipie110.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [文書の「あいまい検索」機能をつくるレシピ](https://axross-recipe.com/recipes/110)

ローカル環境で実施しました。

In [None]:
# パッケージのインストール
!pip install numpy
!pip install gensim
!pip install mecab-python3
!pip install neologdn
!pip install tqdm

# Mecab用の辞書のインストール
!pip install unidic-lite

In [None]:
# Download the dataset beforehand (https://www.rondhuit.com/download/ldcc-20140209.tar.gz)
# Extract the dataset archive into /data/
from shutil import unpack_archive
unpack_archive(filename="/ldcc-20140209.tar.gz", extract_dir="/data/", format="gztar")

In [None]:
# Load the dataset: collect every *.txt file found directly inside each
# sub-directory of the extracted text folder.
from pathlib import Path
doc_dir = Path("/data/text/")
# iterdir()/glob() yield entries in arbitrary order, so sort the result:
# positional indices such as doc_paths[0] then refer to the same article
# on every run and on every machine.
doc_paths = sorted(
    txt_path
    for d in doc_dir.iterdir()
    if d.is_dir()
    for txt_path in d.glob("*.txt")
)
print("ニュース記事数:", len(doc_paths))
print(doc_paths[:10])

In [None]:
# Read the text of a news article from its file path
def read_doc(path):
    """Return the full contents of the file at ``path``, decoded as UTF-8."""
    with open(path, encoding="utf-8") as article_file:
        return article_file.read()

In [None]:
# Output: print the text of the first article in doc_paths
print(read_doc(doc_paths[0]))

In [None]:
# a. Smoke test of MeCab
import MeCab
# Tagger created with the "-Ochasen" option; later cells split each output
# line on tab characters.
mecab = MeCab.Tagger('-Ochasen')
data = mecab.parse('庭には２羽裏庭には２羽鶏がいる')
print(data)

In [None]:
# b. Word segmentation restricted to verbs, nouns and adjectives
test = "庭には２羽裏庭には２羽鶏がいる"

# One list of tab-separated fields per line of the MeCab output
l = [line.split("\t") for line in mecab.parse(test).split("\n")]
res = []
for w in l:
    if len(w) >=4: # only handle lines with at least 4 fields (e.g. not EOS)
        pos = w[3]
        base = w[2]
        # Top-level category: the part of the POS field before the first "-"
        group_pos = pos.split("-")[0]
        if group_pos in ["動詞","名詞","形容詞"]:
            res.append(base)
print(res)

In [None]:
# c. Download and show the stop-word list (used for noise removal)
from urllib import request
# Use the response as a context manager so the HTTP connection is closed
# as soon as the list has been read.
with request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt") as res:
    # One stop word per non-blank line, decoded from UTF-8
    stopwords = [line.decode("utf-8").strip() for line in res if len(line.strip()) > 0]
print("ストップワード数:", len(stopwords))
print(stopwords)

In [None]:
# d. Apply the stop-word list
test = "庭には２羽裏庭には２羽鶏がいる"

# One list of tab-separated fields per line of the MeCab output
l = [line.split("\t") for line in mecab.parse(test).split("\n")]
res = []
for w in l:
    if len(w) >=4: # only handle lines with at least 4 fields (e.g. not EOS)
        pos = w[3]
        # Top-level category: the part of the POS field before the first "-"
        group_pos = pos.split("-")[0]
        base = w[2]
        # Keep the word only if its category is wanted and it is not a stop word
        if group_pos in ["動詞","名詞","形容詞"] and base not in stopwords: 
            res.append(base)
print(res)

In [None]:
# Wrap steps a-d in a class so they can be reused on any text
class Tokenizer:
    """Tokenizer that keeps selected parts of speech and drops stop words.

    Text is parsed by a MeCab tagger created with the "-Ochasen" option.
    Each output line is split on tabs; the third field is used as the word
    to emit and the fourth field as its part-of-speech tag.

    Attributes:
        parser: Callable taking a text and returning the tagger output string.
        stopwords: Collection of words to drop from the result.
        include_pos: Collection of top-level part-of-speech names to keep.
    """

    def __init__(self, stopwords=None, include_pos=None):
        """Create the tagger and store the filter settings.

        Args:
            stopwords: Words to exclude. ``None`` means no stop words.
            include_pos: Top-level parts of speech to keep. ``None`` means
                nouns, verbs and adjectives.
        """
        tagger_cmd = "-Ochasen"
        mecab = MeCab.Tagger(tagger_cmd)
        self.parser = mecab.parse
        self.stopwords = [] if stopwords is None else stopwords
        self.include_pos = ["名詞", "動詞", "形容詞"] if include_pos is None else include_pos

    def tokenize(self, text):
        """Return the filtered list of words found in ``text``.

        Args:
            text: The string to tokenize.

        Returns:
            A list with the third field of every output line whose
            part-of-speech category is in ``include_pos`` and whose word is
            not in ``stopwords``, in order of appearance.
        """
        # Build the lookup set once per call: membership tests become O(1)
        # instead of scanning the whole stop-word list for every token, while
        # later changes to self.stopwords are still honoured.
        stopword_set = set(self.stopwords)
        tokens = []
        for line in self.parser(text).split("\n"):
            fields = line.split("\t")
            if len(fields) < 4:
                # Not a regular token line (e.g. "EOS" or an empty line)
                continue
            base = fields[2]
            # Top-level category: the part of the POS field before the first "-"
            group_pos = fields[3].split("-")[0]
            if group_pos in self.include_pos and base not in stopword_set:
                tokens.append(base)
        return tokens

In [None]:
# Try out the Tokenizer class
# Use the response as a context manager so the HTTP connection is closed
# as soon as the stop-word list has been read.
with request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt") as res:
    # One stop word per non-blank line, decoded from UTF-8
    stopwords = [line.decode("utf-8").strip() for line in res if len(line.strip()) > 0]

include_pos = ["名詞", "動詞", "形容詞"]

tokenizer = Tokenizer(stopwords=stopwords, include_pos=include_pos)

words = tokenizer.tokenize("庭には２羽裏庭には２羽鶏がいる")
print(words)

In [None]:
# Run the tokenizer on the first dataset article (the text is not normalized yet)
import pprint
words = tokenizer.tokenize(read_doc(doc_paths[0]))
pprint.pprint(words, compact=True)

In [None]:
# Normalization demo with neologdn
import neologdn
normalized_word = neologdn.normalize("元気～～～？？？") # input with repeated prolonged-sound marks
print(normalized_word)

In [None]:
# Helper function that normalizes raw article text
import re
def normalize(text):
    """Strip URLs, timestamps, digits and punctuation, then apply neologdn.

    The substitutions run in the listed order on the whole text; the result
    is finally passed through ``neologdn.normalize`` and returned.
    """
    substitutions = (
        # web URLs
        (r"http(s)?:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", " "),
        # timestamps with a +0900 offset (removed without leaving a space)
        (r"(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\+0900", ""),
        # names ending in .jp, optionally wrapped in double quotes
        (r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?", " "),
        # digits
        (r'\d+', " "),
        # punctuation marks, brackets and newlines
        (r"[\(\),.:=\?？！\+><\[\]\|\"\';\n【】『』\-\!、。“■（）]", " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return neologdn.normalize(cleaned)

In [None]:
# Normalize the text of the first article and print the result
normalized_word =normalize(read_doc(doc_paths[0]))
print(normalized_word)

In [None]:
# Normalize first, then run the tokenizer on the normalized text
words = tokenizer.tokenize(normalize(read_doc(doc_paths[0])))
pprint.pprint(words, compact=True)

In [None]:
# Vectorization (fastText)
# Create the model (untrained at this point; vector_size is set to 300)
from gensim.models.fasttext import FastText
model = FastText(vector_size=300) 

In [None]:
# Extract only the needed words from every article in the dataset:
# one list of tokens per article, in the same order as doc_paths.
# tqdm.auto picks the notebook widget when it is available and falls back to
# a plain text progress bar otherwise.
from tqdm.auto import tqdm
sentences = [tokenizer.tokenize(normalize(read_doc(p))) for p in tqdm(doc_paths)]

In [None]:
# Train fastText: build the vocabulary from the tokenized articles,
# then train for 30 epochs
model.build_vocab(sentences)
model.train(sentences, total_examples=len(sentences), epochs=30)

In [None]:
# Look at the vector of a single word
vector = model.wv["猫"] 
print("次元:", vector.shape)
pprint.pprint(vector, compact=True)

In [None]:
# Build a single vector from a tokenized sentence
import numpy as np
def to_vec(sentence):
    """Return the element-wise mean of the word vectors of ``sentence``.

    Args:
        sentence: Iterable of words; each one is looked up in ``model.wv``.

    Returns:
        A 1-D array with the mean of the word vectors. When ``sentence``
        contains no words, a zero vector of length ``model.wv.vector_size``
        is returned, so every result has the same shape (``np.mean`` of an
        empty list would yield a scalar NaN instead).
    """
    word_vectors = [model.wv[w] for w in sentence]
    if not word_vectors:
        return np.zeros(model.wv.vector_size)
    return np.mean(word_vectors, axis=0)

In [None]:
# Vector of the first tokenized article
doc_vec = to_vec(sentences[0])
pprint.pprint(doc_vec, compact=True)

In [None]:
# Building the search feature
# Vector similarity = cosine similarity
def cos_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned in that case instead of the
        NaN a division by zero would produce.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

In [None]:
v1 = [1,0] # → (reference vector)

v2 = [-1,0] # ← (opposite direction to v1)
v3 = [0, 1] # ↑ (perpendicular to v1)
v4 = [0.8,0.2] # direction close to v1
print("v1とv2の類似度:", cos_sim(v1,v2))
print("v1とv3の類似度:", cos_sim(v1,v3))
print("v1とv4の類似度:", cos_sim(v1,v4))

In [None]:
# Vectorize every tokenized article; each entry is (index into sentences, vector)
vecs_with_idx = [(idx, to_vec(s)) for idx, s in enumerate(sentences)]

In [None]:
# Function that returns the articles most similar to a target vector
def get_similaries(target_vec, vecs_with_idx, topn=10):
    """Rank ``vecs_with_idx`` by cosine similarity to ``target_vec``.

    Args:
        target_vec: Query vector.
        vecs_with_idx: Iterable of ``(index, vector)`` pairs.
        topn: Number of leading entries to return.

    Returns:
        A list of ``(index, similarity)`` pairs, highest similarity first,
        truncated to the first ``topn`` entries.
    """
    scored = []
    for doc_idx, doc_vec in vecs_with_idx:
        scored.append((doc_idx, cos_sim(target_vec, doc_vec)))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:topn]

In [None]:
# Check that the search works: query with a single word
search_words = ["ビジネスマン"]
target_vec = to_vec(search_words) 
res = get_similaries(target_vec, vecs_with_idx, topn=10)
print(res)

In [None]:
# Show the result document (1st hit)
best_index = res[0][0]
print("ファイルパス:", doc_paths[best_index])
print(read_doc(doc_paths[best_index]))

In [None]:
# Show the result document (2nd hit)
secondary_index = res[1][0]
print("ファイルパス:", doc_paths[secondary_index])
print(read_doc(doc_paths[secondary_index]))

In [None]:
# <Application> Find articles similar to a given article
target_doc_index = 1
print("ファイルパス:", doc_paths[target_doc_index])
print(read_doc(doc_paths[target_doc_index])) # show the target article
target_vec = vecs_with_idx[target_doc_index][1] # vector of the target article
res = get_similaries(target_vec, vecs_with_idx, topn=10)
print(res)

In [None]:
# Show the result document (1st hit)
best_index = res[1][0] # per the original note, res[0][0] is the query article itself, so take res[1]
print("ファイルパス:", doc_paths[best_index])
print(read_doc(doc_paths[best_index]))

おまけのため、今回は環境を作らずに手順のみ記載

In [None]:
# Bonus (ways to improve search accuracy) 1

# Download the dictionary from https://github.com/neologd/mecab-ipadic-neologd
# Use the NEologd dictionary
text = "猫ひろし"
# Bind the tagger to a name so that it can be used for parsing below
tagger = MeCab.Tagger('-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
result = tagger.parse(text)
print(result)

In [None]:
# For comparison: parse the same text with the "-Ochasen" option only
# (no -d dictionary override)
tagger_cmd = "-Ochasen"
print(MeCab.Tagger(tagger_cmd).parse("猫ひろし"))

In [None]:
# Bonus (ways to improve search accuracy) 2
# Download the model from https://fasttext.cc/docs/en/crawl-vectors.html
# Use a pretrained fastText model and continue training on our articles
from gensim.models.fasttext import load_facebook_model
model = load_facebook_model("cc.ja.300.bin.gz")
# Pass the corpus positionally, as in the first training cell: gensim 4
# renamed the `sentences` keyword of build_vocab()/train() to
# `corpus_iterable`, so the keyword form fails there.
model.build_vocab(sentences, update=True)
model.train(sentences, total_examples=len(sentences), epochs=30)