<h1 align="center">中文資訊檢索（文件檢索）採用 TF-IDF 方法</h1>
<h3 align="center">Chinese Document Retrieval with TF-IDF</h3>
<hr>
<pre>
doc: 文件資料夾（純文字，utf-8）
qry: 查詢字串
</pre>

<hr>
<h3>詞法分析，工具函數定義（刪除非中文的所有文字與符號）</h3>

In [None]:
# 詞法分析，工具函數定義（刪除非中文的所有文字與符號）

import re

# Patterns are compiled once at import time rather than on every call.
# Raw strings are used so that regex escapes such as \s are handed to the
# regex engine instead of being parsed as (invalid) string escapes.
_ASCII_ALNUM_RE = re.compile(r'[a-zA-Z0-9]')
_PUNCT_RE = re.compile(r'''[’!"#$%&'()*+,\-./:;<=>?@，。?★、…【】《》？“”‘’！\[\]^_`{|}~\s]+''')
_CONTROL_RE = re.compile(r'[\x01-\x1a]+')
_NON_CJK_RE = re.compile('[^\u4e00-\u9fa5]')


def remove_non_chinese(line):
    """Replace every character outside U+4E00-U+9FA5 with spaces.

    The text goes through four substitutions, in this order:

    1. each ASCII letter or digit becomes one space;
    2. each run of listed punctuation (ASCII and some full-width marks)
       and whitespace becomes one space;
    3. each run of control characters U+0001-U+001A becomes one space;
    4. each remaining character outside U+4E00-U+9FA5 becomes one space.

    Steps 2 and 3 collapse runs, step 4 does not, so the result may still
    contain consecutive spaces.

    Args:
        line: The text to clean.

    Returns:
        The cleaned text, containing only characters in U+4E00-U+9FA5 and
        spaces.
    """
    line = _ASCII_ALNUM_RE.sub(' ', line)
    line = _PUNCT_RE.sub(' ', line)
    line = _CONTROL_RE.sub(' ', line)
    # Final catch-all: anything still outside the CJK range (e.g. full-width
    # symbols not listed above) is blanked out one character at a time.
    return _NON_CJK_RE.sub(' ', line)

def remove_redundant_space(line):
    """Collapse each run of consecutive spaces in *line* into a single space.

    Only the space character is affected; other whitespace is left as is.
    """
    return re.sub(' +', ' ', line)


<hr>
<h3>詞法分析，載入 jieba 工具模組</h3>

In [None]:
# 詞法分析，載入 jieba 模組

import jieba

# Load the general-purpose dictionary (only if needed).
# The path is relative, so the file is looked up from the working directory.
jieba.set_dictionary('dict.txt.big')
# Load the project-specific user dictionary (only if needed).
jieba.load_userdict('user.txt')


<hr>
<h3>過濾無意義詞彙（stopwords，虛字與其它無意義詞彙）</h3>

In [None]:
# 過濾無意義詞彙（stopwords，虛字與其它無意義詞彙）

# Tokens that carry no meaning for retrieval and are dropped after segmentation.
stopwords = [ '之', '乎', '者', '也', '的' ]

def remove_stopword(lst0):
    """Return a new list with the items of *lst0* that are not in ``stopwords``.

    The input is not modified and the original order is preserved.
    """
    return [token for token in lst0 if token not in stopwords]


<hr>
<h3>詞法分析，函數定義</h3>

In [None]:
# 詞法分析，函數定義

def lexical_analyzer(txt):
    """Turn raw text into a space-separated string of segmented tokens.

    Pipeline: strip non-Chinese characters, collapse repeated spaces,
    segment with jieba, drop stopwords, then join the remaining tokens with
    single spaces.

    Args:
        txt: Raw document or query text.

    Returns:
        One string with the tokens separated by single spaces.
    """
    cleaned = remove_redundant_space(remove_non_chinese(txt))
    tokens = remove_stopword(list(jieba.cut(cleaned)))
    # Joining can produce consecutive spaces again, so collapse them once more.
    return re.sub(' +', ' ', ' '.join(tokens))
    

<hr>
<h3>整理 Corpus 資料格式</h3>

In [None]:
# 整理 Corpus 資料格式

import glob
import pickle

# Collect every .txt file directly under doc/ (read as UTF-8 text).
files = glob.glob('doc/*.txt')

# Maps file path -> {'name': path, 'txt': raw text, 'doc': segmented text}.
corpus = dict()

for f in files:
    # The with-statement closes the file on exit, so no explicit close().
    with open(f, 'r', encoding='utf-8') as fp:
        txt = fp.read()
    corpus[f] = { 'name':f, 'txt':txt, 'doc':None }

# Segment each document and store the result next to the raw text.
for f in corpus:
    print(f)
    txt = corpus[f]['txt']
    doc = lexical_analyzer(txt)
    corpus[f]['doc'] = doc

# Persist the analysed corpus so it can be reloaded without re-segmenting.
with open('corpus_harry_potter.pkl', 'wb') as fp:
    pickle.dump(corpus, fp)


<hr>
<h3>TF-IDF 文件檢索</h3>

In [None]:
# TF-IDF 文件檢索

from sklearn.feature_extraction.text import TfidfVectorizer

# Query string to rank the corpus against.
qry = '分類咒語'

vectorizer = TfidfVectorizer()
# NOTE(review): per the scikit-learn docs the default token pattern keeps only
# tokens of two or more characters, so single-character Chinese words are
# ignored - confirm this is intended.

def cosine_sim(text1, text2):
    """Return the TF-IDF similarity between two space-separated token strings.

    The vectorizer is refitted on just these two texts, so the IDF weights
    reflect only this pair, not the whole corpus. With the vectorizer's
    default L2 row normalisation, the off-diagonal entry of the Gram matrix
    is the cosine similarity.

    Args:
        text1: First text, tokens separated by spaces.
        text2: Second text, tokens separated by spaces.

    Returns:
        The similarity score as a float.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # Explicit matrix product and dense conversion instead of `*` and `.A`.
    return (tfidf @ tfidf.T).toarray()[0, 1]

# The query does not change between documents, so analyse it only once.
text1 = lexical_analyzer(qry)

# Maps file path -> similarity score against the query.
d = dict()
for f in corpus:
    text2 = corpus[f]['doc']
    c = cosine_sim(text1, text2)
    d[f] = c

# Sort the dictionary items into a list, highest score first.
lst = sorted(d.items(), key=lambda x: x[1], reverse=True)

# Print the ranked results.
print('查詢：', qry)
for f, c in lst:
    t = '關連度：%12.10f，文章：%s' % (c, f)
    print(t)


<hr>
<h3 style="color:orange">『朱自清散文』語料的資料檢索平台建立（整合版）</h3>
<pre>
語料來源：
<a href="http://www.bwsk.net/mj/z/zhuziqing/index.html">http://www.bwsk.net/mj/z/zhuziqing/index.html</a>
</pre>

In [None]:
# 『朱自清散文』語料的資料檢索平台建立（整合版）

# 詞法分析，工具函數定義（刪除非中文的所有文字與符號）

import re

# Patterns are compiled once at import time rather than on every call.
# Raw strings are used so that regex escapes such as \s are handed to the
# regex engine instead of being parsed as (invalid) string escapes.
_ASCII_ALNUM_RE = re.compile(r'[a-zA-Z0-9]')
_PUNCT_RE = re.compile(r'''[’!"#$%&'()*+,\-./:;<=>?@，。?★、…【】《》？“”‘’！\[\]^_`{|}~\s]+''')
_CONTROL_RE = re.compile(r'[\x01-\x1a]+')
_NON_CJK_RE = re.compile('[^\u4e00-\u9fa5]')


def remove_non_chinese(line):
    """Replace every character outside U+4E00-U+9FA5 with spaces.

    The text goes through four substitutions, in this order:

    1. each ASCII letter or digit becomes one space;
    2. each run of listed punctuation (ASCII and some full-width marks)
       and whitespace becomes one space;
    3. each run of control characters U+0001-U+001A becomes one space;
    4. each remaining character outside U+4E00-U+9FA5 becomes one space.

    Steps 2 and 3 collapse runs, step 4 does not, so the result may still
    contain consecutive spaces.

    Args:
        line: The text to clean.

    Returns:
        The cleaned text, containing only characters in U+4E00-U+9FA5 and
        spaces.
    """
    line = _ASCII_ALNUM_RE.sub(' ', line)
    line = _PUNCT_RE.sub(' ', line)
    line = _CONTROL_RE.sub(' ', line)
    # Final catch-all: anything still outside the CJK range (e.g. full-width
    # symbols not listed above) is blanked out one character at a time.
    return _NON_CJK_RE.sub(' ', line)

def remove_redundant_space(line):
    """Collapse each run of consecutive spaces in *line* into a single space.

    Only the space character is affected; other whitespace is left as is.
    """
    return re.sub(' +', ' ', line)

# 詞法分析，載入 jieba 模組

import jieba

# Load the general-purpose dictionary (only if needed).
# The path is relative, so the file is looked up from the working directory.
jieba.set_dictionary('dict.txt.big')
# Load the project-specific user dictionary (only if needed).
jieba.load_userdict('user.txt')

# 無意義詞彙過濾（stopwords，虛字與其它無意義詞彙）

# Tokens that carry no meaning for retrieval and are dropped after segmentation.
stopwords = [ '之', '乎', '者', '也', '的' ]

def remove_stopword(lst0):
    """Return a new list with the items of *lst0* that are not in ``stopwords``.

    The input is not modified and the original order is preserved.
    """
    return [token for token in lst0 if token not in stopwords]

# 詞法分析，函數定義

def lexical_analyzer(txt):
    """Turn raw text into a space-separated string of segmented tokens.

    Pipeline: strip non-Chinese characters, collapse repeated spaces,
    segment with jieba, drop stopwords, then join the remaining tokens with
    single spaces.

    Args:
        txt: Raw document or query text.

    Returns:
        One string with the tokens separated by single spaces.
    """
    cleaned = remove_redundant_space(remove_non_chinese(txt))
    tokens = remove_stopword(list(jieba.cut(cleaned)))
    # Joining can produce consecutive spaces again, so collapse them once more.
    return re.sub(' +', ' ', ' '.join(tokens))

# 整理 Corpus 資料格式

import glob
import pickle

# Collect every .txt file directly under 朱自清散文/ (read as UTF-8 text).
files = glob.glob('朱自清散文/*.txt')

# Maps file path -> {'name': path, 'txt': raw text, 'doc': segmented text}.
corpus = dict()

for f in files:
    # The with-statement closes the file on exit, so no explicit close().
    with open(f, 'r', encoding='utf-8') as fp:
        txt = fp.read()
    corpus[f] = { 'name':f, 'txt':txt, 'doc':None }

# Segment each document and store the result next to the raw text.
for f in corpus:
    txt = corpus[f]['txt']
    doc = lexical_analyzer(txt)
    corpus[f]['doc'] = doc

# Save the analysed corpus so it can be reloaded without re-segmenting.
with open('corpus_朱自清散文.pkl', 'wb') as fp:
    pickle.dump(corpus, fp)


<hr>
<h3 style="color:orange">測試</h3>

In [None]:
# 測試

# Load the previously saved corpus.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
# The with-statement closes the file on exit, so no explicit close().
with open('corpus_朱自清散文.pkl', 'rb') as fp:
    corpus = pickle.load(fp)

# TF-IDF 文件檢索

from sklearn.feature_extraction.text import TfidfVectorizer

# Query string to rank the corpus against.
qry = '月台+橘子'

vectorizer = TfidfVectorizer()
# NOTE(review): per the scikit-learn docs the default token pattern keeps only
# tokens of two or more characters, so single-character Chinese words are
# ignored - confirm this is intended.

def cosine_sim(text1, text2):
    """Return the TF-IDF similarity between two space-separated token strings.

    The vectorizer is refitted on just these two texts, so the IDF weights
    reflect only this pair, not the whole corpus. With the vectorizer's
    default L2 row normalisation, the off-diagonal entry of the Gram matrix
    is the cosine similarity.

    Args:
        text1: First text, tokens separated by spaces.
        text2: Second text, tokens separated by spaces.

    Returns:
        The similarity score as a float.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # Explicit matrix product and dense conversion instead of `*` and `.A`.
    return (tfidf @ tfidf.T).toarray()[0, 1]

# The query does not change between documents, so analyse it only once.
text1 = lexical_analyzer(qry)

# Maps file path -> similarity score against the query.
d = dict()
for f in corpus:
    text2 = corpus[f]['doc']
    c = cosine_sim(text1, text2)
    d[f] = c

# Sort the dictionary items into a list, highest score first.
lst = sorted(d.items(), key=lambda x: x[1], reverse=True)

# Print the ranked results.
print('查詢：', qry)
for f, c in lst:
    t = '關連度：%12.10f，文章：%s' % (c, f)
    print(t)
