# 自製智能中文選字系統  (2)

## 資料前處理

In [1]:
import re

In [2]:
def prepocess_line(line):
    """Extract every maximal run of Chinese characters from ``line``.

    Characters in U+4E00-U+9FFF and U+3400-U+4DBF are kept; anything else
    (Latin letters, digits, punctuation, whitespace) acts as a separator.

    Returns:
        list of str: the runs, in the order they appear in ``line``.
    """
    pattern = r'[\u4E00-\u9FFF\u3400-\u4DBF]+'
    # The pattern has no capture groups, so group(0) is the whole run.
    return [match.group(0) for match in re.finditer(pattern, line)]

In [3]:
# Read the training corpus and collect the Chinese-character runs of every line.
segments = []

with open('./wiki_zh_small.txt', encoding='utf-8') as fr:
    # Iterating the file object yields the same lines as readlines().
    for line in fr:
        segments.extend(prepocess_line(line))

## 斷詞

In [4]:
import jieba
# Set jieba's logger to the INFO level.
jieba.setLogLevel(jieba.logging.INFO)
# Point jieba at the dictionary file './dict.txt.big'.
jieba.set_dictionary('./dict.txt.big')

import warnings # ignore warnings
warnings.filterwarnings('ignore')

In [5]:
# Show how one sample segment is tokenised in jieba's search-engine mode.
jieba.lcut_for_search(segments[6001])

['所以', '僅', '用於', '還原', '一些', '貴重', '的', '化合', '化合物']

In [6]:
# Tokenise every segment with jieba's lcut_for_search and flatten the
# per-segment token lists into a single list.
cut_segments = [
    token
    for seg in segments
    for token in jieba.lcut_for_search(seg)
]

In [7]:
# Number of tokens produced by the word-segmentation step.
print(len(cut_segments))

238327


## 使用斷詞的結果來建立 N-gram

In [8]:
from collections import Counter

class Counters:
    """Frequency tables for n-grams of orders 0 through ``n``.

    ``self.counters[k]`` is a ``Counter`` of every length-``k`` slice seen by
    ``fit``.  Order 0 only ever holds the key ``''``, whose count is the total
    number of elements over all fitted segments.
    """

    def __init__(self, n):
        # Highest n-gram order that is counted.
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]

    def fit(self, segments):
        """Add the n-gram counts of every item in ``segments`` to the tables."""
        for piece in segments:
            size = len(piece)
            # Guarded so that an empty piece does not create the '' key.
            if size:
                self.counters[0][''] += size
            for order in range(1, self.n + 1):
                self.counters[order].update(self._skip(piece, order))

    def __getitem__(self, k):
        """Return the ``Counter`` holding the order-``k`` counts."""
        return self.counters[k]

    def _skip(self, segment, n):
        """Lazily yield each length-``n`` window of ``segment``, left to right.

        Yields nothing when ``segment`` is shorter than ``n``.  Because this is
        a generator, the ``n > 0`` assertion fires on first iteration.
        """
        assert n > 0
        last_start = len(segment) - n
        start = 0
        while start <= last_start:
            yield segment[start:start + n]
            start += 1

In [9]:
# Count the n-grams of orders 1 through 5 inside every element of cut_segments.
counters = Counters(n=5)
counters.fit(cut_segments)

In [10]:
class Ngram:
    """Order-``n`` model that scores the next character after a prefix.

    The estimate for a candidate is ``count(context + char) / count(context)``,
    where ``context`` is the last ``n - 1`` characters of the prefix.
    """

    def __init__(self, n, counters):
        assert n <= counters.n
        self.n = n
        # Counts of the n-grams themselves (numerators).
        self.major_counter = counters[n]
        # Counts of the (n-1)-gram contexts (denominators).
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        """Return ``(probability, character)`` pairs, most probable first.

        Args:
            prefix: text typed so far; must hold at least ``n - 1`` characters.
            top_k: number of pairs to return; a value <= 0 returns all of them.
        """
        assert len(prefix) >= self.n - 1

        # A unigram model has no context, hence the empty string.
        context = '' if self.n <= 1 else prefix[-(self.n - 1):]
        context_count = self.minor_counter[context]
        ranked = sorted(
            (
                (count / context_count, gram[-1])
                for gram, count in self.major_counter.items()
                if gram.startswith(context)
            ),
            reverse=True,
        )
        if top_k > 0:
            return ranked[:top_k]
        return ranked

    def get_proba_dict(self, prefix=''):
        """Return every candidate character mapped to its probability."""
        scored = self.predict_proba(prefix, top_k=-1)
        return dict((word, prob) for prob, word in scored)

In [11]:
# One model per order: ngrams[0] is the order-1 model, ngrams[4] the order-5 model.
ngrams = [Ngram(i, counters) for i in range(1, 6)]

## 使用Smoothing of Language Models來建立第二版選字系統

In [12]:
class ChineseWordRecommenderV2:
    """Next-character recommender that smooths over several n-gram orders.

    ``ngrams`` is a list of models ordered by increasing order (index 0 is the
    lowest order).  Each model must provide ``get_proba_dict(prefix)``
    returning a ``{character: probability}`` mapping.
    """

    def __init__(self, ngrams):
        self.ngrams = ngrams

    # Two smoothing strategies for language models are supported:
    # back-off smoothing and interpolation smoothing.
    def predict_proba(self, prefix='', top_k=5, mode = 'interpolation',
                      interpolation_lambda=0.99, backoff_weight=0.4):
        """Return ``(score, character)`` pairs for the next character.

        Args:
            prefix: text typed so far.
            top_k: number of pairs to return; a value <= 0 returns all of them.
            mode: ``'interpolation'`` or ``'backoff'``.
            interpolation_lambda: weight of the higher-order estimate at each
                interpolation step.
            backoff_weight: factor applied each time back-off drops to a lower
                order.

        Returns:
            list of ``(score, character)`` tuples sorted in descending order.

        Raises:
            AssertionError: if ``mode`` is not one of the supported values.
        """
        assert mode in ['interpolation', 'backoff'], \
        "Smoothing mode can be 'interpolation' or 'backoff'"

        # Use the models owned by this instance (not a notebook global), and
        # only as many of them as ``len(prefix) + 1``, since each higher order
        # needs one more character of context.
        usable_ngrams = self.ngrams[:len(prefix) + 1]
        proba_dicts = [ngram.get_proba_dict(prefix) for ngram in usable_ngrams]
        if not proba_dicts:
            # No models at all: there is nothing to recommend.
            return []

        probas = []
        # The lowest-order table supplies the candidate characters.
        for word in proba_dicts[0]:
            if mode == 'interpolation':
                proba = self._get_interpolation_proba(
                    word, proba_dicts, interpolation_lambda)
            else:
                proba = self._get_backoff_proba(
                    word, proba_dicts, backoff_weight=backoff_weight)
            probas.append((proba, word))

        sorted_probas = sorted(probas, reverse=True)

        return sorted_probas[:top_k] if top_k > 0 else sorted_probas

    def _get_interpolation_proba(self, word, proba_dicts, interp_lambda, idx=None):
        """Recursively mix the order-``idx`` estimate with the lower orders.

        Computes ``lambda * P_idx(word) + (1 - lambda) * P_interp_{idx-1}(word)``
        and bottoms out at the lowest-order table (``idx == 0``).
        """
        if idx is None:
            idx = len(proba_dicts) - 1
        current = proba_dicts[idx].get(word, 0.)
        if idx == 0:
            return current
        lower = self._get_interpolation_proba(word, proba_dicts, interp_lambda, idx=idx-1)
        return interp_lambda * current + (1 - interp_lambda) * lower

    def _get_backoff_proba(self, word, proba_dicts, idx=None, backoff_weight=0.4):
        """Return the highest-order non-zero estimate, discounted per back-off.

        If the order-``idx`` table has a non-zero probability for ``word`` it
        is returned as is.  Otherwise the lower-order score is used, multiplied
        by ``backoff_weight`` for every order dropped, so the result is never
        negative.
        """
        if idx is None:
            idx = len(proba_dicts) - 1
        current = proba_dicts[idx].get(word, 0.)
        if idx == 0 or current != 0:
            return current
        return backoff_weight * self._get_backoff_proba(
            word, proba_dicts, idx=idx-1, backoff_weight=backoff_weight)

In [13]:
# Build the smoothed recommender from the list of n-gram models.
model = ChineseWordRecommenderV2(ngrams)

In [14]:
# Ten highest-scoring next characters after '法', using interpolation smoothing.
probs = model.predict_proba('法', top_k=10, mode='interpolation')
probs

[(0.0508064389177302, '國'),
 (0.039758859938119194, '律'),
 (0.031190676480541715, '院'),
 (0.027291194945539974, '蘭'),
 (0.017965764301742657, '語'),
 (0.010204453506869571, '學'),
 (0.009363682689682004, '則'),
 (0.007055799389643616, '西'),
 (0.006269763207217874, '部'),
 (0.0062449282552439625, '規')]

## Demo

In [15]:
import ipywidgets as widgets

# The label shows the suggestions; the text area is where the user types.
text = widgets.Textarea()
label = widgets.Label()
display(label, text)

def func(change):
    """Refresh the label with the top-10 suggestions for the new text."""
    suggestions = model.predict_proba(change.new, top_k=10)
    words = [word for _, word in suggestions]
    label.value = ' ' + '\t'.join(words)

# Call func every time the text area's value changes.
text.observe(func, names='value')

Label(value='')

Textarea(value='')