# 自製智能中文選字系統  (1)

## 資料前處理
只保留中文字元，並在非中文字元（標點、英數字等）處斷開，使每個片段都是一段連續的中文字串

In [1]:
import re

def prepocess_line(line):
    """Return every maximal run of consecutive Chinese characters in *line*.

    Any character outside the two CJK ranges in the pattern acts as a
    separator, so one input line may yield several segments (or none).
    """
    pattern = r'[\u4E00-\u9FFF\u3400-\u4DBF]+'
    return [match.group(0) for match in re.finditer(pattern, line)]

In [2]:
# Example: punctuation splits the sentence into separate runs of Chinese characters.
prepocess_line('“英語”一詞源於遷居英格蘭的日耳曼部落盎格魯（），而“盎格魯”得名於')  

['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

In [3]:
# Load the training data: collect the Chinese-only segments of every line.
segments = []

with open('./wiki_zh_small.txt', encoding='utf-8') as fr:
    # Iterate the file object directly so lines are read one at a time.
    for line in fr:
        segments.extend(prepocess_line(line))

## Ngram

首先要計算各種長度（1 到 n 個字）的字串在訓練資料中出現的次數

In [4]:
from collections import Counter

class Counters:
    """Occurrence counts of every 1- to n-character substring of the training segments.

    ``counters[k]`` is a ``Counter`` keyed by k-character strings. ``counters[0]``
    has the single key ``''`` whose count is the total number of characters seen.
    """

    def __init__(self, n):
        """Create ``n + 1`` empty counters, one per substring length 0..n."""
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]

    def __getitem__(self, k):
        """Return the counter for substrings of length *k*."""
        return self.counters[k]

    def fit(self, segments):
        """Add the substring counts of every segment in *segments*."""
        total_chars = self.counters[0]
        for seg in segments:
            seg_len = len(seg)
            # The empty string is counted once per character; skip empty
            # segments so that no zero-count entry is created.
            if seg_len:
                total_chars[''] += seg_len
            for order in range(1, self.n + 1):
                self.counters[order].update(self._skip(seg, order))

    def _skip(self, segment, n):
        """Yield each length-*n* window of *segment*, sliding one position at a time."""
        assert n > 0
        # The range is empty when the segment is shorter than n.
        for start in range(len(segment) - n + 1):
            yield segment[start:start + n]

In [5]:
# Count all 1-, 2- and 3-character substrings (plus the total character count) of the training segments.
counters = Counters(n=3)
counters.fit(segments)

In [6]:
# Show the ten most frequent 3-character substrings.
display(counters[3].most_common(10))
# Expected: Counter({'': 371373})
# NOTE(review): that expected value has the shape of counters[0] (total character
# count), not of counters[3].most_common(10) — confirm which counter it refers to.

[('西班牙', 225),
 ('聯合國', 212),
 ('共和國', 212),
 ('人民共', 188),
 ('民共和', 188),
 ('中華人', 174),
 ('華人民', 173),
 ('新加坡', 172),
 ('中華民', 129),
 ('是中國', 119)]

In [7]:
class Ngram:
    """N-gram model that predicts the next character from occurrence counts.

    The probability of a character following a context of ``n - 1`` characters
    is estimated as ``count(context + char) / count(context)``.
    """

    def __init__(self, n, counters):
        """Bind the model to the counters of order ``n`` and ``n - 1``.

        Args:
            n: Order of the model (1 = unigram, 2 = bigram, ...); must be >= 1.
            counters: Indexable object with an ``n`` attribute where
                ``counters[k]`` maps k-character strings to their counts.

        Raises:
            ValueError: If ``n`` is smaller than 1.
            AssertionError: If ``n`` is larger than ``counters.n``.
        """
        # Without this check n == 0 would silently pair counters[0] with
        # counters[-1] (the highest-order counter) and produce garbage.
        if n < 1:
            raise ValueError(f'n must be at least 1, got {n!r}')
        assert n <= counters.n
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        """Return the most probable next characters after *prefix*.

        Only the last ``n - 1`` characters of *prefix* are used as context.

        Args:
            prefix: Text typed so far; must hold at least ``n - 1`` characters.
            top_k: Number of candidates to return; a value <= 0 returns all.

        Returns:
            A list of ``(probability, character)`` tuples sorted in descending
            order, e.g. ``[(0.5, 'b'), (0.25, 'a')]``. The list is empty when
            the context has no recorded count.
        """
        assert len(prefix) >= self.n - 1

        # A unigram model has an empty context; prefix[0:] would wrongly keep
        # the whole prefix, hence the explicit special case.
        context = prefix[1 - self.n:] if self.n > 1 else ''

        # The denominator is the same for every candidate, so look it up once.
        context_count = self.minor_counter.get(context)
        if not context_count:
            return []

        # NOTE: this scans every n-gram in the counter on each call.
        probas = [
            (count / context_count, word[-1])
            for word, count in self.major_counter.items()
            if word.startswith(context)
        ]
        probas.sort(reverse=True)

        return probas[:top_k] if top_k > 0 else probas

    def get_proba_dict(self, prefix=''):
        """Return ``{character: probability}`` for every candidate after *prefix*."""
        return {word: prob for prob, word in self.predict_proba(prefix, top_k=-1)}

In [8]:
# Unigram model: character frequencies, independent of any preceding text.
unigram = Ngram(1, counters)

In [9]:
unigram.predict_proba('我思')
# Expected: [(0.035732269174118744, '的'),
#            (0.012927703414087723, '國'),
#            (0.010620050461395955, '中'),
#            (0.009984570768472667, '在'),
#            (0.009852627950874188, '一')]

[(0.03573198052647451, '的'),
 (0.012927598983240704, '國'),
 (0.010619964671922796, '中'),
 (0.009984490112446684, '在'),
 (0.00985254836069105, '一')]

In [10]:
# Higher-order models that condition on the previous one and two characters.
bigram = Ngram(2, counters)
trigram = Ngram(3, counters)

## 使用Ngram來建立第一版選字系統

In [11]:
class ChineseWordRecommenderV1:
    """Next-character recommender that sums unigram, bigram and trigram scores."""

    def __init__(self, unigram, bigram, trigram):
        """Store the three models; each must provide ``predict_proba(prefix, top_k)``."""
        self.unigram = unigram
        self.bigram = bigram
        self.trigram = trigram

    def predict_proba(self, prefix='', top_k=5):
        """Return ``(score, character)`` tuples ranked by summed model probability.

        The bigram model is consulted only when *prefix* has at least one
        character and the trigram model only when it has at least two. Each
        model's list is already cut to *top_k* before merging, so a
        character's score only includes the models in which it made that cut.
        A *top_k* <= 0 returns every candidate.
        """
        # Pick the models whose required context length the prefix satisfies.
        active_models = [self.unigram]
        if len(prefix) > 0:
            active_models.append(self.bigram)
        if len(prefix) > 1:
            active_models.append(self.trigram)

        # Counter.update adds the probabilities of characters seen in several models.
        scores = Counter()
        for ngram_model in active_models:
            candidates = ngram_model.predict_proba(prefix, top_k)
            scores.update({word: prob for prob, word in candidates})

        if top_k > 0:
            ranked = scores.most_common(top_k)
        else:
            ranked = scores.most_common()
        return [(prob, word) for word, prob in ranked]
            

In [12]:
# Combine the three n-gram models into the first version of the recommender.
model = ChineseWordRecommenderV1(unigram, bigram, trigram)

In [13]:
# Ten highest-scoring next-character candidates after the prefix '我思'.
probs = model.predict_proba('我思', top_k=10)
probs

[(0.7665745856353591, '故'),
 (0.34944751381215466, '維'),
 (0.3370165745856354, '想'),
 (0.12154696132596685, '考'),
 (0.05389257474844633, '是'),
 (0.04678170428338059, '的'),
 (0.03867403314917127, '汗'),
 (0.03189275672065374, '爲'),
 (0.012927598983240704, '國'),
 (0.011049723756906077, '聰')]

## Demo

In [14]:
import ipywidgets as widgets
from IPython.display import clear_output

# Text box for the input typed so far, with a label above it for the suggestions.
text = widgets.Textarea()
label = widgets.Label()
display(label, text)


def func(change):
    """Show up to ten suggested next characters for the new text-box content."""
    candidates = model.predict_proba(change.new, top_k=10)
    suggestions = [word for _, word in candidates]
    label.value = ' ' + '\t'.join(suggestions)


# Register func as the observer of the text box's 'value' trait.
text.observe(func, names='value')

Label(value='')

Textarea(value='')