# 自製智能中文選字系統  (1)

In [112]:
segments = []
with open('./wiki_zh_small.txt') as fr:
    for line in fr.readlines():
        segments += prepocess_line(line)

In [113]:
import sys
sys.version

'3.7.3 (default, Mar 27 2019, 16:54:48) \n[Clang 4.0.1 (tags/RELEASE_401/final)]'

## 資料前處理

確認版本為 python3

In [114]:
import re

In [115]:
def prepocess_line(line):
    # 僅僅挑出中文字元，並且斷開不連續的中文字
    regex = r"[\u4E00-\u9FCC]+"
    segments = re.findall(regex, line)
    return segments

In [116]:
prepocess_line('“英語”一詞源於遷居英格蘭的日耳曼部落盎格魯（），而“盎格魯”得名於')  
# 應該為：['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

## Ngram

一開始要先計算字詞出現的次數

In [117]:
from collections import Counter

class Counters:
    def __init__(self, n):
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]  # 分別代表計算0、1、...個字的出現次數

    def fit(self, segments):
        # 因為 self.counters 分別代表計算0、1、...個字的出現次數
        # 請在此實作利用 segments 以及函式 _skip 來統計次數
        for i in range(1, 1 + self.n):
            for segment in segments:
                self.counters[i] += Counter(self._skip(segment, i))

    def __getitem__(self, k):
        return self.counters[k]

    def _skip(self, segment, n):
        assert n > 0
        if len(segment) < n:
            return []
        shift = n - 1
        for i in range(len(segment) - shift):
            yield segment[i:i+shift+1]

In [118]:
counters = Counters(n=3)
counters.fit(segments)

In [119]:
counters.counters[1]
# 應該為： Counter({'': 371373})

Counter({'英': 420,
         '語': 1416,
         '又': 210,
         '稱': 724,
         '爲': 3637,
         '文': 1439,
         '是': 3600,
         '一': 3659,
         '種': 891,
         '西': 1266,
         '日': 1066,
         '耳': 49,
         '曼': 97,
         '言': 621,
         '誕': 22,
         '生': 1116,
         '於': 2022,
         '中': 3944,
         '世': 828,
         '紀': 320,
         '早': 211,
         '期': 769,
         '的': 13270,
         '格': 283,
         '蘭': 213,
         '如': 805,
         '今': 327,
         '具': 319,
         '有': 3252,
         '全': 994,
         '球': 278,
         '通': 659,
         '用': 1847,
         '地': 1756,
         '位': 701,
         '詞': 410,
         '源': 357,
         '遷': 89,
         '居': 282,
         '部': 1009,
         '落': 75,
         '盎': 16,
         '魯': 112,
         '而': 1055,
         '得': 599,
         '名': 824,
         '臨': 86,
         '波': 102,
         '羅': 304,
         '海': 844,
         '半': 207,
         '島': 341,
  

In [120]:
class Ngram:
    def __init__(self, n, counters):
        assert n <= counters.n
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        assert len(prefix) >= self.n - 1
        # 使用 Ngram 的公式計算出下一個字出現的機率
        # 輸出為機率與字的tuple列表，詳見下方輸出範例
        reference = prefix[-(self.n - 1):] if self.n > 1 else ''
        count_referecne = self.minor_counter[reference]
        probs = []
        for key, count in dict(self.major_counter).items():
            if key.startswith(reference):
                prob = count / count_referecne
                probs.append((prob, key[-1]))
        sorted_probs = sorted(probs, reverse=True)
        
        # END YOUR CODE
        return sorted_probs[:top_k] if top_k > 0 else sorted_probs

    def get_proba_dict(self, prefix=''):
        return {word: prob for prob, word in self.predict_proba(prefix, top_k=-1)}


In [121]:
unigram = Ngram(1, counters)

In [122]:
unigram.predict_proba('我思')
# 應該為：[(0.035732269174118744, '的'),
#         (0.012927703414087723, '國'),
#         (0.010620050461395955, '中'),
#         (0.009984570768472667, '在'),
#         (0.009852627950874188, '一')]

ZeroDivisionError: division by zero

In [None]:
bigram = Ngram(2, counters)
trigram = Ngram(3, counters)

## 使用Ngram來建立第一版選字系統

In [None]:
class ChineseWordRecommenderV1:
    def __init__(self, unigram, bigram, trigram):
        self.unigram = unigram
        self.bigram = bigram
        self.trigram = trigram
    
    def predict_proba(self, prefix='', top_k=5):
        # 使用Ngram來建立選字系統
        if len(prefix) >= 2:
            return self.trigram.predict_proba(prefix, top_k)
        elif len(prefix) >= 1:
            return self.bigram.predict_proba(prefix, top_k)
        elif len(prefix) == 0
            return self.unigram.predict_proba(prefix, top_k)

In [None]:
model = ChineseWordRecommenderV1(unigram, bigram, trigram)

In [None]:
probs = model.predict_proba('我思', top_k=10)
probs

[(0.75, '故'), (0.25, '維')]

## Demo

In [None]:
!pip install -U pip
!pip install -q ipywidgets

Requirement already up-to-date: pip in /Users/ycchen/.pyenv/versions/3.6.5/lib/python3.6/site-packages (20.1.1)


In [None]:
import ipywidgets as widgets

text = widgets.Textarea()
label = widgets.Label()
display(label, text)

def func(change):
    probs = model.predict_proba(change.new, top_k=10)
    label.value = ' ' + '\t'.join([word for prob, word in probs])

text.observe(func, names='value')

Label(value='')

Textarea(value='')