### Language Model

In [33]:
import jieba
from collections import Counter
import numpy as np

In [23]:
# 新华社新闻语料地址
corpus_path = '../article_9k.txt'

In [24]:
# 读取语料
FILE = open(corpus_path).read()

### unigram model

$$ P(w_1,w_2...w_i) = \prod_{i=1}^n \frac{C(w_i)+1}{C(w)+|V|}$$

#### |V| 语料中 unigram 的种类数

In [27]:
TOKENS = jieba.lcut(FILE)

In [28]:
token_counts = Counter(TOKENS)

In [29]:
unigram_counts = len(set(TOKENS))

In [30]:
def get_unigram_count(word):
    if word in token_counts:
        word_count = token_counts[word]
    else:
        word_count = 0
    return word_count

In [31]:
def unigram_model(string):
    str_list = jieba.lcut(string)
    output = 0
    for w in str_list:
        w_count = get_unigram_count(w)
        pw = (w_count + 1) / (len(TOKENS) + unigram_counts)
        
        # 因为pw是小于1的数，当连乘较多时，数值会非常小，甚至超出电脑所能表示的浮点数的精度，所以进行log处理，将连乘转变为和的形式
        output += np.log10(pw)
    
    return output

In [34]:
# str A
unigram_model('白石麻衣天下第一')

-19.035334517357814

In [35]:
# str B
unigram_model('白麻衣石天第一下')

-25.60952943132795

In [36]:
# str A
unigram_model('我一定要成为自然语言处理大师')

-26.437986041505344

In [39]:
# str B
unigram_model('我一样要吃饭自然语言处理大师')

-28.179289708939375

In [91]:
# 对比发现str A 的值较高而且实际中也是str A 更加符合正常语法规则更加通顺一点

### bigram model

$$ Pr(sentence)  = Pr( w_1 \cdot w_2 \cdots w_n) = \prod \frac{count(w_i,w_{i+1})+1} {count(w_i)+|V|} $$

In [42]:
bigram_TOKENS = [TOKENS[i] + TOKENS[i+1] for i in range(len(TOKENS)-1)]

In [43]:
bigram_token_counts = Counter(bigram_TOKENS)

In [45]:
bigram_counts = len(set(bigram_TOKENS))

In [46]:
def get_bigram_count(word):
    if word in bigram_token_counts:
        word_count = bigram_token_counts[word]
    else:
        word_count = 0
    return word_count

In [47]:
def bigram_model(string):
    str_list = jieba.lcut(string)
    output = 0
    for i in range(len(str_list)-1):
        w = str_list[i]
        w_next = str_list[i+1]
        two_gram_w = w + w_next
        two_gram_pw = (get_bigram_count(two_gram_w) + 1) / (len(bigram_TOKENS) + bigram_counts)
        one_gram_pw = (get_unigram_count(w)+1) / (len(TOKENS) + unigram_counts)
        
        pro = two_gram_pw / one_gram_pw
        
        output += np.log10(pro)
        
    return output

In [48]:
bigram_model('谁会买小米手机')

-9.897996308578186

In [49]:
bigram_model('谁会买小米汽车')

-10.897996308578186

In [50]:
bigram_model('python是世界上最好的语言')

-11.237589486313484

In [56]:
bigram_model('python是世界上最好的跑车')

-13.527624097676002