# 实验一 Language Model
* 实现基本语言模型 unigram 和 bigram，并加入加一（add-one）平滑
* 实现困惑度（perplexity，PPL）评测

In [185]:
from nltk.tokenize import WordPunctTokenizer
import os
import math
import re

## 数据预处理
* 分句，分词，储存词典
* 句子储存
* 去标点符号

In [443]:
# Punctuation (ASCII and full-width) that is replaced by a single space before
# tokenizing. Written as a raw string because the regex escapes (\s, \., \!, \/)
# are not valid Python string escapes and trigger warnings in a normal literal.
punc = r'[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+'


def read_lines(path):
    """Return every line of the text file at `path`.

    The file is opened in a `with` block so the handle is released even if
    reading fails.
    """
    with open(path) as f:
        return f.readlines()


def tokenize_utterances(lines, tokenizer=None):
    """Split dialogue lines into utterances and tokenize each one.

    Each line is lower-cased, stripped and split on the "__eou__" marker; the
    segment after the last marker is discarded. Punctuation matched by `punc`
    is replaced with a space before tokenizing.

    Args:
        lines: iterable of raw text lines.
        tokenizer: object with a `tokenize(str)` method; defaults to a single
            shared `WordPunctTokenizer` instance.

    Returns:
        A list of token lists, one per utterance.
    """
    if tokenizer is None:
        tokenizer = WordPunctTokenizer()
    utterances = []
    for line in lines:
        utterances.extend(line.lower().strip().split("__eou__")[:-1])
    return [tokenizer.tokenize(re.sub(punc, " ", utt)) for utt in utterances]


f_train_lines = read_lines("E1/train_LM.txt")
trains = tokenize_utterances(f_train_lines)

f_test_lines = read_lines("E1/test_LM.txt")
tests = tokenize_utterances(f_test_lines)

In [444]:
# Number of tokenized test utterances.
len(tests)

1208

In [445]:
# Token -> frequency in the training set. Start from an empty dict: seeding it
# with {"i": 1} would overstate the count of "i" by one.
vocab = {}
for sentence in trains:
    for token in sentence:
        vocab[token] = vocab.get(token, 0) + 1
print(len(vocab))  # vocabulary size after the training pass

# Tokens that occur only in the test set are registered with a zero count so
# that every test token has an entry.
for sentence in tests:
    for token in sentence:
        vocab.setdefault(token, 0)
print(len(vocab))  # vocabulary size after adding test-only tokens

18702
18702


In [446]:
# Display the whole token -> count mapping built in the previous cell.
vocab

{'raining': 36,
 'unscientific': 2,
 'systematic': 1,
 'hordes': 1,
 'dubbed': 2,
 'yellow': 51,
 'four': 346,
 'prices': 133,
 'prefix': 1,
 'woods': 15,
 'spiders': 5,
 'preface': 2,
 'woody': 2,
 'cyprus': 1,
 'payoff': 1,
 'looking': 690,
 'canes': 1,
 'pigged': 1,
 'eligible': 10,
 'electricity': 25,
 'chatter': 3,
 'liangs': 1,
 'billing': 1,
 'hmmmm': 1,
 'originality': 2,
 'methone': 2,
 'demoted': 2,
 'lord': 8,
 'immature': 1,
 'flicking': 2,
 'lora': 2,
 'shaving': 3,
 'sinking': 1,
 'digit': 2,
 'degradable': 1,
 'bruschetta': 1,
 'deli': 4,
 'oceans': 5,
 'costume': 12,
 'dell': 1,
 'forties': 3,
 'quiting': 4,
 'foul': 11,
 'taj': 5,
 'dely': 1,
 'politician': 3,
 'stabbed': 3,
 'screaming': 8,
 'advices': 1,
 'disturb': 11,
 'basics': 11,
 'scholar': 9,
 'yachting': 1,
 'wooden': 20,
 'auditor': 2,
 '4n9': 1,
 'wednesday': 72,
 'oooo': 4,
 'lingy': 2,
 'specialties': 8,
 'barathea': 1,
 'oooh': 1,
 'xiaohui': 1,
 'ornamental': 1,
 'charter': 2,
 'specially': 19,
 'nigh':

## 语言模型之unigram
* 对数据进行加一平滑

In [447]:
class unigram(object):
    """Unigram language model with add-one smoothing.

    Counts come from the training sentences only. Tokens that appear only in
    the test sentences are added to the vocabulary with a count of zero, so
    the smoothed distribution covers every token of both corpora.

    Attributes set by the constructor:
        vocab: token -> training count (0 for test-only tokens).
        trains, tests: the sentence lists passed in.
        word2prob: token -> natural log of its smoothed probability.
    """

    def __init__(self, trains, tests):
        """Count tokens and precompute the smoothed log-probabilities.

        Args:
            trains: list of token lists used for counting.
            tests: list of token lists whose tokens extend the vocabulary.
        """
        self.vocab = {}
        self.trains = trains
        self.tests = tests
        self.word2prob = {}

        # Total number of training tokens, and per-token training counts.
        n_tokens = 0
        for sent in trains:
            for tok in sent:
                n_tokens += 1
                self.vocab[tok] = self.vocab.get(tok, 0) + 1

        # Test-only tokens enter the vocabulary with a zero count; they do
        # not contribute to the token total.
        for sent in tests:
            for tok in sent:
                self.vocab.setdefault(tok, 0)

        vocab_size = len(self.vocab)
        denom = n_tokens + vocab_size
        # Add-one smoothing: P(w) = (count(w) + 1) / (N + V).
        for tok, count in list(self.vocab.items()):
            self.word2prob[tok] = math.log(float(1 + count) / denom)

    def sentence2logprob(self, sentence):
        """Return the average negative log-probability per token.

        An empty sentence yields 0. A token missing from the vocabulary
        raises KeyError.
        """
        n = len(sentence)
        if n == 0:
            return 0
        log_sum = 0
        for tok in sentence:
            log_sum += self.word2prob[tok]
        return -log_sum / n

    def ppl(self, sentence):
        """Return the perplexity of `sentence`: exp of the value above."""
        avg_neg_logprob = self.sentence2logprob(sentence)
        return math.exp(avg_neg_logprob)

In [448]:
# Fit the unigram model: counts come from `trains`; `tests` only adds
# zero-count vocabulary entries.
u = unigram(trains,tests)

### unigram 测试（ppl）

In [449]:
# Average negative log-probability per token for the second test utterance.
u.sentence2logprob(tests[1])

6.070035408935989

In [450]:
# Per-sentence unigram perplexity for every non-empty test utterance,
# followed by the mean over those utterances.
uppls = [u.ppl(t) for t in tests if len(t) != 0]
sum(uppls)/len(uppls)

842.1962168575029

In [451]:
# Sanity check: the smoothed unigram probabilities should sum to (about) 1.
ks = u.word2prob.keys()
wordp = 0
for log_p in u.word2prob.values():
    wordp += math.exp(log_p)
wordp

0.9999999999999731

## 语言模型之bigram
* 对数据进行加一平滑

In [452]:
class bigram(object):
    """Bigram language model with a per-history add-one style smoothing.

    For a bigram (prev, w) the stored probability is

        (count(prev, w) + 1) / (uvocab[prev] + vvocab[prev])

    where the counts come from the training sentences only.

    NOTE(review): the denominator uses the number of distinct successors of
    `prev` seen in train *and* test, not the full vocabulary size as in
    textbook Laplace smoothing. This is kept as-is because it determines the
    reported perplexities; confirm it is intended.

    Attributes set by the constructor:
        vocab: (prev, w) -> training count (0 for bigrams seen only in tests).
        uvocab: prev -> number of training bigrams starting with prev
            (0 if prev starts bigrams only in tests).
        vvocab: prev -> number of distinct bigram types starting with prev,
            over train and test together.
        trains, tests: the sentence lists passed in.
        word2prob: (prev, w) -> natural log of the smoothed probability.
    """

    def __init__(self, trains, tests):
        """Count bigrams and precompute the smoothed log-probabilities.

        Args:
            trains: list of token lists used for counting.
            tests: list of token lists whose bigrams are registered with a
                zero count so they can be scored later.
        """
        self.vocab = {}   # bigram -> training frequency
        self.uvocab = {}  # history word -> training frequency as a bigram's first word
        self.vvocab = {}  # history word -> number of distinct words following it
        self.trains = trains
        self.tests = tests
        self.word2prob = {}

        for sentence in trains:
            for pair in zip(sentence, sentence[1:]):
                prev = pair[0]
                if pair in self.vocab:
                    self.vocab[pair] += 1
                else:
                    self.vocab[pair] = 1
                    # A new bigram type adds one distinct successor of prev.
                    self.vvocab[prev] = self.vvocab.get(prev, 0) + 1
                self.uvocab[prev] = self.uvocab.get(prev, 0) + 1

        for sentence in tests:
            for pair in zip(sentence, sentence[1:]):
                prev = pair[0]
                if pair not in self.vocab:
                    # Unseen test bigram: zero count, but it still counts as
                    # a successor type of prev in the denominator.
                    self.vocab[pair] = 0
                    self.vvocab[prev] = self.vvocab.get(prev, 0) + 1
                # Histories seen only in tests get a zero training count.
                self.uvocab.setdefault(prev, 0)

        for pair, count in list(self.vocab.items()):
            prev = pair[0]
            self.word2prob[pair] = math.log(
                float(1 + count) / (self.uvocab[prev] + self.vvocab[prev])
            )

    def sentence2logprob(self, sentence):
        """Return the average negative log-probability per bigram.

        A sentence with fewer than two tokens contains no bigram and yields 0
        (as `unigram` does for empty input) instead of dividing by zero. A
        bigram that was in neither corpus raises KeyError.
        """
        n_bigrams = len(sentence) - 1
        if n_bigrams < 1:
            return 0
        result = 0
        for i in range(n_bigrams):
            result -= self.word2prob[(sentence[i], sentence[i+1])]
        return result / n_bigrams

    def ppl(self, sentence):
        """Return the perplexity of `sentence`: exp of the value above."""
        return math.exp(self.sentence2logprob(sentence))

In [453]:
# Fit the bigram model: counts come from `trains`; bigrams from `tests` are
# registered with a zero count.
b = bigram(trains,tests)

215436
215436


In [454]:
# Per-sentence bigram perplexity for every test utterance that has at least
# one bigram (two or more tokens), followed by the mean over those utterances.
bppls = [b.ppl(t) for t in tests if len(t) > 1]
sum(bppls)/len(bppls)

63.88419091372761

In [455]:
# All bigrams that have a stored log-probability.
keys = list(b.word2prob.keys())

# Smoothed probability of the bigram ('i', 'am'), recovered from its stored log.
math.exp(b.word2prob[('i','am')])

0.041388737014762166