# python实现n-gram

本例子主要受 Michael Collins 教授的 Language Modeling 启发而编写，为了帮助大家理解语言模型，我在我的博客、公众号上发表了文章[一文读懂NLP中的语言模型(公众号)](http://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483658&idx=1&sn=9c5e7cc50b65cf31a08f1e2a0046ceb1&chksm=96c02fd7a1b7a6c1bbabe19145665d370020f4a3e89ebdc1226a1ec4ed110ef089c6fb0212c4&mpshare=1&scene=1&srcid=1114A1PGK4rDqKMMbsAmplr3#rd)，欢迎大家阅读。当然强烈推荐[Michael Collins 教授的 Language Modeling 原文](http://www.cs.columbia.edu/~mcollins/lm-spring2013.pdf)

## 目录

1. [项目结构](#项目结构)

2. [环境要求](#环境要求)

3. [代码分析](#代码分析)

4. [结果分析](#结果分析)

5. [项目后续](#项目后续)

6. [联系作者](#联系作者)

### 项目结构

| - src
    
    | - const.py      常量定义文件
    
    | - corpus        语料库
    
    | - dataset.py    加载语料
    
    | - evaluate.py   模型的评估方法
    
    | - main.py       例子程序
    
    | - ngram.py      ungram, bigram, trigram 模型，以及一些模型方法
    
    | - processing.py 字典的生成等处理方法
    
    | - smooth.py     平滑方法

## 环境要求

    python3

## 代码分析

### const.py

在这里定义了三个常量

In [1]:
# 未登录词
UNK = None
# 句子开始标记，代表句子的开头
START_TOKEN = '<s>'
# 句子结束标记，代表句子的结尾
END_TOKEN = '</s>'

### processing.py

In [None]:
import const

#加入起始标记
def build_sentences(sentences):
        out = []
        for sentence in sentences:
                words = [x.lower() for x in sentence]
                words.insert(0, "<s>")
                words.append("</s>")
                out.append(words)
        return out

# 构建ungram词频词典
def build_undict(sentences):
        undict = {}
        total = 0
        for words in sentences:
                for word in words:
                        if word not in undict:
                                undict[word] = 1
                        else:
                                undict[word] += 1
                        if word != const.START_TOKEN and word != const.END_TOKEN:
                                total += 1
        return undict, total

# 构建bigram词频词典，其中以三元组(u, v)作为词典的键
def build_bidict(sentences):
    bidict = {}
    for words in sentences:
            for i in range(len(words)-1):
                    tup = (words[i], words[i+1])
                    if tup not in bidict:
                            bidict[tup] = 1
                    else:
                            bidict[tup] += 1
    return bidict

# 构建trigram词频词典，其中以三元组(u, v, w)作为词典的键
def build_tridict(sentences):
        tridict = {}
        for words in sentences:
                for i in range(len(words) -2):
                        tup = (words[i], words[i+1], words[i+2])
                        if tup not in tridict:
                                tridict[tup] = 1
                        else:
                                tridict[tup] += 1
        return tridict

### ngram.py

n-gram模型，实现了ungram, bigram, trigram

In [None]:
import math
import const
from processing import *

'''
@function calc_prob 			计算条件概率，这里使用最大似然估计(max-likelihood estimate)去计算概率
@function calc_sentence_prob	计算句子的条件概率
'''
class UnGram(object):
	def __init__(self, sentences, smooth = None):
		self.undict, self.total = build_undict(sentences)
		self.smooth = smooth

	def calc_prob(self, word):
		prob = 0
		if self.smooth != None:
			prob = self.smooth(word, undict=self.undict, total=self.total)
		else:
			if word in self.undict:
				prob = float(self.undict[word]) / self.total
		return prob

	def calc_sentence_prob(self, sentence, prob_log=True):
		prob_log_sum = 0
		for word in sentence:
			if word != const.START_TOKEN and word != const.END_TOKEN:
				word_prob = self.calc_prob(word)
				if word_prob != 0:
					prob_log_sum += math.log(word_prob, 2)
		return math.pow(2, prob_log_sum) if prob_log else prob_log_sum

	def sort_vocab(self):
		vocabs = list(self.undict.keys())
		vocabs.remove(const.START_TOKEN)
		vocabs.remove(const.END_TOKEN)
		vocabs.sort()
		vocabs.append(const.UNK)
		vocabs.append(const.START_TOKEN)
		vocabs.append(const.END_TOKEN)
		return vocabs

class BiGram(UnGram):
	def __init__(self, sentences, smooth = None):
		UnGram.__init__(self, sentences, smooth)
		self.bidict = build_bidict(sentences)

	def calc_prob(self, *args):
		if len(args) != 2:
			raise ValueError('two words is required')

		prob = 0
		if self.smooth != None:
			prob = self.smooth(args[0], args[1], bidict=self.bidict, undict=self.undict)
		else:
			if args in self.bidict and args[0] in self.undict:
				return float(self.bidict[args]) / self.undict[args[0]]
		return prob

	def calc_sentence_prob(self, sentence, prob_log=True):
		prob_log_sum = 0
		prev_word = None
		for word in sentence:
			if prev_word != None:
				word_prob = self.calc_prob(prev_word, word)
				prob_log_sum += word_prob
			prev_word = word
		return math.pow(2, prob_log_sum) if prob_log else prob_log_sum


class TriGram(BiGram):
	def __init__(self, sentences, smooth = None):
		BiGram.__init__(self, sentences, smooth)
		self.tridict = build_tridict(sentences)

	def calc_prob(self, *args):
		if len(args) != 3:
			raise ValueError('three words is required')

		prob = 0
		if self.smooth != None:
			prob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict)
		else:
			bitup = (args[0], args[1])				
			if args in self.tridict and bitup in self.bidict:
				return float(self.tridict[args]) / self.bidict[bitup]
		return prob

	def calc_sentence_prob(self, sentence, prob_log=True):
		prob_log_sum = 0
		prev_stack = []
		for word in sentence:
			if len(prev_stack) < 2:
				prev_stack.append(word)
			elif len(prev_stack) == 2:
				word_prob = self.calc_prob(prev_stack[0], prev_stack[1], word)
				prob_log_sum += word_prob
				prev_stack[0] = prev_stack[1]
				prev_stack[1] = word
		return math.pow(2, prob_log_sum) if prob_log else prob_log_sum

'''
@function: calc_xxgram_count   主要用来统计语料库中词的总数
@function: print_xxgram_probas 格式化输出概率 
'''
class GramUtil(object):

	@staticmethod
	def calc_ungram_count(sentences):
		count = 0
		for sentence in sentences:
			# except START_TOKEN and END_TOKEN
			count += len(sentence) - 2
		return count

	@staticmethod
	def calc_bigram_count(sentences):
		count = 0
		for sentence in sentences:
			count += len(sentence) - 1
		return count

	@staticmethod
	def calc_trigram_count(sentences):
		count = 0
		for sentence in sentences:
			count += len(sentence)
		return count

	@staticmethod
	def print_ungram_probs(model, vocabs):
		for vocab in vocabs:
			if vocab != const.START_TOKEN and vocab != const.END_TOKEN:
				print("{} \t {}".format(vocab if vocab != const.UNK else 'UNK', model.calc_prob(vocab)))

	@staticmethod
	def print_bigram_probs(model, vocabs):
		print("\t\t", end="")
		for vocab in vocabs:
			if vocab != const.START_TOKEN:
				print(vocab if vocab != const.UNK else "UNK", end="\t\t")
		print("")
		for vocab in vocabs:
			if vocab != const.END_TOKEN:
				print(vocab if vocab != const.UNK else "UNK", end="\t\t")
				for vocab2 in vocabs:
					if vocab2 != const.START_TOKEN:
						print("{0:.3f}".format(model.calc_prob(vocab, vocab2)), end="\t\t")
				print("")

	@staticmethod
	def print_trigram_probs(model, vocabs):
		print("\t\t", end="")
		for vocab in vocabs:
			if vocab != const.START_TOKEN:
				print(vocab if vocab != const.UNK else "UNK", end="\t")
		print("")
		for vocab in vocabs:
			if vocab != const.END_TOKEN:
				for vocab2 in vocabs:
					if vocab2 != const.START_TOKEN and vocab != const.UNK and vocab2 != const.UNK and vocab2 != const.END_TOKEN:
						print(vocab, vocab2 if vocab2 != const.UNK else "UNK", end="\t\t")
						for vocab3 in vocabs:
							if vocab3 != const.END_TOKEN
								print("{0:.3f}".format(model.calc_prob(vocab, vocab2, vocab3)), end="\t")
						print("")


### evaluate.py

模型的评估，这里主要用了困惑度Perplexity

In [None]:
import math

# 计算困惑度
def perplexity(model, sentences, cal_gram_func):
    # gram_count 词的总数，对应教程中的 M
	gram_count = cal_gram_func(sentences)
	prob_log_sum = 0
	for sentence in sentences:
		try:
			prob_log_sum -= math.log(model.calc_sentence_prob(sentence), 2)
		except:
			prob_log_sum -= float('-inf')
		return math.pow(2, prob_log_sum/gram_count)

## 结果分析

<table class="table table-bordered">  
    <tr>  
        <td>**#**</td>  
        <td>**smooth**</td>
        <td>**unsmooth**</td>
    </tr>  
    <tr>  
        <td>你好不</td>
        <td>2.99167</td>  
        <td>3.97368</td>  
    </tr>  
    <tr>  
        <td>好不你</td>
        <td>1.10409</td>  
        <td>1.21901</td>  
    </tr>  
    <tr>  
        <td>你是不</td>
        <td>1.75263</td>  
        <td>2.06712</td>  
    </tr>  
</table>  
<table class="table table-bordered">  
    <tr>  
        <td>**#**</td>  
        <td>**smooth**</td>
        <td>**unsmooth**</td>
    </tr>  
    <tr>  
        <td>Perplexity</td>
        <td>0.91272</td>  
        <td>0.89138</td>  
    </tr>  
</table>  

## 项目后续

过段时间会加入深度学习在语言模型上的应用，如果你感兴趣，可以关注我的公众号，或者star, watch 本项目哦

## 联系作者

@author sean

@qq  929325776

有什么问题，可以联系我，一起讨论