In [17]:
import numpy as np

In [18]:
class HMM_BIO_method3:
    """First-order hidden Markov model for character-level BIO tagging.

    The corpus file holds one ``<character> <tag>`` pair per line (the
    character is the first code point, the tag starts at column 2) and uses
    blank lines to separate sentences.

    Attributes set by ``__init__``:
        filename: path of the corpus used for both vocabulary and training.
        word2id / id2word: character <-> column index of ``B``.
        tag2id / id2tag: tag <-> state index.
        M, N: number of distinct characters / tags.
        pi: initial-state vector, shape ``(N,)``.
        A: transition matrix, shape ``(N, N)``; ``A[i][j]`` is tag i -> tag j.
        B: emission matrix, shape ``(N, M)``.
    """

    # Probability substituted for an exact zero before taking a logarithm.
    SMOOTHING = 0.00001

    def __init__(self,filename='corpus/corpus_1.txt'):
        """Build the character and tag vocabularies from ``filename``.

        Args:
            filename: path to the UTF-8 corpus file. The same file is read
                again by ``train``.

        Raises:
            OSError: if the corpus file cannot be opened.
        """
        self.filename=filename
        # Vocabularies: tags are entity labels (e.g. B-ORG), words are single characters.
        self.tag2id, self.id2tag  = {},{}
        self.word2id, self.id2word = {},{}

        # Read the corpus that was actually requested (not a hard-coded path).
        for pair in self._read_pairs():
            if pair is None:
                continue
            word, tag = pair

            if word not in self.word2id:
                self.word2id[word] = len(self.word2id)
                self.id2word[len(self.id2word)] = word

            if tag not in self.tag2id:
                self.tag2id[tag] = len(self.tag2id)
                self.id2tag[len(self.id2tag)] = tag
        print(self.tag2id)

        # Parameter tables, filled in by train().
        self.M = len(self.word2id)
        self.N = len(self.tag2id)

        self.pi = np.zeros(self.N)
        self.A = np.zeros((self.N,self.N))
        self.B = np.zeros((self.N,self.M))

        print(self.M)
        print(self.N)

    def _read_pairs(self):
        """Yield ``(character, tag)`` for each corpus line, ``None`` at a sentence break.

        The tag is stripped of surrounding whitespace so that the vocabulary
        and the training pass always agree, and so that a final line without
        a trailing newline keeps its last character.
        """
        with open(self.filename,encoding='utf-8') as corpus:
            for line in corpus:
                if not line.strip():
                    yield None
                    continue
                yield line[0], line[2:].strip()

    @staticmethod
    def _normalize_rows(counts):
        """Divide each row by its sum; rows that sum to zero stay all-zero."""
        totals = counts.sum(axis=-1, keepdims=True)
        return np.divide(counts, totals, out=np.zeros_like(counts), where=totals > 0)

    def train(self):
        """Estimate ``pi``, ``A`` and ``B`` by counting over the corpus.

        The tables are reset first, so calling ``train`` repeatedly yields the
        same parameters. Raw counts and the normalised tables are printed.

        Raises:
            OSError: if the corpus file cannot be opened.
            KeyError: if the corpus contains a character or tag that was not
                present when the vocabularies were built.
        """
        # Start from zero so a second call does not add counts to probabilities.
        self.pi = np.zeros(self.N)
        self.A = np.zeros((self.N,self.N))
        self.B = np.zeros((self.N,self.M))

        # None marks "start of sentence"; it cannot collide with a real tag id.
        prev_tag_id = None
        for pair in self._read_pairs():
            if pair is None:
                prev_tag_id = None
                continue

            word, tag = pair
            word_id, tag_id = self.word2id[word], self.tag2id[tag]

            self.B[tag_id][word_id] += 1
            if prev_tag_id is None:
                self.pi[tag_id] += 1
            else:
                self.A[prev_tag_id][tag_id] += 1

            prev_tag_id = tag_id  # remembered for the next time step
        print('参数统计结果(A,pi)')
        print(f'A:{self.A}')
        print(f'pi:{self.pi}')

        # Convert counts to probabilities. A tag with no outgoing transition
        # keeps an all-zero row instead of becoming NaN.
        self.pi = self._normalize_rows(self.pi)
        self.A = self._normalize_rows(self.A)
        self.B = self._normalize_rows(self.B)

        print('训练参数结果(A,B,PI)')
        print(f'A:{self.A}')
        print(f'B:{self.B}')
        print(f'PI:{self.pi}')

    def log(self,v):
        """Return ``log(v)``, replacing an exact zero by ``SMOOTHING`` first.

        Working in log space turns the product of probabilities into a sum.
        """
        if v==0:
            return np.log(v+self.SMOOTHING)
        return np.log(v)

    def _log_array(self, values):
        """Element-wise counterpart of ``log`` for a whole array."""
        values = np.asarray(values, dtype=float)
        return np.log(np.where(values == 0, self.SMOOTHING, values))

    def _emission_scores(self, log_B, char):
        """Log emission score of ``char`` under every tag.

        A character that is not in the vocabulary gets the smoothing floor for
        every tag, so the transitions alone decide its label.
        """
        word_id = self.word2id.get(char)
        if word_id is None:
            return np.full(self.N, np.log(self.SMOOTHING))
        return log_B[:, word_id]

    def predict(self,x):
        """Tag ``x`` with the Viterbi algorithm, print and return the result.

        Args:
            x: sequence of single characters (typically a ``str``).

        Returns:
            The tagged string, each character followed by its tag and ``_|``.
            An empty input yields an empty string.
        """
        chars = x
        T = len(chars)
        if T == 0:
            print("")
            print()
            return ""

        log_pi = self._log_array(self.pi)
        log_A = self._log_array(self.A)
        log_B = self._log_array(self.B)

        dp = np.zeros((T,self.N))  # best log score per (time step, tag)
        ptr = np.zeros((T,self.N),dtype=int)  # best previous tag per (time step, tag)

        # The first time step uses the initial distribution instead of a transition.
        dp[0] = log_pi + self._emission_scores(log_B, chars[0])

        for t in range(1, T):
            # scores[k][j]: best path ending in tag k at t-1, then moving to tag j.
            scores = (dp[t-1][:, None] + log_A) + self._emission_scores(log_B, chars[t])[None, :]
            # argmax keeps the first maximum, matching a strict ">" scan over k.
            ptr[t] = np.argmax(scores, axis=0)
            dp[t] = np.max(scores, axis=0)

        best_seq = [0]*T
        best_seq[T-1] = int(np.argmax(dp[T-1]))  # best final tag

        # Follow the back-pointers from the last step to the first.
        for t in range(T-2, -1, -1):
            best_seq[t] = int(ptr[t+1][best_seq[t+1]])

        ans = "".join(
            char + self.id2tag[tag_id] + '_|' for char, tag_id in zip(chars, best_seq)
        )
        print(ans)
        print()
        return ans

In [19]:
# Model 3: constructing the object builds the character/tag vocabularies from the corpus
# and prints tag2id, M and N; parameter estimation itself happens in train().
hmm_method3=HMM_BIO_method3('corpus/corpus_1.txt')

{'O': 0, 'B-ORG': 1, 'I-ORG': 2}
261
3


In [20]:
# Count pi/A/B over the corpus and normalise them; prints the raw counts and the resulting tables.
hmm_method3.train()

参数统计结果(A,pi)
A:[[664.  10.   0.]
 [  0.   0.  17.]
 [ 10.   0. 122.]]
pi:[88.  7.  0.]
训练参数结果(A,B,PI)
A:[[0.9851632  0.0148368  0.        ]
 [0.         0.         1.        ]
 [0.07575758 0.         0.92424242]]
B:[[0.00262467 0.0144357  0.00656168 0.00918635 0.00131234 0.00131234
  0.00262467 0.01049869 0.03149606 0.01312336 0.03149606 0.0183727
  0.03937008 0.00787402 0.05249344 0.00656168 0.00131234 0.0144357
  0.01574803 0.00131234 0.00262467 0.00131234 0.00131234 0.00131234
  0.00656168 0.00131234 0.0144357  0.00656168 0.00131234 0.00131234
  0.00131234 0.         0.00918635 0.00656168 0.00787402 0.00918635
  0.00787402 0.00787402 0.01049869 0.00787402 0.00787402 0.00131234
  0.00131234 0.00131234 0.00787402 0.00524934 0.00524934 0.01706037
  0.01706037 0.00524934 0.00524934 0.00656168 0.00393701 0.00393701
  0.00262467 0.00787402 0.00262467 0.00262467 0.00393701 0.00393701
  0.00131234 0.00131234 0.00262467 0.         0.         0.
  0.         0.00656168 0.         0.         0

In [21]:
# Test: tag each sample string character by character; predict() prints the tagged result.
words=["中国",'龙川县博物馆馆藏文物']
for word in words:
    hmm_method3.predict(word)

中O_|国B-ORG_|

龙B-ORG_|川I-ORG_|县I-ORG_|博I-ORG_|物I-ORG_|馆I-ORG_|馆I-ORG_|藏O_|文O_|物O_|

