## Method 1: Use NLTK `pos_tag()` directly
Adapted from the NLTK book, chapter 5: https://www.nltk.org/book/ch05.html

In [1]:
import nltk
# Uncomment on first run to download the tagger resource used by nltk.pos_tag().
#nltk.download('averaged_perceptron_tagger')
# Split the sentence into tokens, then tag every token with its part of speech.
text = nltk.word_tokenize("Social Security number , passport number and details about the services provided for the payment")
nltk.pos_tag(text)

[('Social', 'NNP'),
 ('Security', 'NNP'),
 ('number', 'NN'),
 (',', ','),
 ('passport', 'JJ'),
 ('number', 'NN'),
 ('and', 'CC'),
 ('details', 'NNS'),
 ('about', 'IN'),
 ('the', 'DT'),
 ('services', 'NNS'),
 ('provided', 'VBD'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('payment', 'NN')]

In [2]:
# "refuse" and "permit" each occur twice in this sentence; the output shows
# which tag the tagger assigns to each occurrence.
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

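Notice that *refuse* and *permit* each appear once as a verb (*refuse* as VBP, *permit* as VB) and once as a noun (NN). For example, refUSE is a verb meaning "deny," while REFuse is a noun meaning "trash"; the two forms are spelled the same but pronounced differently (i.e. they are not homophones).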

In [3]:
# Needs the Brown corpus; uncomment on first run to download it.
#nltk.download('brown')
# Lower-case every token of the Brown corpus and wrap the stream in an nltk.Text.
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
# Print the words nltk.Text.similar() reports for 'woman' (a noun).
text.similar('woman')

man time day year car moment world house family child country boy
state job place way war girl work word


In [4]:
# Same query for a verb form.
text.similar('bought')

made said done put had seen found given left heard was been brought
set got that took in told felt


In [5]:
# Same query for another noun.
text.similar('summer')

time year way day country point other one man church world case
morning moment city act state night school week


In [17]:
# Tokenize a minimal two-word string.
nltk.word_tokenize('hello world')

['hello', 'world']

In [18]:
# pos_tag() takes a list of tokens, here passed directly instead of via word_tokenize().
nltk.pos_tag(['hello', 'world'])

[('hello', 'NN'), ('world', 'NN')]

## Method 2: Implement POS tagging from scratch

In [7]:
# Bidirectional lookup tables built from the training file.
tag2id, id2tag = {}, {}    # tag <-> id, e.g. tag2id: {"VB": 0, "NNP": 1, ...}, id2tag: {0: "VB", 1: "NNP", ...}
word2id, id2word = {}, {}  # word <-> id

# Each line of traindata.txt is read as "word/TAG". The context manager makes
# sure the file handle is closed even if parsing fails part-way through.
with open('traindata.txt') as train_file:
    for line in train_file:
        items = line.split('/')
        if len(items) < 2:
            # Blank line or line without a "/" separator: nothing to extract.
            continue
        word, tag = items[0], items[1].rstrip()  # the word and its tag on this line

        # Ids are handed out in order of first appearance, so the next free id
        # is always the current size of the table.
        if word not in word2id:
            new_id = len(word2id)
            word2id[word] = new_id
            id2word[new_id] = word
        if tag not in tag2id:
            new_id = len(tag2id)
            tag2id[tag] = new_id
            id2tag[new_id] = tag

M = len(word2id)  # M: number of distinct words in the dictionary
N = len(tag2id)   # N: number of distinct tags in the tag set

In [19]:
# Show the dictionary size, the tag-set size and both tag lookup tables,
# one item per line (the '\n' item yields the blank separator lines).
print(M, N, tag2id, '\n', id2tag, sep='\n')

18978
54
{'NNP': 0, ',': 1, 'VBG': 2, 'TO': 3, 'VB': 4, 'NN': 5, 'IN': 6, 'JJ': 7, 'VBD': 8, 'NNS': 9, 'CD': 10, 'CC': 11, 'PRP': 12, 'MD': 13, 'DT': 14, '.': 15, 'VBZ': 16, 'VBN': 17, 'WDT': 18, 'VBP': 19, 'POS': 20, 'RB': 21, '$': 22, 'PRP$': 23, ':': 24, 'JJR': 25, '``': 26, "''": 27, 'WP': 28, 'JJS': 29, 'WRB': 30, 'RBR': 31, 'NNPS': 32, 'RP': 33, 'WP$': 34, 'EX': 35, '(': 36, ')': 37, 'PDT': 38, 'RBS': 39, 'FW': 40, 'UH': 41, 'SYM': 42, 'LS': 43, '#': 44, 'VBG|NN': 45, 'JJ|NN': 46, 'RB|IN': 47, 'NNS|NN': 48, 'VBN|JJ': 49, 'VB|NN': 50, 'RBR|JJR': 51, 'NN|NNS': 52, 'JJ|RB': 53}


{0: 'NNP', 1: ',', 2: 'VBG', 3: 'TO', 4: 'VB', 5: 'NN', 6: 'IN', 7: 'JJ', 8: 'VBD', 9: 'NNS', 10: 'CD', 11: 'CC', 12: 'PRP', 13: 'MD', 14: 'DT', 15: '.', 16: 'VBZ', 17: 'VBN', 18: 'WDT', 19: 'VBP', 20: 'POS', 21: 'RB', 22: '$', 23: 'PRP$', 24: ':', 25: 'JJR', 26: '``', 27: "''", 28: 'WP', 29: 'JJS', 30: 'WRB', 31: 'RBR', 32: 'NNPS', 33: 'RP', 34: 'WP$', 35: 'EX', 36: '(', 37: ')', 38: 'PDT', 39: 'RBS', 40: 

In [20]:
# 构建 pi, A, B
# Allocate the three HMM parameter arrays, all zero to begin with.
import numpy as np
pi = np.zeros(shape=N)        # pi[i]: probability that tag i is the first tag of a sentence (N = number of tags)
A = np.zeros(shape=(N, M))    # A[i][j]: probability of word j given tag i (M = number of words in the dictionary)
B = np.zeros(shape=(N, N))    # B[i][j]: probability of moving from tag i to tag j
print(A, B, pi, sep='\n')

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]


In [21]:
# Start from fresh zero arrays so that re-running this cell does not add new
# counts on top of values that were already normalised by a previous run.
pi = np.zeros(N)       # pi[i]: how often tag i starts a sentence
A = np.zeros((N, M))   # A[i][j]: how often word j is seen with tag i
B = np.zeros((N, N))   # B[i][j]: how often tag i is followed by tag j

prev_tag = ""  # "" means the next token is the first token of a sentence
with open('traindata.txt') as train_file:  # closed automatically, even on error
    for line in train_file:
        items = line.split('/')
        if len(items) < 2:
            # Blank line or line without a "/" separator: nothing to count.
            continue
        word, tag = items[0], items[1].rstrip()
        wordId, tagId = word2id[word], tag2id[tag]

        A[tagId][wordId] += 1                # emission count, for every token
        if prev_tag == "":                   # start of a sentence
            pi[tagId] += 1
        else:                                # inside a sentence
            B[tag2id[prev_tag]][tagId] += 1

        # NOTE(review): only a literal "." token ends a sentence here; tokens such
        # as "?" or "!" do not reset the state. Confirm this matches the corpus.
        prev_tag = "" if word == "." else tag

# normalize counts into probabilities
pi_total = pi.sum()
if pi_total > 0:
    pi = pi / pi_total

# Divide each row by its total. Rows whose total is 0 (a tag with no recorded
# emissions/transitions) are left as all zeros instead of becoming NaN via 0/0.
emission_totals = A.sum(axis=1, keepdims=True)
np.divide(A, emission_totals, out=A, where=emission_totals > 0)
transition_totals = B.sum(axis=1, keepdims=True)
np.divide(B, transition_totals, out=B, where=transition_totals > 0)

# All model parameters (pi, A, B) are now estimated.

#  到此为止计算完了模型的所有的参数： pi, A, B

In [22]:
def log(v, eps=0.000001):
    """
    Natural logarithm that never evaluates log(0).

    v: a number or an array-like of numbers
    eps: value substituted for every entry of v that is exactly 0
         (the default keeps the previous hard-coded floor of 0.000001)

    Returns np.log(v) with zeros replaced by eps: a NumPy scalar for scalar
    input, an array of the same shape for array input.
    """
    values = np.asarray(v, dtype=float)
    # Only exact zeros are replaced; every other value goes to np.log unchanged.
    return np.log(np.where(values == 0, eps, values))

In [23]:
def viterbi(x, pi, A, B):
    """
    Print the most likely tag sequence for a sentence (Viterbi decoding).

    x: sentence as a whitespace-separated string, e.g. "I like playing soccer";
       every token must be a key of the global `word2id`
    pi: pi[j] is the probability that tag j starts a sentence
    A: A[j][w] is the probability of word id w given tag j
    B: B[k][j] is the probability of moving from tag k to tag j

    Prints one tag per token (ids are mapped through the global `id2tag`) and
    returns None. Nothing is printed for an empty / whitespace-only sentence.

    Raises KeyError if a token is not in `word2id`.
    """
    # split() without arguments ignores leading/trailing and repeated whitespace.
    tokens = x.split()
    if not tokens:
        return

    unknown = [token for token in tokens if token not in word2id]
    if unknown:
        raise KeyError("word(s) not in the training vocabulary: " + ", ".join(unknown))

    word_ids = [word2id[token] for token in tokens]  # e.g. [4521, 412, 542, ...]
    T = len(word_ids)
    n_tags = len(pi)

    # Take every logarithm once up front instead of inside the innermost loop.
    # log() is called on scalars only.
    log_pi = np.array([log(pi[j]) for j in range(n_tags)], dtype=float)
    log_trans = np.array([[log(B[k][j]) for j in range(n_tags)]
                          for k in range(n_tags)], dtype=float)
    log_emit = np.array([[log(A[j][w]) for j in range(n_tags)]
                         for w in word_ids], dtype=float)

    dp = np.empty((T, n_tags))              # dp[i][j]: best log-score of w1..wi with wi tagged j
    ptr = np.zeros((T, n_tags), dtype=int)  # ptr[i][j]: tag of w(i-1) on that best path

    # Base case: first word.
    dp[0] = log_pi + log_emit[0]

    for i in range(1, T):
        # scores[k][j]: best path ending in tag k at i-1, followed by transition k -> j.
        scores = dp[i - 1][:, None] + log_trans
        # A NaN score must never be selected as the best predecessor.
        scores = np.where(np.isnan(scores), -np.inf, scores)
        ptr[i] = scores.argmax(axis=0)  # first maximum wins on ties
        # The emission term does not depend on k, so it is added after the max.
        dp[i] = scores.max(axis=0) + log_emit[i]

    # Decoding: start from the best tag of the last word and follow the
    # back-pointers towards the first word.
    best_seq = [0] * T
    best_seq[T - 1] = int(np.argmax(dp[T - 1]))
    for i in range(T - 2, -1, -1):
        best_seq[i] = int(ptr[i + 1][best_seq[i + 1]])

    # best_seq now holds the tag id chosen for each token of x.
    for tag_id in best_seq:
        print(id2tag[tag_id])
    

In [24]:
# Same sentence that was tagged with nltk.pos_tag() in the first cell, so the
# two outputs can be compared tag by tag.
x = "Social Security number , passport number and details about the services provided for the payment"
viterbi(x, pi, A, B)



NNP
NNP
NN
,
NN
NN
CC
NNS
IN
DT
NNS
VBN
IN
DT
NN
