In [4]:
import pandas as pd 
import numpy as np
import nltk
import random
from sklearn.svm import SVC
from sklearn.feature_extraction import DictVectorizer

In [5]:
import pandas as pd
vi_train = pd.read_table("vi_train.pos", header = None )
vi_train.columns = ['train_tagged_words']
vi_train_tagged_words = [ tuple(tag_words.split('/')) for pair in vi_train['train_tagged_words'] for tag_words in pair.split() ]
len(vi_train_tagged_words)


195795

In [6]:
# Proprocessing of language
tagged_words_processing = []
for tag_words in vi_train_tagged_words:
    #  add tag_words have length =2 tagged_words_processing
    if len(tag_words) == 2:
        tagged_words_processing.append(tag_words)
len(tagged_words_processing)

195517

In [7]:
# 53 is length 3
# 227 is length <= 1 
# 195515 is length = 2 


In [8]:
# tags, vocalb in corpus have unique
tags = {tag for word,tag in tagged_words_processing}
vocab = {word for word,tag in tagged_words_processing}


In [25]:
# caculate emision probability
def word_given_tag(word, tag, train_bag = tagged_words_processing):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
     # count tags
    word_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    #now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(word_given_tag_list)
    return (count_w_given_tag, count_tag)

# caculate transition probability
def transition_probability(t2, t1, train_bag = tagged_words_processing):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)
trainsion_matrix_table = np.zeros((len(tags), len(tags)), dtype='float32')
for row, t1 in enumerate(list(tags)):
    for column, t2 in enumerate(list(tags)): 
        trainsion_matrix_table[row, column] = transition_probability(t2, t1)[0]/transition_probability(t2, t1)[1]
 
print(trainsion_matrix_table)

[[1.9011119e-01 1.5696534e-04 8.1883585e-03 ... 0.0000000e+00
  0.0000000e+00 1.7423153e-02]
 [8.4745765e-02 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [7.2941177e-02 0.0000000e+00 2.7450981e-02 ... 0.0000000e+00
  0.0000000e+00 8.6274510e-03]
 ...
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [2.4104235e-01 1.3029316e-04 4.2996742e-03 ... 0.0000000e+00
  0.0000000e+00 1.3811075e-02]]


In [35]:
trainsion_matrix_table = pd.DataFrame(trainsion_matrix_table, columns = list(tags), index=list(tags))
# display(trainsion_matrix_table)
# store in trainsion_matrix_table 
trainsion_matrix_table.loc['-','Nc']


0.051262435

In [38]:
# viterbiAlgorithm is dynamic programming
# It will be find the best probability each node to create setences
def viterbiAlgorithm(words, train_bag = tagged_words_processing):
    state = []
    tag_corpus = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in tag_corpus:
            if key == 0:
                transition_probability = trainsion_matrix_table.loc['.', tag]
            else:
      
                transition_probability = trainsion_matrix_table.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_probability = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            
            state_probability = emission_probability * transition_probability  
          
            p.append(state_probability)
        # getting state for which probability is maximum
        pmax = max(p)
        state_max = tag_corpus[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [39]:
# Read file test
vi_test = pd.read_table("vi_test.pos", header = None )
vi_test.columns = ['test_tag_words']
 
vi_test_tagged_words = [pair.split() for pair in vi_test['test_tag_words']]
test_set = []
for index in vi_test_tagged_words:
    data = [tuple(i.split('/')) for i  in index]
    test_set.append(data)



In [40]:
random.seed(1234)      #define a random seed to get same sentences when run multiple times
 
# choose random 10 numbers
rndom = [random.randint(1,len(test_set)) for x in range(10)]

test_run = [test_set[i] for i in rndom]
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

tagged_seq = viterbiAlgorithm(test_tagged_words)
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Viterbi Algorithm Accuracy:  94.33198380566802


In [41]:
test_sent="Dù khá đắt nhưng tôi vẫn đồng ý ."
pred_tags_rule=viterbiAlgorithm(test_sent.split())
print(pred_tags_rule)

[('Dù', 'C'), ('khá', 'R'), ('đắt', 'A'), ('nhưng', 'C'), ('tôi', 'P'), ('vẫn', 'R'), ('đồng', 'N'), ('ý', 'N'), ('.', '.')]
