## Xử lý bài toán POS Tagging với thuật toán Conditional Random Fields - CRFs

In [19]:
  import nltk

Đọc và định dạng file đầu vào

In [20]:
def read_file(filename):
  """Read a POS-tagged corpus file into a list of tagged sentences.

  Every line other than a bare newline is treated as one sentence made of
  whitespace-separated ``word/tag`` tokens.

  Args:
    filename: path to a UTF-8 encoded text file.

  Returns:
    A list of sentences; each sentence is a list of ``(word, tag)`` tuples.
    Tokens whose word is the ellipsis character U+2026 are dropped.
    A token that contains no "/" is kept as a 1-tuple, as before.
  """
  doc_for_train = []
  with open(filename, 'r', encoding='utf-8') as train_doc:
    # Iterate over the file lazily instead of materialising every line first.
    for line in train_doc:
      if line == "\n":
        continue

      sentence = []
      for token in line.split():
        # Split on the LAST slash only: the word itself may contain "/"
        # (e.g. "1/2/M" or "//CH"); a plain split would yield 3+ fields and
        # break every consumer that unpacks (word, tag) pairs.
        parts = token.rsplit("/", 1)
        if parts[0] != '\u2026':
          sentence.append(tuple(parts))
      doc_for_train.append(sentence)

  return doc_for_train


# Load the training and test corpora; each becomes a list of sentences,
# where a sentence is a list of (word, tag) tuples produced by read_file.
train_set = read_file('vi_train.pos')
test_set = read_file('vi_test.pos')
# To inspect one parsed sentence, evaluate e.g.: train_set[4]

In ra số câu trong tập train và tập test

In [21]:
# len() of each set is the number of sentences (one list entry per sentence).
print("Number of Sentences in Training Data ",len(train_set))
print("Number of Sentences in Testing Data ",len(test_set))

Number of Sentences in Training Data  8445
Number of Sentences in Testing Data  1051


Tạo features cho bộ dữ liệu

In [22]:
import re
def features(sentence,index):
    """Build the CRF feature dict for a single token.

    Args:
        sentence: list of word strings ``[w1, w2, w3, ...]``.
        index: position of the token inside ``sentence``.

    Returns:
        A dict mapping feature names to values: 0/1 integer flags plus string
        features (neighbouring words and 2-4 character prefixes/suffixes).
        Single-character prefix/suffix and first-letter-capital features are
        deliberately not emitted (they were disabled in the original cell).
    """
    word = sentence[index]
    is_first = index == 0
    is_last = index == len(sentence) - 1
    return {
        'is_first_word': int(is_first),
        'is_last_word': int(is_last),
        # NOTE: this is also 1 for tokens with no cased characters (digits,
        # punctuation), because upper() leaves such tokens unchanged.
        'is_complete_capital': int(word.upper() == word),
        'prev_word': '' if is_first else sentence[index - 1],
        'next_word': '' if is_last else sentence[index + 1],
        'is_numeric': int(word.isdigit()),
        # 1 when the token contains at least one digit AND at least one ASCII
        # letter, in any order.  The previous pattern had a stray "$" inside
        # the first lookahead, so it only fired when the token ENDED with a
        # digit ("ab1" matched, "a1b" did not).
        'is_alphanumeric': int(bool(re.match('^(?=.*[0-9])(?=.*[a-zA-Z])', word))),
        'prefix_2': word[:2],
        'prefix_3': word[:3],
        'prefix_4': word[:4],
        'suffix_2': word[-2:],
        'suffix_3': word[-3:],
        'suffix_4': word[-4:],
        'word_has_hyphen': 1 if '-' in word else 0
    }
def untag(sentence):
    """Strip the tags from a tagged sentence.

    Takes an iterable of ``(word, tag)`` pairs and returns the list of words
    in the same order.
    """
    words = []
    for token, _label in sentence:
        words.append(token)
    return words
 
 
def prepareData(tagged_sentences):
    """Convert tagged sentences into parallel feature / label sequences.

    Args:
        tagged_sentences: iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        ``(X, y)`` where ``X[i]`` is the list of per-token feature dicts for
        sentence ``i`` and ``y[i]`` is the list of its tags, in the same order.
    """
    X,y=[],[]
    for tagged_sentence in tagged_sentences:
        # Strip the tags once per sentence; previously untag() was re-run for
        # every token index, making the work quadratic in sentence length.
        words = untag(tagged_sentence)
        X.append([features(words, index) for index in range(len(words))])
        y.append([tag for _word, tag in tagged_sentence])
    return X,y
# Build per-token feature dicts (X_*) and the matching tag sequences (y_*)
# for the training and test corpora.
X_train,y_train=prepareData(train_set)
X_test,y_test=prepareData(test_set)

Xây dựng mô hình CRFs

In [23]:
! pip install sklearn_crfsuite
from sklearn_crfsuite import CRF



In [24]:
# CRF sequence model trained with the L-BFGS algorithm.  Per the
# sklearn-crfsuite documentation, c1 and c2 are the L1 and L2 regularisation
# coefficients, and all_possible_transitions=True also generates transition
# features for label pairs that never occur in the training data.
crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# Fit on the per-token feature dicts and their tag sequences.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.01, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

Kết quả chạy mô hình

In [25]:
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

# flat_accuracy_score flattens all sentences and reports token-level accuracy,
# so the printed labels say "Accuracy" (they previously claimed an F1 / F score
# although no F-measure is computed here).
y_pred=crf.predict(X_test)
print("Accuracy on Test Data ")
print(metrics.flat_accuracy_score(y_test, y_pred))

# Accuracy on the data the model was fitted on, for comparison with the
# held-out figure above.
y_pred_train=crf.predict(X_train)
print("Accuracy on Training Data ")
print(metrics.flat_accuracy_score(y_train, y_pred_train))

F1 score on Test Data 
0.9331350892098556
F score on Training Data 
0.9748369058713886


Kết quả rất tốt (độ chính xác theo từng token – accuracy, tính bằng `flat_accuracy_score`, không phải F1):
  ~97% trên dữ liệu train
  ~93% trên dữ liệu test

Tiến hành gán nhãn câu đầu vào: "Dù khá đắt nhưng tôi vẫn đồng ý."

In [26]:
! pip install pyvi



In [27]:
from pyvi import ViTokenizer
# Demo sentence written with "đồng_ý" already joined as a single token.
# NOTE(review): `c` is not referenced anywhere else in the visible cells —
# confirm whether it is still needed.
c = 'Dù khá đắt nhưng tôi vẫn đồng_ý.'

def pos_tag(sentence):
  """Tag a tokenised sentence with the trained CRF model.

  ``sentence`` is a list of word strings.  Returns a list of ``(word, tag)``
  pairs, pairing every input word with the tag predicted by the global ``crf``.
  """
  token_features = []
  for position in range(len(sentence)):
    token_features.append(features(sentence, position))
  # predict() takes a batch of sequences; wrap the single sentence and unwrap
  # the single result.
  predicted_tags = crf.predict([token_features])[0]
  return list(zip(sentence, predicted_tags))

# Word-segment the raw sentence with pyvi (the printed result shows the
# syllables of "đồng ý" joined as "đồng_ý"), split the segmented string on
# whitespace, then tag every token.
result = pos_tag(ViTokenizer.tokenize("Dù khá đắt nhưng tôi vẫn đồng ý.").split())

print(result)

[('Dù', 'C'), ('khá', 'R'), ('đắt', 'A'), ('nhưng', 'C'), ('tôi', 'P'), ('vẫn', 'R'), ('đồng_ý', 'V'), ('.', '.')]
