In [1]:
from collections import Counter

import pandas as pd
import sklearn_crfsuite
from pythainlp.tokenize import word_tokenize
from sklearn_crfsuite import metrics

# Toy training corpus: one inner list per sentence, one tuple per token.
# Tuple layout: (token, POS tag, NER label).
# Every POS slot holds the placeholder "XX" in this data set.
# Labels that occur here: O, B-PER, B-LOC, B-ORG, I-ORG.
train_data = [
    [("นาย", "XX", "O"), ("สมชาย", "XX", "B-PER"), ("รัก", "XX", "O"), ("ประเทศไทย", "XX", "B-LOC")],
    [("เขา", "XX", "O"), ("ทำงาน", "XX", "O"), ("ที่", "XX", "O"), ("บริษัท", "XX", "B-ORG"), ("กูเกิล", "XX", "I-ORG")],
    [("ปรีชา", "XX", "B-PER"), ("ไป", "XX", "O"), ("เชียงใหม่", "XX", "B-LOC"), ("กับ", "XX", "O"), ("มานี", "XX", "B-PER")],
    [("ร้าน", "XX", "B-ORG"), ("เซเว่น", "XX", "I-ORG"), ("มี", "XX", "O"), ("สาขา", "XX", "O"), ("เยอะ", "XX", "O")]
]

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence of tuples
        One sentence; only the first field (the token text) of each tuple
        is read here.
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        Always contains ``'bias'``, ``'word'`` and ``'is_digit'``.
        Adds ``'-1:word'`` / ``'-1:is_digit'`` when a previous token exists,
        otherwise ``'BOS'``. Adds ``'+1:word'`` when a next token exists,
        otherwise ``'EOS'``.
    """
    token = sent[i][0]

    feats = {
        'bias': 1.0,
        'word': token,
        'is_digit': token.isdigit(),
    }

    # Left context: the previous token, or a beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        feats['-1:word'] = prev_token
        feats['-1:is_digit'] = prev_token.isdigit()

    # Right context: the next token, or an end-of-sentence marker.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        feats['+1:word'] = sent[i + 1][0]

    return feats

def sent2features(sent):
    """Return the list of ``word2features`` dicts for every position of ``sent``, in order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the label (third field) of each ``(token, postag, label)`` triple in ``sent``."""
    labels = []
    # Unpack all three fields so that malformed tuples still raise.
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

# Convert every training sentence into its feature sequence and its label sequence.
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))

In [3]:
# Build the CRF sequence tagger and train it on the toy corpus.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1, # L1 regularization coefficient
    c2=0.1, # L2 regularization coefficient
    max_iterations=100,
    # NOTE(review): per the sklearn-crfsuite docs this also generates transition
    # features for label pairs that never occur in the training data — confirm
    # against the installed version.
    all_possible_transitions=True
)

# X_train / y_train come from the feature-extraction cell, which must be run first.
crf.fit(X_train, y_train)

def print_transitions(trans_features):
    """Print one aligned line per label transition.

    Parameters
    ----------
    trans_features : iterable of ((label_from, label_to), weight)
        Pairs of a two-label key and its numeric weight, e.g. the result of
        ``Counter(...).most_common(n)`` on a transition-weight mapping.

    Each line is formatted as ``"<from> -> <to> <weight>"`` with the labels
    left-justified to 6 and 7 characters and the weight shown with six
    decimal places. Nothing is returned.
    """
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

# Rank the learned (label_from, label_to) transition weights and show the five largest.
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(5))

Top likely transitions:
B-ORG  -> I-ORG   2.047731
O      -> B-LOC   0.969252
O      -> B-PER   0.695244
B-PER  -> O       0.627982
O      -> O       0.244944


In [4]:
def predict_ner(text):
    """Tokenize ``text`` and tag each token with the trained module-level ``crf``.

    Parameters
    ----------
    text : str
        Raw text; it is split with PyThaiNLP's ``word_tokenize`` using the
        ``"newmm"`` engine.

    Returns
    -------
    list of (token, label)
        Each token paired with the label predicted for it.
    """
    words = word_tokenize(text, engine="newmm")

    # sent2features expects (token, postag, label) triples, so pad each token
    # with the same placeholder tag and label.
    placeholder_sent = [(w, "XX", "O") for w in words]

    # predict() takes a batch of sentences; send one and take its result.
    tags = crf.predict([sent2features(placeholder_sent)])[0]

    return list(zip(words, tags))

# Try the tagger on a sentence that is not one of the train_data sentences.
new_text = "นายสมชายจะไปเที่ยวเชียงใหม่กับบริษัทกูเกิล"
print(predict_ner(new_text))

[('นาย', 'O'), ('สม', 'B-PER'), ('ชาย', 'O'), ('จะ', 'O'), ('ไปเที่ยว', 'O'), ('เชียงใหม่', 'B-LOC'), ('กับ', 'O'), ('บริษัท', 'B-ORG'), ('กูเกิล', 'I-ORG')]
