In [1]:
import pandas as pd
from pathlib import Path
import os

In [2]:
# Corpus directory, given relative to the current working directory.
train = Path('.') / 'blackboard-treebank' / 'thai10'

In [3]:
# Keep every entry of the corpus directory except hidden ones (names that
# start with "."). Path.iterdir() yields entries in arbitrary order, so the
# result is sorted to make the file order -- and everything derived from it --
# identical on every run.
# Code adapted from https://colab.research.google.com/drive/1Ur-p2yQBf5JsUJbJJlZsm2VqNPY8gIkL
train_files = sorted(f for f in train.iterdir() if not f.name.startswith('.'))
len(train_files)

25

In [6]:
def readfile(path):
    """Read one tab-separated treebank file into a flat list of pairs.

    Lines that start with "#" are skipped as comments. Blank or
    whitespace-only lines (block separators, a trailing newline at the end of
    the file) are skipped as well; previously such a line could reach the
    column lookup and raise IndexError. From every remaining line the 2nd and
    4th tab-separated columns are kept (the rest of this notebook uses them as
    the word and its POS tag).

    Args:
        path: File to read. It is decoded as UTF-8 and a leading byte-order
            mark, if present, is dropped.

    Returns:
        A list of ``(column_2, column_4)`` tuples in file order. Boundaries
        between blank-line-separated blocks are not preserved.

    Raises:
        IndexError: If a non-blank, non-comment line has fewer than four
            tab-separated columns.
    """
    pairs = []
    with open(path, 'r', encoding="utf-8-sig") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if line.startswith("#"):
                continue
            # Surrounding whitespace is removed before splitting, so a line
            # holding only whitespace collapses to a single empty field.
            columns = line.strip().split("\t")
            if columns == ['']:
                continue
            pairs.append((columns[1], columns[3]))
    return pairs

In [26]:
# Start with an empty list; it will hold one readfile() result per training file.
train_data_temp = list()

In [27]:
# Build the list in one assignment instead of appending to an existing list,
# so re-running this cell cannot duplicate the loaded data.
train_data_temp = [readfile(file) for file in train_files]

In [32]:
# Number of per-file token lists collected so far.
len(train_data_temp)

25

In [29]:
def make_cls(index):
    """Split a flat token sequence into clauses and label clause boundaries.

    ``index`` is a sequence of ``(word, postag)`` pairs in which the pair
    ``('$$', '$$')`` marks the end of a clause. Every other pair becomes a
    ``[word, postag, label]`` list. The label is ``'B_CLS'`` for the first
    token of a clause, ``'I_CLS'`` for inner tokens and ``'E_CLS'`` for the
    last token. A clause consisting of a single token is labelled ``'E_CLS'``
    (the end label takes precedence over the begin label).

    Compared with the previous implementation:
      * the final token of the input is no longer dropped when the input
        does not end with a delimiter;
      * a clause that is still open when the input ends is always emitted,
        even if it holds a single token;
      * a delimiter is never emitted as a token, so leading or consecutive
        delimiters produce neither a fake token nor an empty clause.

    Args:
        index: Sequence of ``(word, postag)`` pairs.

    Returns:
        A list of clauses; each clause is a non-empty list of
        ``[word, postag, label]`` lists. Empty input yields ``[]``.
    """
    delimiter = ('$$', '$$')
    clauses = []
    current = []

    def _close(clause):
        # Relabel the clause's last token as its end and store the clause.
        clause[-1][-1] = 'E_CLS'
        clauses.append(clause)

    for pair in index:
        if pair == delimiter:
            # A delimiter only closes a clause that actually has tokens.
            if current:
                _close(current)
                current = []
            continue
        label = "I_CLS" if current else "B_CLS"
        current.append(list(pair) + [label])

    # The input may end without a trailing delimiter.
    if current:
        _close(current)
    return clauses

In [30]:
# Concatenate the labelled clauses of all files into a single list.
train_data = []

for file in train_data_temp:
    train_data += make_cls(file)

In [31]:
# Total number of clauses produced by make_cls across all files.
len(train_data)

91700

In [33]:
def doc2features(doc, i):
    """Build the feature dictionary for the token at position ``i`` of ``doc``.

    Each item of ``doc`` is read as ``item[0]`` (the word) and ``item[1]``
    (its POS tag). The result describes the current token and, where they
    exist, its immediate left and right neighbours. A missing neighbour is
    signalled with the ``'BOS'`` or ``'EOS'`` flag instead.

    Args:
        doc: Sequence of token records.
        i: Position of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    token, tag = doc[i][0], doc[i][1]
    feats = {
        'word.curr_word': token,
        'word.curr_isspace': token.isspace(),
        'word.curr_postag': tag,
        'word.curr_isdigit': token.isdigit(),
    }

    # Left neighbour, or the beginning-of-sequence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        left_token, left_tag = doc[i - 1][0], doc[i - 1][1]
        feats.update({
            'word.prev_word': left_token,
            'word.prev_isspace': left_token.isspace(),
            'word.prev_postag': left_tag,
            'word.prev_isdigit': left_token.isdigit(),
        })

    # Right neighbour, or the end-of-sequence marker.
    if i >= len(doc) - 1:
        feats['EOS'] = True
    else:
        right_token, right_tag = doc[i + 1][0], doc[i + 1][1]
        feats.update({
            'word.next_word': right_token,
            'word.next_isspace': right_token.isspace(),
            'word.next_postag': right_tag,
            'word.next_isdigit': right_token.isdigit(),
        })

    return feats

def extract_features(doc):
    """Return the feature dictionaries for every position of ``doc``, in order."""
    features = []
    for position in range(len(doc)):
        features.append(doc2features(doc, position))
    return features

def get_labels(doc):
    """Return the third field (the label) of every 3-field record in ``doc``."""
    labels = []
    for _word, _postag, label in doc:
        labels.append(label)
    return labels

In [34]:
from tqdm.auto import tqdm

In [35]:
# Per-clause feature dictionaries (inputs) and label sequences (targets).
X_data = list(map(extract_features, tqdm(train_data)))
y_data = list(map(get_labels, tqdm(train_data)))

  0%|          | 0/91700 [00:00<?, ?it/s]

  0%|          | 0/91700 [00:00<?, ?it/s]

In [36]:
import pycrfsuite

In [37]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_data, y_data):
    try:
        trainer.append(xseq, yseq)
    except Exception as err:
        # Best effort: report the rejected sequence and carry on with the rest.
        # Catching Exception instead of using a bare "except:" lets
        # KeyboardInterrupt (kernel interrupt) and SystemExit stop the loop.
        print(repr(err))
        print(xseq, yseq)

CPU times: user 10.9 s, sys: 320 ms, total: 11.2 s
Wall time: 11.2 s


In [38]:
# Training hyper-parameters, kept in one named mapping so they are easy to find.
crf_params = {
    'c1': 0.1,  # coefficient for the L1 penalty
    'c2': 0.1,  # coefficient for the L2 penalty
    'max_iterations': 500,  # upper bound on the number of training iterations
    # also generate transition features for label pairs never observed in the data
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

In [39]:
%%time
# Fit the model on the sequences appended to the trainer and write it to this file.
trainer.train('blackboard-cls.crfsuite')

CPU times: user 1min 4s, sys: 105 ms, total: 1min 4s
Wall time: 1min 4s


In [40]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from itertools import chain
def bio_classification_report(y_true, y_pred):
    """
    Build a token-level classification report for sequences of labels.

    ``y_true`` and ``y_pred`` are iterables of label sequences. Both are
    flattened, one-hot encoded with a LabelBinarizer fitted on the true
    labels, and passed to scikit-learn's classification_report. The "O"
    label, if present, is left out of the report.

    Note: scikit-learn 0.15+ (or a version from github master) is required
    for the averages to be calculated properly.
    """
    binarizer = LabelBinarizer()
    true_matrix = binarizer.fit_transform([label for seq in y_true for label in seq])
    pred_matrix = binarizer.transform([label for seq in y_pred for label in seq])

    # Column position of every known class in the binarized matrices.
    column_of = {}
    for position, name in enumerate(binarizer.classes_):
        column_of[name] = position

    def _suffix_then_prefix(tag):
        # "B-X" sorts by "X" first and "B" second; a tag without "-" sorts by itself.
        return tag.split('-', 1)[::-1]

    reported = sorted(
        (name for name in binarizer.classes_ if name != 'O'),
        key=_suffix_then_prefix,
    )

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=[column_of[name] for name in reported],
        target_names=reported,
    )

In [41]:
# NOTE(review): the evaluation set is built from train_data, i.e. the same
# clauses the model was fitted on, so the report below measures fit on the
# training data rather than generalization to unseen text.
X_test = list(map(extract_features, tqdm(train_data)))
y_test = list(map(get_labels, tqdm(train_data)))

  0%|          | 0/91700 [00:00<?, ?it/s]

  0%|          | 0/91700 [00:00<?, ?it/s]

In [42]:
# Open the model file written by the training cell so it can be used for tagging.
tagger = pycrfsuite.Tagger()
tagger.open('blackboard-cls.crfsuite')

<contextlib.closing at 0x7f305db42c70>

In [43]:
%%time
# Predicted label sequence for every feature sequence in X_test.
y_pred = list(map(tagger.tag, X_test))

CPU times: user 7.53 s, sys: 120 ms, total: 7.65 s
Wall time: 7.65 s


In [44]:
# Show the predicted labels for the first sequence as a quick sanity check.
tagger.tag(X_test[0])

['B_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'E_CLS']

In [45]:
# Names of the labels stored in the loaded model.
labels = [name for name in tagger.info().labels.keys()]

In [46]:
from sklearn.metrics import f1_score

In [47]:
# The report is a multi-line string, so print() is used to render it as a table.
print(bio_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B_CLS       1.00      1.00      1.00     91698
       E_CLS       1.00      1.00      1.00     91700
       I_CLS       1.00      1.00      1.00    707795

   micro avg       1.00      1.00      1.00    891193
   macro avg       1.00      1.00      1.00    891193
weighted avg       1.00      1.00      1.00    891193
 samples avg       1.00      1.00      1.00    891193

