In [1]:
import pandas as pd
from pathlib import Path
import os
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the training files, relative to the notebook's working directory.
train = Path('.') / 'blackboard-treebank' / 'thai10'

In [3]:
# Keep only the visible entries of the training directory: names starting with "." are dropped.
# NOTE(review): the original note said the directory holds about 7 thousand files, but the
# recorded output of this cell is 25 — confirm which is current.
# Code adapted from https://colab.research.google.com/drive/1Ur-p2yQBf5JsUJbJJlZsm2VqNPY8gIkL
train_files = sorted(entry for entry in train.iterdir() if not entry.name.startswith('.'))
len(train_files)

25

In [4]:
def readfile(path):
    """Read a tab-separated treebank file into a flat list of (word, tag) pairs.

    The file is decoded as UTF-8 (``utf-8-sig`` drops a leading BOM) and split
    into blocks on blank lines. Inside each block, lines starting with ``#``
    are skipped; every other line is stripped, split on tabs, and its columns
    at index 1 and index 3 are kept.

    Blank or whitespace-only lines inside a block are ignored, so a file that
    ends with a single trailing newline (or contains three consecutive
    newlines) does not raise ``IndexError``.

    Args:
        path: Path of the file to read (anything ``open`` accepts).

    Returns:
        list[tuple[str, str]]: One ``(column 1, column 3)`` pair per data line,
        in file order; block boundaries are flattened away.

    Raises:
        IndexError: If a non-blank, non-comment line has fewer than four
            tab-separated columns.
    """
    with open(path, 'r', encoding="utf-8-sig") as f:
        blocks = [block for block in f.read().split("\n\n") if block != '']

    pairs = []
    for block in blocks:
        for line in block.split("\n"):
            if line.startswith("#"):
                continue
            stripped = line.strip()
            if not stripped:
                # Left over from a trailing newline or an extra blank line.
                continue
            columns = stripped.split("\t")
            pairs.append((columns[1], columns[3]))
    return pairs

In [5]:
# Will hold one parsed token list per training file.
train_data_temp = []

In [6]:
# Parse every training file into its own list of (word, tag) pairs.
# Assigning a fresh list (rather than appending to an existing one) keeps this
# cell idempotent: re-running it does not duplicate the files already parsed.
train_data_temp = [readfile(file) for file in train_files]

In [7]:
# Sanity check: one parsed entry per file, so this should match len(train_files).
len(train_data_temp)

25

In [8]:
def make_cls(index):
    """Split a token stream into clauses and attach a boundary label to each token.

    ``index`` is a sequence of ``(word, tag)`` pairs in which the pair
    ``('$$', '$$')`` separates clauses. Each clause becomes a list of
    ``[word, tag, label]`` items where the first token is labelled ``B_CLS``,
    the last ``E_CLS`` and everything in between ``I_CLS``. A clause with a
    single token is labelled ``E_CLS`` (the end label replaces the begin
    label).

    Edge cases handled here:
      * the last token of the stream is kept and labelled ``E_CLS`` even when
        the stream does not end with a separator;
      * a separator met while no clause is open (leading or repeated
        separators) is skipped instead of being emitted as a token;
      * an empty stream yields an empty list.

    Args:
        index: Sequence of ``(word, tag)`` pairs.

    Returns:
        list[list[list[str]]]: The clauses, in input order. Separator pairs
        never appear in the output.
    """
    separator = ('$$', '$$')
    clauses = []
    current = []

    def _close_current():
        # Relabel the clause's last token as its end and store the clause.
        current[-1][-1] = 'E_CLS'
        clauses.append(current)

    for item in index:
        if item == separator:
            if current:
                _close_current()
                current = []
            continue
        label = "I_CLS" if current else "B_CLS"
        current.append(list(item) + [label])

    # Flush a clause left open because the stream did not end with a separator.
    if current:
        _close_current()
    return clauses

In [9]:
# Split every file's token stream into labelled clauses and pool them in one flat list.
train_data = [
    clause
    for file_tokens in train_data_temp
    for clause in make_cls(file_tokens)
]

In [10]:
# Total number of labelled sequences returned by make_cls across all files.
len(train_data)

91700

In [11]:
def doc2features(doc, i):
    """Build the feature dict for the token at position ``i`` of ``doc``.

    Each item of ``doc`` is indexed as ``item[0]`` (word) and ``item[1]``
    (POS tag). The result describes the current token and, when they exist,
    its left and right neighbours: the word itself, whether it is whitespace,
    its POS tag, and whether it is all digits. A missing left neighbour is
    marked with ``'BOS'`` and a missing right neighbour with ``'EOS'``.

    Args:
        doc: Sequence of token items.
        i: Position of the token to describe.

    Returns:
        dict: Feature name -> value for the token at ``i``.
    """
    token, tag = doc[i][0], doc[i][1]

    # Current token.
    feats = {
        'word.curr_word': token,
        'word.curr_isspace': token.isspace(),
        'word.curr_postag': tag,
        'word.curr_isdigit': token.isdigit(),
    }

    # Left neighbour, or the "Beginning of Sequence" marker when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        left_token, left_tag = doc[i - 1][0], doc[i - 1][1]
        feats.update({
            'word.prev_word': left_token,
            'word.prev_isspace': left_token.isspace(),
            'word.prev_postag': left_tag,
            'word.prev_isdigit': left_token.isdigit(),
        })

    # Right neighbour, or the "End of Sequence" marker when there is none.
    if i >= len(doc) - 1:
        feats['EOS'] = True
    else:
        right_token, right_tag = doc[i + 1][0], doc[i + 1][1]
        feats.update({
            'word.next_word': right_token,
            'word.next_isspace': right_token.isspace(),
            'word.next_postag': right_tag,
            'word.next_isdigit': right_token.isdigit(),
        })

    return feats

def extract_features(doc):
    """Return the feature dict of every position in ``doc``, in order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(doc2features(doc, position))
    return per_token

def get_labels(doc):
    """Return the third element (the label) of every 3-item entry in ``doc``."""
    labels = []
    for _word, _pos, label in doc:
        labels.append(label)
    return labels

In [12]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Hold out 10% of the sequences as test data; random_state=10 is passed so the split is repeatable.
# NOTE(review): train_data is overwritten with its own training portion, so re-running this
# cell on its own splits the already-reduced data again.
train_data,test_data=train_test_split(train_data,test_size=0.1,random_state=10)

In [14]:
# Feature dicts and gold labels for the training split, one sequence per document.
X_data = list(map(extract_features, tqdm(train_data)))
y_data = list(map(get_labels, tqdm(train_data)))

100%|██████████| 82530/82530 [00:01<00:00, 68004.56it/s] 
100%|██████████| 82530/82530 [00:00<00:00, 510279.73it/s]


In [15]:
import pycrfsuite

In [16]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

# Feed every (features, labels) sequence to the trainer. A sequence the trainer
# rejects is reported together with the error and skipped, so one bad item does
# not abort the run. Only ordinary exceptions are caught: KeyboardInterrupt and
# SystemExit still propagate, so the cell can be interrupted.
for xseq, yseq in zip(X_data, y_data):
    try:
        trainer.append(xseq, yseq)
    except Exception as exc:
        print("skipped sequence:", repr(exc), xseq, yseq)

CPU times: user 3.67 s, sys: 79.9 ms, total: 3.75 s
Wall time: 3.75 s


In [17]:
# Configure the trainer's hyperparameters.
trainer.set_params({
    'c1': 0.1,   # coefficient for the L1 penalty
    'c2': 0.1,  # coefficient for the L2 penalty
    'max_iterations': 500,  # cap on the number of training iterations (stop earlier)

    # include transitions that are possible, but not observed in the training data
    'feature.possible_transitions': True
})

In [18]:
%%time
# Run training; the file name given here is the one the tagger opens later in the notebook.
trainer.train('blackboard-cls-v1.1.crfsuite')

CPU times: user 11.6 s, sys: 13.4 ms, total: 11.6 s
Wall time: 11.6 s


In [19]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from itertools import chain
def bio_classification_report(y_true, y_pred):
    """
    Build a token-level classification report from label sequences.

    Both arguments are iterables of label sequences. Each is flattened into
    one list of labels, one-hot encoded with a ``LabelBinarizer`` fitted on
    the true labels, and handed to ``classification_report``. The ``'O'``
    label is left out of the report; the remaining tags are ordered by the
    part after the first ``-`` and then by the part before it.

    The original note on this helper states that scikit-learn 0.15+ (or a
    version from github master) is required for the averages to be
    calculated properly.
    """
    binarizer = LabelBinarizer()
    true_matrix = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    pred_matrix = binarizer.transform(list(chain.from_iterable(y_pred)))

    known_classes = binarizer.classes_

    # Column position of every class in the binarized matrices.
    column_of = {}
    for column, label in enumerate(known_classes):
        column_of[label] = column

    def _suffix_first(tag):
        # 'B-PER' -> ['PER', 'B']; a tag without '-' is its own one-element key.
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(
        (tag for tag in set(known_classes) if tag != 'O'),
        key=_suffix_first,
    )
    reported_columns = [column_of[tag] for tag in reported_tags]

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=reported_columns,
        target_names=reported_tags,
    )

In [20]:
# Feature dicts and gold labels for the held-out split, one sequence per document.
X_test = list(map(extract_features, tqdm(test_data)))
y_test = list(map(get_labels, tqdm(test_data)))

  0%|          | 0/9170 [00:00<?, ?it/s]

100%|██████████| 9170/9170 [00:00<00:00, 148802.08it/s]
100%|██████████| 9170/9170 [00:00<00:00, 636310.16it/s]


In [21]:
# Create a tagger and open the model file named in the training cell.
tagger = pycrfsuite.Tagger()
tagger.open('blackboard-cls-v1.1.crfsuite')

<contextlib.closing at 0x76447941ef90>

In [22]:
%%time
# Predicted label sequence for every test document.
y_pred = list(map(tagger.tag, X_test))

CPU times: user 172 ms, sys: 938 μs, total: 172 ms
Wall time: 172 ms


In [23]:
# Words of two test documents shown back to back (the 20th and 19th from the end).
[features['word.curr_word'] for features in X_test[-20] + X_test[-19]]

['แต่',
 'บังเอิญ',
 'ว่า',
 'ช่วง',
 'ที่',
 'ใช้',
 'หอก',
 'ทมิฬ',
 'แทง',
 'ทมิฬ',
 'และ',
 'เชื่อ',
 'ว่า',
 'ปัจจุบัน',
 'ฮิซโบเลาะห์',
 'น่า',
 'จะ',
 'มี',
 'ขีปนาวุธ']

In [24]:
# Inspect the feature dicts of the first test document.
X_test[0]

[{'word.curr_word': 'ว่า',
  'word.curr_isspace': False,
  'word.curr_postag': 'CC',
  'word.curr_isdigit': False,
  'BOS': True,
  'word.next_word': 'ไม่',
  'word.next_isspace': False,
  'word.next_postag': 'NG',
  'word.next_isdigit': False},
 {'word.curr_word': 'ไม่',
  'word.curr_isspace': False,
  'word.curr_postag': 'NG',
  'word.curr_isdigit': False,
  'word.prev_word': 'ว่า',
  'word.prev_isspace': False,
  'word.prev_postag': 'CC',
  'word.prev_isdigit': False,
  'word.next_word': 'ติดใจ',
  'word.next_isspace': False,
  'word.next_postag': 'VV',
  'word.next_isdigit': False},
 {'word.curr_word': 'ติดใจ',
  'word.curr_isspace': False,
  'word.curr_postag': 'VV',
  'word.curr_isdigit': False,
  'word.prev_word': 'ไม่',
  'word.prev_isspace': False,
  'word.prev_postag': 'NG',
  'word.prev_isdigit': False,
  'EOS': True}]

In [25]:
# Tag two test documents concatenated into a single sequence.
# NOTE(review): the feature dicts were built per document, so the EOS flag of the first
# document and the BOS flag of the second are still present at the join — the boundary
# predicted here may come from those flags rather than from the words; confirm.
tagger.tag(X_test[-20]+X_test[-19])

['B_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'E_CLS',
 'B_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'I_CLS',
 'E_CLS']

In [26]:
# Keys of the tagger's label table — presumably the label names stored in the model.
# NOTE(review): `labels` is not referenced again in the visible cells.
labels = list(tagger.info().labels.keys())

In [27]:
from sklearn.metrics import f1_score

In [28]:
# Token-level precision/recall/F1 on the held-out split.
# NOTE(review): every document here is a single clause produced by make_cls, so B_CLS and
# E_CLS always fall on the tokens that carry the BOS and EOS features. The perfect scores
# may reflect that rather than boundary detection on unsegmented text — confirm.
print(bio_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B_CLS       1.00      1.00      1.00      9170
       E_CLS       1.00      1.00      1.00      9170
       I_CLS       1.00      1.00      1.00     70427

   micro avg       1.00      1.00      1.00     88767
   macro avg       1.00      1.00      1.00     88767
weighted avg       1.00      1.00      1.00     88767
 samples avg       1.00      1.00      1.00     88767

