In [None]:
import spacy
import pandas as pd
from pycrfsuite import Trainer, Tagger
from metrics import f1score
from sklearn.metrics import classification_report
from collections import Counter
from itertools import chain
from joblib import delayed, Parallel
from multiprocessing import cpu_count

In [None]:
# Small English spaCy pipeline.
# If this is the first run after installing spaCy, download the model once
# from a terminal:
#     python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Worker count handed to joblib: one per CPU reported by the OS.
n_jobs = cpu_count()

In [None]:
def read_data(fname):
    """Read an annotated corpus file into per-sentence (word, POS, label) triples.

    The file is parsed as space-separated text without a header row into the
    columns ``'a'``, ``'b'`` and ``'c'``. Columns ``'a'`` and ``'b'`` are
    converted with ``pd.eval``; column ``'c'`` is read but not used.
    Presumably ``'a'`` holds the token list and ``'b'`` the tag list of one
    sentence per row -- TODO confirm against the data files.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Returns
    -------
    list of list of tuple
        One list per row; each element is ``(word, pos, label)`` where ``pos``
        is the spaCy coarse POS tag (``token.pos_``) and ``label`` is the part
        of the original tag before the first ``'-'``.
    """
    # Local import so the block does not rely on a new file-level import.
    from spacy.tokens import Doc

    # NOTE(review): pd.eval evaluates expressions taken from the file content.
    # Only use this on trusted data; consider ast.literal_eval if the columns
    # are plain Python list literals.
    data = pd.read_csv(fname,
                       sep=' ',
                       header=None,
                       names=['a', 'b', 'c'],
                       encoding="utf-8",
                       converters={'a': pd.eval,
                                   'b': pd.eval})

    def _pos_tags(words):
        # Build the Doc from the existing tokens instead of re-tokenizing the
        # joined text: spaCy's tokenizer may split or merge tokens, which would
        # shift the POS tags relative to the words and labels.
        doc = Doc(nlp.vocab, words=list(words))
        for _, component in nlp.pipeline:
            doc = component(doc)
        return [token.pos_ for token in doc]

    pos_tags = [_pos_tags(s) for s in data['a']]
    # Keep only the positional prefix of each tag (the text before the first '-').
    labels = [[l.split('-')[0] for l in labels] for labels in data['b']]

    return [list(zip(words, pos, lbls))
            for (words, pos, lbls) in zip(data['a'], pos_tags, labels)]

In [None]:
def _neighbour_features(offset, neighbour):
    """Build the features describing an adjacent token.

    `offset` is the textual prefix ('-1' or '+1') placed in front of every
    feature name; `neighbour` is the adjacent item, whose first two elements
    are read as word and POS tag.
    """
    text, tag = neighbour[0], neighbour[1]
    return [
        f'{offset}:word.lower={text.lower()}',
        f'{offset}:word.istitle={text.istitle()}',
        f'{offset}:word.isupper={text.isupper()}',
        f'{offset}:postag={tag}',
        f'{offset}:postag[:2]={tag[:2]}',
    ]


def word2features(sent, i):
    """Return the list of string features for the token at position `i`.

    Parameters
    ----------
    sent : sequence
        Sentence items; element 0 of each item is the word and element 1 its
        POS tag (any further elements are ignored here).
    i : int
        Index of the token to describe.

    Returns
    -------
    list of str
        Features of the token itself, followed by features of the previous
        token (or 'BOS' when there is none) and of the next token (or 'EOS'
        when there is none).
    """
    token, tag = sent[i][0], sent[i][1]

    feats = [
        'bias',
        f'word.lower={token.lower()}',
        f'word[-3:]={token[-3:]}',
        f'word[-2:]={token[-2:]}',
        f'word.isupper={token.isupper()}',
        f'word.istitle={token.istitle()}',
        f'word.isdigit={token.isdigit()}',
        f'postag={tag}',
        f'postag[:2]={tag[:2]}',
    ]

    # Left context, or the beginning-of-sentence marker.
    feats += _neighbour_features('-1', sent[i - 1]) if i > 0 else ['BOS']
    # Right context, or the end-of-sentence marker.
    feats += _neighbour_features('+1', sent[i + 1]) if i < len(sent) - 1 else ['EOS']

    return feats


def sent2features(sent):
    """Return the feature list of every token in `sent`, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the third element (the label) of every 3-item entry in `sent`."""
    collected = []
    for _word, _pos, tag in sent:
        collected.append(tag)
    return collected

def sent2tokens(sent):
    """Return the first element (the token text) of every 3-item entry in `sent`."""
    collected = []
    for text, _pos, _label in sent:
        collected.append(text)
    return collected

In [None]:
def _classification_report(y_true, y_pred):
    """Print sequence-level scores and return a per-tag classification report.

    Parameters
    ----------
    y_true, y_pred : list of list of str
        Gold and predicted tag sequences. Every tag must be one of
        'B', 'I', 'O', 'E', 'S'; any other tag raises ``KeyError``.

    Returns
    -------
    The result of ``sklearn.metrics.classification_report`` computed on the
    flattened sequences for the tags B, I, E and S (the 'O' tag is not listed
    in the report).

    Side effects
    ------------
    Prints the three values returned by ``f1score`` (unpacked here as recall,
    precision and F1, in that order).
    """
    tag_to_id = {'B': 0, 'I': 1, 'O': 2, 'E': 3, 'S': 4}

    def encode(sequences):
        # Map every tag of every sequence to its integer id.
        return [[tag_to_id[tag] for tag in seq] for seq in sequences]

    true_ids = encode(y_true)
    pred_ids = encode(y_pred)

    recall, precision, f1 = f1score(true_ids, pred_ids)
    print(f"featurewise_f1_score: {f1:.4f}")
    print(f"featurewise_recall_score: {recall:.4f}")
    print(f"featurewise_precision_score: {precision:.4f}\n\n")

    # Flatten to one token-level list per side for the sklearn report.
    flat_true = [tag_id for seq in true_ids for tag_id in seq]
    flat_pred = [tag_id for seq in pred_ids for tag_id in seq]

    return classification_report(
        flat_true,
        flat_pred,
        zero_division=0,
        labels=[0, 1, 3, 4],
        target_names=['B', 'I', 'E', 'S'],
    )

In [None]:
# Load both corpus splits as lists of (word, POS, label) sentences.
train_sents, test_sents = (
    read_data("../data/train_290818.txt"),
    read_data("../data/test_290818.txt"),
)

In [None]:
# Extract per-token features and gold labels for both splits, one joblib
# task per sentence.
X_train = Parallel(n_jobs=n_jobs)(delayed(sent2features)(s) for s in train_sents)
X_test = Parallel(n_jobs=n_jobs)(delayed(sent2features)(s) for s in test_sents)

y_train = Parallel(n_jobs=n_jobs)(delayed(sent2labels)(s) for s in train_sents)
# The test labels must come from the test split so they line up with X_test
# (they were previously built from train_sents by mistake).
y_test = Parallel(n_jobs=n_jobs)(delayed(sent2labels)(s) for s in test_sents)

In [None]:
# Create a non-verbose CRFsuite trainer and register every training sentence
# as a (feature sequence, label sequence) pair.
trainer = Trainer(verbose=False)

for feature_seq, label_seq in zip(X_train, y_train):
    trainer.append(feature_seq, label_seq)

In [None]:
# Training configuration handed to CRFsuite.
crf_params = {
    'feature.minfreq': 1,
    'num_memories': 6,
    'linesearch': 'StrongBacktracking',
    'max_linesearch': 20,
    'c1': 1e-1,
    'c2': 1e-1,
    'max_iterations': 2048,
    'feature.possible_transitions': True,
    'feature.possible_states': True,
}
trainer.set_params(crf_params)

# Train on the appended sequences; the model is stored under this file name.
trainer.train('290818.crfsuite')

In [None]:
# Show what the trainer's log parser recorded for the final iteration.
final_iteration = trainer.logparser.last_iteration
print(final_iteration)

## Make predictions

In [None]:
# Open the model file written by the training cell ('290818.crfsuite') in a
# fresh tagger that is used for inference below.
tagger = Tagger()
tagger.open('290818.crfsuite')

In [None]:
# Predicted tag sequence for every test sentence, plus the matching gold
# labels taken from the test split.
y_pred = list(map(tagger.tag, X_test))
y_test = list(map(sent2labels, test_sents))

In [None]:
# Print the sequence-level scores, then the per-tag report for the test split.
report = _classification_report(y_test, y_pred)
print(report)