In [None]:
# import statements
import spacy
import pandas as pd
from pycrfsuite import Trainer, Tagger
from metrics import f1score
from sklearn.metrics import classification_report
from collections import Counter
from itertools import chain

In [None]:
# One-time setup: spacy.load below needs the "en_core_web_sm" model to be installed.
# Run the following command in a terminal the first time after installing spacy:
#     python3 -m spacy download en_core_web_sm

# load the spaCy pipeline once at module level; read_data uses it to generate pos-tags
nlp = spacy.load("en_core_web_sm")

In [None]:
# function to read csv files which contains python output literals
def read_data(fname):
    """Read an annotation file into sentences of (word, pos-tag, label) tuples.

    The file is parsed as a space-separated table without a header and with
    three columns named 'a', 'b' and 'c'. Columns 'a' and 'b' are converted
    with ``pd.eval`` and are used as the word list and the tag list of one
    sentence per row; column 'c' is read but not used.

    Args:
        fname: path of the file to read.

    Returns:
        A list with one entry per row; each entry is a list of
        ``(word, pos_tag, label)`` tuples, where ``label`` is the part of the
        tag before the first '-' (the BIOES prefix).
    """
    # local import so the block does not depend on extra top-level imports
    from spacy.tokens import Doc

    data = pd.read_csv(fname,
                       sep=' ',
                       header=None,
                       names=['a', 'b', 'c'],
                       encoding="utf-8",
                       converters={'a': pd.eval,
                                   'b': pd.eval})

    # Generate exactly one pos-tag per word. Feeding ' '.join(words) through
    # nlp() would re-tokenize the text, and whenever spaCy splits or merges a
    # word the tags would no longer line up with the words and labels (zip
    # would silently pair or drop the wrong items). Building the Doc from the
    # existing words and running the pipeline components keeps the alignment.
    pos_tags = []
    for words in data['a']:
        doc = Doc(nlp.vocab, words=list(words))
        for _, component in nlp.pipeline:
            doc = component(doc)
        pos_tags.append([token.pos_ for token in doc])

    # label indicates the Name-Entity position (BIOES): keep only the prefix before '-'
    labels = [[tag.split('-')[0] for tag in tags] for tags in data['b']]

    # return a list of list of tuples, i.e. (word, pos-tag, label)
    return [list(zip(words, pos, lbls))
            for (words, pos, lbls) in zip(data['a'], pos_tags, labels)]

In [None]:
# convert words to features for the crf
def word2features(sent, i):
    """Build the CRF feature strings for the token at position ``i`` of ``sent``.

    Each element of ``sent`` is a tuple whose first item is the word and whose
    second item is its pos-tag; any further items (the label) are not read.

    Returns a flat list of feature strings, in this order:
        1. 'bias'
        2. current word: lowercased form, last 3 characters, last 2 characters,
           isupper / istitle / isdigit flags, pos-tag, first 2 characters of
           the pos-tag
        3. previous word ('-1:' prefix): lowercased form, istitle, isupper,
           pos-tag, first 2 characters of the pos-tag -- or the single marker
           'BOS' when ``i > 0`` does not hold
        4. next word ('+1:' prefix): same five features -- or the single
           marker 'EOS' when ``i < len(sent) - 1`` does not hold
    """
    token, tag = sent[i][0], sent[i][1]

    # features describing the token itself
    feats = [
        'bias',
        f'word.lower={token.lower()}',
        f'word[-3:]={token[-3:]}',
        f'word[-2:]={token[-2:]}',
        f'word.isupper={token.isupper()}',
        f'word.istitle={token.istitle()}',
        f'word.isdigit={token.isdigit()}',
        f'postag={tag}',
        f'postag[:2]={tag[:2]}',
    ]

    # left context, or the beginning-of-sentence marker
    if not i > 0:
        feats.append('BOS')
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats += [
            f'-1:word.lower={prev_token.lower()}',
            f'-1:word.istitle={prev_token.istitle()}',
            f'-1:word.isupper={prev_token.isupper()}',
            f'-1:postag={prev_tag}',
            f'-1:postag[:2]={prev_tag[:2]}',
        ]

    # right context, or the end-of-sentence marker
    if not i < len(sent) - 1:
        feats.append('EOS')
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats += [
            f'+1:word.lower={next_token.lower()}',
            f'+1:word.istitle={next_token.istitle()}',
            f'+1:word.isupper={next_token.isupper()}',
            f'+1:postag={next_tag}',
            f'+1:postag[:2]={next_tag[:2]}',
        ]

    return feats

# convert each sentence to wordwise-features
def sent2features(sent):
    """Return one feature list per position of ``sent``, built by ``word2features``."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# extract all labels from the sentences
def sent2labels(sent):
    """Return the third item (the label) of every 3-tuple in ``sent``, in order."""
    tags = []
    for _word, _postag, tag in sent:
        tags.append(tag)
    return tags

In [None]:
# a function to visualize the classification model
# the per-class report leaves out 'O' (only B, I, E and S are listed)
def _classification_report(y_true, y_pred):
    """Print sequence-level scores and return a per-class report for B/I/E/S.

    Args:
        y_true: list of gold label sequences; every label must be one of
            'B', 'I', 'O', 'E', 'S' (anything else raises ``KeyError``).
        y_pred: list of predicted label sequences, same label alphabet.

    Returns:
        The value returned by sklearn's ``classification_report`` computed on
        the flattened label ids, restricted to the classes B, I, E and S.
    """
    # integer ids for the character labels: B=0, I=1, O=2, E=3, S=4
    label_ids = dict(zip('BIOES', range(5)))

    def encode(sequences):
        # map every label of every sequence to its integer id
        return [[label_ids[tag] for tag in seq] for seq in sequences]

    true_ids = encode(y_true)
    pred_ids = encode(y_pred)

    # f1score comes from the project's `metrics` module; its result is unpacked
    # here as (recall, precision, f1) -- NOTE(review): confirm that order there
    recall, precision, f1 = f1score(true_ids, pred_ids)
    print(f"featurewise_f1_score: {f1:.4f}")
    print(f"featurewise_recall_score: {recall:.4f}")
    print(f"featurewise_precision_score: {precision:.4f}\n\n")

    # flatten all sequences into a single list each for the per-class report
    flat_true = [label for seq in true_ids for label in seq]
    flat_pred = [label for seq in pred_ids for label in seq]

    # id 2 ('O') is deliberately absent from `labels`, so it gets no row
    return classification_report(flat_true,
                                 flat_pred,
                                 labels=[0, 1, 3, 4],
                                 target_names=['B', 'I', 'E', 'S'],
                                 zero_division=0)

In [None]:
# load the train and test files as sentences of (word, pos-tag, label) tuples
train_sents, test_sents = map(
    read_data,
    ("../data/train_290818.txt", "../data/test_290818.txt"),
)

In [None]:
# convert the sentences to features
X_train = [sent2features(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]

# extract the labels from the sentences
y_train = [sent2labels(s) for s in train_sents]
# the test labels must come from the test sentences so that they line up with X_test
y_test = [sent2labels(s) for s in test_sents]

In [None]:
# python-crfsuite trainer object (created with verbose=False)
trainer = Trainer(verbose=False)

# register each training sentence: its feature sequence paired with its label sequence
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [None]:
# hyperparameters for the model
# NOTE(review): the per-key descriptions below follow the CRFsuite parameter
# documentation as understood by the reviewer -- confirm against the installed version.
trainer.set_params({
    # presumably the minimum occurrence count for a feature to be kept
    'feature.minfreq': 1,
    # presumably the number of past updates kept by the L-BFGS approximation
    'num_memories': 6,
    # line search algorithm name passed to the optimizer
    'linesearch': 'StrongBacktracking',
    # presumably the maximum number of line search trials per iteration
    'max_linesearch': 20,
    # presumably the L1 regularization coefficient
    'c1': 1e-1,
    # presumably the L2 regularization coefficient
    'c2': 1e-1,
    # upper bound on the number of training iterations
    'max_iterations': 2048, 
    # presumably: also generate transition features that never occur in the training data
    'feature.possible_transitions': True,
    # presumably: also generate state features that never occur in the training data
    'feature.possible_states': True,
})


# train the model and save the weights in the file '290818.crfsuite'
# (path is relative to the current working directory; the prediction cell opens the same file)
trainer.train('290818.crfsuite')

# print the final training iteration metrics for visualization
print(trainer.logparser.last_iteration)

## Make predictions

In [None]:
# load the weights saved by the training cell into a tagger for inference
tagger = Tagger()
tagger.open('290818.crfsuite')

# predicted label sequences for the test features, and the gold labels of the test sentences
y_pred = list(map(tagger.tag, X_test))
y_test = list(map(sent2labels, test_sents))

# show the sequence-level scores and the per-class report
print(_classification_report(y_test, y_pred))