In [1]:
import nltk
import spacy
import pandas as pd
from pycrfsuite import Trainer, Tagger
from metrics import f1score
from sklearn.metrics import classification_report
from collections import Counter
from itertools import chain
from joblib import delayed, Parallel
from multiprocessing import cpu_count

# uncomment the below line if you are running the code for the first time after installing nltk
# nltk.download('all')

In [2]:
n_jobs = cpu_count()
# run the following command in the terminal if you are running the code for the first time after installing spacy
# python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [3]:
def read_data(fname):
    data = pd.read_csv(fname, 
                       sep=' ',
                       header=None,
                       names=['a', 'b', 'c'],
                       encoding="utf-8",
                       converters={'a': pd.eval, 
                                   'b': pd.eval})

    pos_tags = [[token.pos_ for token in nlp(' '.join(s))] for s in data['a']]
    labels = [[l.split('-')[0] for l in labels] for labels in data['b']]

    return [[(w, p, l) for (w, p, l) in zip(words, pos, lbls)] for (words, pos, lbls) in zip(data['a'], pos_tags, labels)]

In [4]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        f'word.lower={word.lower()}',
        f'word[-3:]={word[-3:]}',
        f'word[-2:]={word[-2:]}',
        f'word.isupper={word.isupper()}',
        f'word.istitle={word.istitle()}',
        f'word.isdigit={word.isdigit()}',
        f'postag={postag}',
        f'postag[:2]={postag[:2]}'
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            f'-1:word.lower={word1.lower()}',
            f'-1:word.istitle={word1.istitle()}',
            f'-1:word.isupper={word1.isupper()}',
            f'-1:postag={postag1}',
            f'-1:postag[:2]={postag1[:2]}'
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            f'+1:word.lower={word1.lower()}',
            f'+1:word.istitle={word1.istitle()}',
            f'+1:word.isupper={word1.isupper()}',
            f'+1:postag={postag1}',
            f'+1:postag[:2]={postag1[:2]}',
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for (_, _, label) in sent]

def sent2tokens(sent):
    return [token for (token, _, _) in sent]

In [5]:
def bio_classification_report(y_true, y_pred):
    tagset = {'B': 0, 'I': 1, 'O': 2, 'E': 3, 'S': 4}
    y_true = [[tagset[i] for i in y] for y in y_true]
    y_pred = [[tagset[i] for i in yp] for yp in y_pred]
    
    r, p, f1 = f1score(y_true, y_pred)
    print(f"featurewise_f1_score: {f1:.4f}")
    print(f"featurewise_recall_score: {r:.4f}")
    print(f"featurewise_precision_score: {p:.4f}\n\n")
    
    y_true = list(chain.from_iterable(y_true))
    y_pred = list(chain.from_iterable(y_pred))
    
    return classification_report(y_true,
                                 y_pred,
                                 zero_division=0,
                                 labels=[0, 1, 3, 4],
                                 target_names=['B', 'I', 'E', 'S'])

In [6]:
train_sents = read_data("../data/train_290818.txt")
test_sents = read_data("../data/test_290818.txt")

In [7]:
X_train = Parallel(n_jobs=n_jobs)(delayed(sent2features)(s) for s in train_sents)
X_test = Parallel(n_jobs=n_jobs)(delayed(sent2features)(s) for s in test_sents)

y_train = Parallel(n_jobs=n_jobs)(delayed(sent2labels)(s) for s in train_sents)
y_test = Parallel(n_jobs=n_jobs)(delayed(sent2labels)(s) for s in train_sents)

In [8]:
trainer = Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [9]:
trainer.set_params({
    'feature.minfreq': 1,
    'num_memories': 6,
    'linesearch': 'StrongBacktracking',
    'max_linesearch': 20,
    'c1': 1e-1,
    'c2': 1e-1,
    'max_iterations': 2048, 
    'feature.possible_transitions': True,
    'feature.possible_states': True,
})

trainer.train('290818.crfsuite')

In [10]:
print(trainer.logparser.last_iteration)

{'num': 802, 'scores': {}, 'loss': 3568.840346, 'feature_norm': 82.593766, 'error_norm': 6.889488, 'active_features': 11566, 'linesearch_trials': 2, 'linesearch_step': 0.5, 'time': 0.033}


## Make predictions

In [11]:
tagger = Tagger()
tagger.open('290818.crfsuite')

<contextlib.closing at 0x18662df2830>

In [12]:
y_pred = [tagger.tag(xseq) for xseq in X_test]
y_test = [sent2labels(xseq) for xseq in test_sents]

In [13]:
print(bio_classification_report(y_test, y_pred))

featurewise_f1_score: 0.3667
featurewise_recall_score: 0.2933
featurewise_precision_score: 0.4889


              precision    recall  f1-score   support

           B       0.51      0.40      0.45       311
           I       0.37      0.24      0.30        90
           E       0.52      0.41      0.46       309
           S       0.57      0.22      0.32       291

   micro avg       0.51      0.34      0.41      1001
   macro avg       0.49      0.32      0.38      1001
weighted avg       0.52      0.34      0.40      1001



## Check what the model has learnt

In [14]:
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.4f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
B      -> I       8.065973
I      -> I       7.094853
B      -> E       6.752252
I      -> E       6.688789
O      -> B       2.698933
E      -> B       1.860267
O      -> S       1.761007
S      -> B       1.536394
O      -> O       1.388748
E      -> S       1.203925
S      -> S       1.019598
E      -> O       0.412944
S      -> O       -0.249607
S      -> E       -0.855203
B      -> S       -1.124432

Top unlikely transitions:
I      -> E       6.688789
O      -> B       2.698933
E      -> B       1.860267
O      -> S       1.761007
S      -> B       1.536394
O      -> O       1.388748
E      -> S       1.203925
S      -> S       1.019598
E      -> O       0.412944
S      -> O       -0.249607
S      -> E       -0.855203
B      -> S       -1.124432
I      -> O       -2.745981
O      -> E       -2.809948
B      -> O       -3.314997


In [15]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
4.085014 S      word.lower=chrome
3.688637 S      word.lower=registration
3.601956 S      word.lower=callers
3.517643 S      word.lower=comment
3.427913 S      -1:word.lower=dd
3.391863 S      -1:word.lower=ondemand
3.315049 O      -1:word.lower=inventory
3.275435 O      word.lower=improvements
3.270184 O      word.lower=crash
3.194352 B      word.lower=signing
3.074182 S      word.lower=geofence
3.073090 E      +1:word.lower=literally
3.058250 S      word.lower=ads
3.029923 S      +1:word.lower=multipage
3.024327 S      word.lower=attachments
3.008348 S      word.lower=transporter
2.971403 S      -1:word.lower=remember
2.967165 E      +1:word.lower=bug
2.954166 E      -1:word.lower=signing
2.941855 S      word.lower=conversation

Top negative:
-1.933105 O      +1:word.lower=process
-1.949872 O      word.lower=contacts
-1.952957 O      word.lower=activity
-1.967230 O      +1:word.lower=mode
-2.022514 O      -1:word.lower=watch
-2.036207 O      -1:word.lower=remember
-2.05