In [1]:
import nltk
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split
import numpy as np
import time
import random
from sklearn_crfsuite import CRF, metrics
from sklearn_crfsuite.metrics import flat_accuracy_score


In [2]:
# Importing the corpus (tag simplification and preprocessing helpers follow).
# Only the first 1000 tagged sentences of the treebank corpus are used.
# The corpus data is installed separately from the nltk package, so the first
# access raises LookupError when it is missing; fetch it once and retry so the
# notebook runs on a fresh environment.
try:
    tagged_sentences = treebank.tagged_sents()[:1000]
except LookupError:
    nltk.download('treebank')
    tagged_sentences = treebank.tagged_sents()[:1000]

def simplify_tag(tag):
    """Collapse a fine-grained POS tag into one of four coarse classes.

    Only the start of the tag is inspected: a tag starting with ``N`` maps to
    "NOUN", ``V`` to "VERB", ``J`` to "ADJ"; every other tag (including the
    empty string) maps to "OTHER". The check is case-sensitive.
    """
    coarse_classes = (('N', "NOUN"), ('V', "VERB"), ('J', "ADJ"))
    for initial, coarse in coarse_classes:
        if tag.startswith(initial):
            return coarse
    return "OTHER"

def preprocess_sentence(tagged_sentence):
    """Split one tagged sentence into parallel word and tag lists.

    Args:
        tagged_sentence: iterable of ``(word, tag)`` pairs.

    Returns:
        A ``(words, tags)`` tuple of two equal-length lists: the lowercased
        words and the tags mapped through ``simplify_tag``. An empty sentence
        yields ``([], [])`` instead of failing on tuple unpacking.
    """
    # Materialise once so a one-shot iterator can be walked twice below.
    pairs = list(tagged_sentence)
    # NOTE: lowercasing discards the original casing, so case-based features
    # computed from these words later only ever see lowercase text.
    words = [word.lower() for word, _ in pairs]
    tags = [simplify_tag(tag) for _, tag in pairs]
    return words, tags


In [3]:
# Preprocess every corpus sentence, then transpose the (words, tags) pairs into
# one tuple of word lists and one parallel tuple of tag lists.
sentences, labels = zip(*map(preprocess_sentence, tagged_sentences))

#Extracting additional features
def word_features(sentence, index):
    """Build the feature dict for the token at position ``index``.

    Args:
        sentence: sequence of token strings.
        index: position in ``sentence`` of the token to describe.

    Returns:
        dict with the token itself, first/last position flags, three case
        flags, prefixes and suffixes of up to three characters, and the two
        neighbouring tokens on each side ('' where the window runs past the
        sentence edge).

    Raises:
        IndexError: if ``index`` is out of range for ``sentence``.
    """
    token = sentence[index]
    last = len(sentence) - 1
    features = {
        'word': token,
        'is_first': index == 0,
        'is_last': index == last,
        # True when the first character equals its own uppercase form, which
        # also holds for caseless characters such as digits and punctuation.
        # An empty token is reported as not capitalized instead of raising.
        'is_capitalized': bool(token) and token[0].upper() == token[0],
        'is_all_caps': token.upper() == token,
        'is_all_lower': token.lower() == token,
        # Slices rather than single-index lookups so an empty token yields ''.
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last else sentence[index + 1],
        'prev_prev_word': '' if index in (0, 1) else sentence[index - 2],
        'next_next_word': '' if index >= last - 1 else sentence[index + 2],
    }
    return features

In [4]:
# Splitting the dataset
# One feature dict per token, grouped per sentence; labels stay aligned by position.
X = [[word_features(s, i) for i in range(len(s))] for s in sentences]
y = labels

# Fixed random_state so the same sentences land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Solving the CRF problem: fit the model on the training split and time the fit.
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
end_time = time.perf_counter()

print("Training time: ", end_time - start_time, "sec")


Training time:  2.3529932498931885 sec


In [9]:
# Calculating accuracy: predict on the held-out split, timing only the predict call.
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
y_pred = crf.predict(X_test)
end_time = time.perf_counter()

print("Testing time: ", end_time - start_time, "sec")


# Score the predicted label sequences against the true ones.
accuracy = flat_accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)


Testing time:  0.11095976829528809 sec
Accuracy:  0.9508093336136221


In [7]:
# Printing example sentences
# Sample distinct indices so the same sentence is never shown twice, and cap the
# count so a test split with fewer than three sentences (or none) does not raise.
for example_index in random.sample(range(len(X_test)), min(3, len(X_test))):
    print("Sentence: ", [x['word'] for x in X_test[example_index]])
    print("True labels: ", y_test[example_index])
    print("Predicted labels: ", y_pred[example_index])
    print("\n")


Sentence:  ['on', 'the', 'back', ',', 'the', 'shirts', 'read', ',', '``', 'we', 'have', 'all', 'the', 'answers', '.', "''"]
True labels:  ['OTHER', 'OTHER', 'NOUN', 'OTHER', 'OTHER', 'NOUN', 'VERB', 'OTHER', 'OTHER', 'OTHER', 'VERB', 'OTHER', 'OTHER', 'NOUN', 'OTHER', 'OTHER']
Predicted labels:  ['OTHER', 'OTHER', 'NOUN', 'OTHER', 'OTHER', 'NOUN', 'NOUN', 'OTHER', 'OTHER', 'OTHER', 'VERB', 'OTHER', 'OTHER', 'NOUN', 'OTHER', 'OTHER']


Sentence:  ['``', 'it', "'s", 'an', 'odd', 'thing', '0', '*', 'to', 'put', '*t*-1', 'on', 'the', 'list', ',', "''", 'mr.', 'bretz', 'noted', '*t*-2', '.']
True labels:  ['OTHER', 'OTHER', 'VERB', 'OTHER', 'ADJ', 'NOUN', 'OTHER', 'OTHER', 'OTHER', 'VERB', 'OTHER', 'OTHER', 'OTHER', 'NOUN', 'OTHER', 'OTHER', 'NOUN', 'NOUN', 'VERB', 'OTHER', 'OTHER']
Predicted labels:  ['OTHER', 'OTHER', 'VERB', 'OTHER', 'ADJ', 'NOUN', 'OTHER', 'OTHER', 'OTHER', 'VERB', 'OTHER', 'OTHER', 'OTHER', 'NOUN', 'OTHER', 'OTHER', 'NOUN', 'NOUN', 'VERB', 'OTHER', 'OTHER']


Sentence: