In [None]:
import nltk
import pprint 
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

Import data which is tagged

In [9]:
# tagged_sentences = nltk.corpus.brown.tagged_sents()
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.brown.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 1161192


Training our own POS Tagger using scikit-learn

Before starting training a classifier, we must agree first on what features to use. Most obvious choices are: the word itself, the word before and the word after. That’s a good start, but we can do so much better. For example, the 2-letter suffix is a great indicator of past-tense verbs, ending in “-ed”. 3-letter suffix helps recognize the present participle ending in “-ing”.

In [10]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
 

pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))
 
{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'word': 'a'}
 

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'word': 'a'}


{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'word': 'a'}

Small helper function to strip the tags from our tagged corpus and feed it to our classifier:

In [11]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

Let’s now build our training set. Our classifier should accept features for a single word, but our corpus is composed of sentences. We’ll need to do some transformations:

In [12]:
# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]
 
print(len(training_sentences) )  # 2935
print(len(test_sentences) )        # 979
 
def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y
 
X, y = transform_to_dataset(training_sentences)

2935
979


We’re now ready to train the classifier. I’ve opted for a DecisionTreeClassifier. Feel free to play with others:

In [14]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])
 
clf.fit(X[:10000], y[:10000])   # Use only the first 10K samples if you're running it multiple times. It takes a fair bit :)
 
print('Training completed')
 
X_test, y_test = transform_to_dataset(test_sentences)
 
print("Accuracy:", clf.score(X_test, y_test))

Training completed
Accuracy: 0.897959183673


Let’s tag!

We can now use our classifier like this:

In [20]:
X_test

[{'capitals_inside': False,
  'has_hyphen': False,
  'is_all_caps': False,
  'is_all_lower': False,
  'is_capitalized': True,
  'is_first': True,
  'is_last': False,
  'is_numeric': False,
  'next_word': 'can',
  'prefix-1': 'W',
  'prefix-2': 'We',
  'prefix-3': 'We',
  'prev_word': '',
  'suffix-1': 'e',
  'suffix-2': 'We',
  'suffix-3': 'We',
  'word': 'We'},
 {'capitals_inside': False,
  'has_hyphen': False,
  'is_all_caps': False,
  'is_all_lower': True,
  'is_capitalized': False,
  'is_first': False,
  'is_last': False,
  'is_numeric': False,
  'next_word': 'understand',
  'prefix-1': 'c',
  'prefix-2': 'ca',
  'prefix-3': 'can',
  'prev_word': 'We',
  'suffix-1': 'n',
  'suffix-2': 'an',
  'suffix-3': 'can',
  'word': 'can'},
 {'capitals_inside': False,
  'has_hyphen': False,
  'is_all_caps': False,
  'is_all_lower': True,
  'is_capitalized': False,
  'is_first': False,
  'is_last': False,
  'is_numeric': False,
  'next_word': 'and',
  'prefix-1': 'u',
  'prefix-2': 'un',
  'pre

In [17]:
def pos_tag(sentence):
    tagged_sentence = []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return zip(sentence, tags)
print(list(pos_tag(nltk.word_tokenize('This is my friend, John.'))))
 

[('This', 'DT'), ('is', 'VBZ'), ('my', 'NN'), ('friend', 'NN'), (',', ','), ('John', 'NNP'), ('.', '.')]
