In [1]:
import nltk
# Load the tagged Treebank sample shipped with NLTK: a sequence of sentences,
# each a list of (word, tag) pairs (see the first sentence printed below).
# NOTE(review): assumes the corpus is already available locally
# (e.g. via nltk.download('treebank')) - confirm for fresh environments.
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
# Peek at one example and report the corpus size in sentences and in tokens.
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


### Feature selection and preprocessing

In [2]:
def features(sentence, index):
    """Build the feature dict describing one token in its sentence context.

    Args:
        sentence: list of word strings, [w1, w2, ...].
        index: position in ``sentence`` of the word to describe.

    Returns:
        A dict with the word itself, positional flags, casing flags,
        prefixes/suffixes of length 1-3, the neighbouring words ('' at the
        sentence boundaries) and a few surface-shape flags.

    Raises:
        IndexError: if ``index`` is outside ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1

    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices (word[:1], word[-1:]) are used instead of word[0] / word[-1]
        # so that an empty token yields '' rather than raising IndexError;
        # for any non-empty token the values are identical.
        # Note: this flag is also True when the first character has no case
        # (digits, punctuation), because upper() leaves it unchanged.
        'is_capitalized': word[:1].upper() == word[:1],
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first is upper-case.
        'capitals_inside': word[1:].lower() != word[1:]
    }
 
import pprint 
# Show the feature dict for the token at index 2 ('a') of a toy sentence.
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'word': 'a'}


In [3]:
def untag(tagged_sentence):
    """Drop the tags from a sequence of (word, tag) pairs, keeping word order."""
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

In [4]:
# Sequential (unshuffled) 75/25 split: the leading three quarters of the
# corpus are used for training, the remainder is held out for testing.
cutoff = int(0.75 * len(tagged_sentences))
training_sentences, test_sentences = tagged_sentences[:cutoff], tagged_sentences[cutoff:]

print(len(training_sentences))   # 2935
print(len(test_sentences))       # 979
 
def transform_to_dataset(tagged_sentences):
    """Flatten tagged sentences into parallel per-token feature and tag lists.

    Args:
        tagged_sentences: iterable of sentences, each a sequence of
            (word, tag) pairs.

    Returns:
        (X, y): X is a list with one feature dict per token (built by
        ``features``), y is the list of the corresponding tags, in the
        same order.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence; doing it per token made the
        # work quadratic in sentence length.
        words = untag(tagged)
        for index, (_, tag) in enumerate(tagged):
            X.append(features(words, index))
            y.append(tag)

    return X, y
 
# Build the per-token feature dicts (X) and tags (y) for the training split.
X, y = transform_to_dataset(training_sentences)

2935
979


In [5]:
# Sanity check: X is a plain list holding one feature dict per training token.
print(type(X))
print(len(X))

<class 'list'>
75784


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
 
# Number of leading training tokens used to fit the tree. Training takes a
# fair bit of time, so lower this (e.g. to 10000) when re-running often.
N_TRAIN_SAMPLES = 20000

# DictVectorizer turns each feature dict into a numeric vector (dense here,
# because sparse=False); the decision tree is then trained on those vectors.
# NOTE(review): no random_state is passed to the classifier, so the fitted
# tree (and the reported accuracy) may vary between runs - consider pinning one.
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

clf.fit(X[:N_TRAIN_SAMPLES], y[:N_TRAIN_SAMPLES])

# Evaluate on the held-out sentences.
X_test, y_test = transform_to_dataset(test_sentences)

print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 0.9165997107504419


## Prediction testing

In [9]:
# Feature dict of the first training token.
X[0]

{'word': 'Pierre',
 'is_first': True,
 'is_last': False,
 'is_capitalized': True,
 'is_all_caps': False,
 'is_all_lower': False,
 'prefix-1': 'P',
 'prefix-2': 'Pi',
 'prefix-3': 'Pie',
 'suffix-1': 'e',
 'suffix-2': 're',
 'suffix-3': 'rre',
 'prev_word': '',
 'next_word': 'Vinken',
 'has_hyphen': False,
 'is_numeric': False,
 'capitals_inside': False}

In [10]:
# Tag of the first training token.
y[0]

'NNP'

In [11]:
# Predict the tag for a single feature dict; the result is a one-element array.
clf.predict(X[0])

array(['NNP'], dtype='<U6')

In [12]:
def process_test_sentence(sentence):
    """Featurize every token of every sentence in a batch.

    Despite its name, ``sentence`` is a list of sentences, each itself a
    list of word strings. The result is one flat list of feature dicts
    (built by ``features``), in sentence order and then token order.
    """
    return [
        features(tokens, position)
        for tokens in sentence
        for position in range(len(tokens))
    ]

In [19]:
# process_test_sentence iterates over a list of token lists, hence the
# nested list around the single example sentence.
test_sentence = process_test_sentence([["this", "is", "a", "cat"]])
test_sentence

[{'word': 'this',
  'is_first': True,
  'is_last': False,
  'is_capitalized': False,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': 't',
  'prefix-2': 'th',
  'prefix-3': 'thi',
  'suffix-1': 's',
  'suffix-2': 'is',
  'suffix-3': 'his',
  'prev_word': '',
  'next_word': 'is',
  'has_hyphen': False,
  'is_numeric': False,
  'capitals_inside': False},
 {'word': 'is',
  'is_first': False,
  'is_last': False,
  'is_capitalized': False,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': 'i',
  'prefix-2': 'is',
  'prefix-3': 'is',
  'suffix-1': 's',
  'suffix-2': 'is',
  'suffix-3': 'is',
  'prev_word': 'this',
  'next_word': 'a',
  'has_hyphen': False,
  'is_numeric': False,
  'capitals_inside': False},
 {'word': 'a',
  'is_first': False,
  'is_last': False,
  'is_capitalized': False,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': 'a',
  'prefix-2': 'a',
  'prefix-3': 'a',
  'suffix-1': 'a',
  'suffix-2': 'a',
  'suffix-3': 'a',
  'prev_word': 'is'

In [21]:
# Predict the tag for the first token's feature dict.
# NOTE(review): the stored output for this cell ('VBZ') disagrees with the
# tag printed for "this" by predict("this is a cat") ('DT'); the execution
# counts are non-sequential, so this output is likely stale - re-run the
# notebook top to bottom to confirm.
clf.predict(test_sentence[0])

array(['VBZ'], dtype='<U6')

# Predict 

In [37]:
def predict(test_sentence):
    """Tag a whitespace-separated sentence and print one line per token.

    Args:
        test_sentence: the raw sentence as a single string.

    Prints ``<word> <prediction>`` for each token, where the prediction is
    the one-element array returned by ``clf.predict``. Returns None.
    An empty or all-whitespace string prints nothing.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so no empty tokens reach features()
    # (split(" ") produced '' tokens there and crashed on them).
    words = test_sentence.split()

    # process_test_sentence expects a list of sentences, so wrap the one we have.
    token_features = process_test_sentence([words])

    for x in token_features:
        print(x['word'], clf.predict(x))

In [38]:
# Smoke test: tag a short example sentence.
predict("this is a cat")

this ['DT']
is ['VBZ']
a ['DT']
cat ['NN']
