In [63]:
import nltk

tagged_sentences = nltk.corpus.treebank.tagged_sents()

print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

Tagged sentences:  3914
Tagged words: 100676


In [64]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

In [65]:
sents = ['This', 'is', 'a', 'sentence']
index = 3

In [66]:
features(sents,index)

{'word': 'sentence',
 'is_first': False,
 'is_last': True,
 'is_capitalized': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'prefix-1': 's',
 'prefix-2': 'se',
 'prefix-3': 'sen',
 'suffix-1': 'e',
 'suffix-2': 'ce',
 'suffix-3': 'nce',
 'prev_word': 'a',
 'next_word': '',
 'has_hyphen': False,
 'is_numeric': False,
 'capitals_inside': False}

In [67]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

In [68]:
# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))

cutoff

2935

In [69]:
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

In [70]:
print(len(training_sentences))
print(len(test_sentences))

2935
979


In [71]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
         for index in range(len(tagged)):
                X.append(features(untag(tagged), index))
                y.append(tagged[index][1])
    return X, y
         
X_train,y_train = transform_to_dataset(training_sentences)

X_test, y_test = transform_to_dataset(test_sentences)

In [73]:
# Building Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

clf.fit(X[:10000], y[:10000])   # Use only the first 10K samples if you're running it multiple times. It takes a fair bit :)

print('Training completed')

print("Accuracy:", clf.score(X_test, y_test))

Training completed
Accuracy: 0.893941828699984


In [95]:
# Validation
def pos_tag(sentence):
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    print(sentence)
    print(tags)
    return zip(sentence, tags)

In [96]:
print(pos_tag(nltk.word_tokenize('This is my friend, John.')))

['This', 'is', 'my', 'friend', ',', 'John', '.']
['DT' 'VBZ' 'NN' 'NN' ',' 'NNP' '.']
<zip object at 0x000001935A7A6688>
