# Imports

In [91]:
import nltk
from nltk import word_tokenize
from nltk.corpus import treebank
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Datasets download

In [92]:
# Fetch the NLTK resources this notebook relies on: the 'treebank' corpus
# (read below via treebank.tagged_sents()) and the 'punkt' tokenizer data
# (for word_tokenize in the final demo cell).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('treebank')
nltk.download('punkt')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Tagged sentences

In [93]:
# Load the treebank sample; each sentence is a list of (word, tag) pairs.
tagged_sentences = treebank.tagged_sents()

# Peek at the first sentence to show the data layout.
first_sentence = tagged_sentences[0]
print(first_sentence)

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


# Feature Extraction

In [94]:
def extract_features(sentence, i):
    """Build the feature dictionary for the token at position ``i``.

    Args:
        sentence: Sequence of token strings (words only, no tags).
        i: Index into ``sentence`` of the token to describe.

    Returns:
        dict with the token itself, positional flags (``is_first``,
        ``is_last``), casing/digit flags, prefixes and suffixes of length
        1-3, and the neighbouring words (``''`` at a sentence boundary).

    Raises:
        IndexError: if ``i`` is out of range for ``sentence``.
    """
    token = sentence[i]
    last_index = len(sentence) - 1
    # Slices (token[:1], token[-1:]) instead of token[0] / token[-1]: an
    # empty-string token then yields '' rather than raising IndexError, while
    # every non-empty token produces exactly the same values as before.
    first_char = token[:1]
    return {
        'token': token,
        'is_first': i == 0,
        'is_last': i == last_index,
        # True whenever upper() leaves the first character unchanged, so
        # digits and punctuation also count as "capitalized".
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': token.upper() == token,
        'is_all_lower': token.lower() == token,
        'is_numeric': token.isdigit(),
        'prefix-1': first_char,
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': '' if i == 0 else sentence[i - 1],
        'next_word': '' if i == last_index else sentence[i + 1]
    }

In [95]:
# Flatten the corpus into two parallel lists: one feature dict per token (X)
# and the corresponding gold tag (Y).
X = []
Y = []

for tagged_sentence in tagged_sentences:
    # Strip the tags once per sentence. Rebuilding this list for every token
    # made the loop quadratic in sentence length.
    words = [word for word, _ in tagged_sentence]
    for i, (_, tag) in enumerate(tagged_sentence):
        X.append(extract_features(words, i))
        Y.append(tag)

In [96]:
# Hold out 20% of the tokens for evaluation. The fixed random_state makes the
# partition -- and therefore the reported accuracy -- repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Model Training

In [97]:
# Number of training samples actually used for fitting.
# NOTE(review): presumably capped to keep the dense (sparse=False) feature
# matrix manageable in memory -- confirm before raising it.
N_TRAIN_SAMPLES = 10000

classifier = Pipeline([
    # Turns each feature dict into a numeric vector.
    ('vectorizer', DictVectorizer(sparse=False)),
    # random_state fixes the tree's internal randomness so repeated runs
    # build the same model.
    ('classifier', DecisionTreeClassifier(criterion='entropy', random_state=42)),
])

classifier.fit(X_train[:N_TRAIN_SAMPLES], Y_train[:N_TRAIN_SAMPLES])

Pipeline(steps=[('vectorizer', DictVectorizer(sparse=False)),
                ('classifier', DecisionTreeClassifier(criterion='entropy'))])

# Model Evaluation

In [98]:
# Score the fitted pipeline on the held-out tokens.
test_accuracy = classifier.score(X_test, Y_test)
print("accuracy", test_accuracy)

accuracy 0.8987882399682161


In [99]:
def pos_tag(tokens):
    """Tag each token with the part of speech predicted by ``classifier``.

    Args:
        tokens: Sequence of token strings forming one sentence.

    Returns:
        An iterator (``zip``) of ``(token, tag)`` pairs in sentence order;
        it is empty when ``tokens`` is empty.
    """
    # An empty sentence has no samples to vectorize, and the pipeline rejects
    # an empty batch, so short-circuit with an empty result of the same type.
    if len(tokens) == 0:
        return zip(tokens, [])
    features = [extract_features(tokens, i) for i in range(len(tokens))]
    return zip(tokens, classifier.predict(features))

# Sample output of the PoS tagger

In [100]:
# Tokenize a demo sentence, tag it, and materialize the lazy zip for printing.
sample_tokens = word_tokenize('This is my friend, John.')
print(list(pos_tag(sample_tokens)))

[('This', 'DT'), ('is', 'VBZ'), ('my', 'PRP$'), ('friend', 'NN'), (',', ','), ('John', 'NNP'), ('.', '.')]
