In [3]:
import random

import numpy as np

# Single seed shared by every random number generator used in this notebook.
CUSTOM_SEED = 42
np.random.seed(CUSTOM_SEED)
# np.random.seed does not touch the stdlib generator, and a later cell samples
# a sentence with random.choice, so seed that generator as well.
random.seed(CUSTOM_SEED)

In [4]:
import nltk
# Fetch the 'treebank' corpus package into the local nltk_data directory;
# when the package is already present NLTK just reports it as up to date.
nltk.download('treebank')

[nltk_data] Downloading package treebank to /Users/somya/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [5]:
from nltk.corpus import treebank
# Tagged sentences of the treebank corpus with tags mapped to the 'universal'
# tagset; each sentence is a sequence of (word, tag) pairs.
sentences = treebank.tagged_sents(tagset='universal')

In [6]:
import random

# Show one arbitrarily picked tagged sentence as a quick sanity check.
sample_sentence = random.choice(sentences)
print(sample_sentence)

[(u'But', u'CONJ'), (u'it', u'PRON'), (u'is', u'VERB'), (u'Mr.', u'NOUN'), (u'Lane', u'NOUN'), (u',', u'.'), (u'as', u'ADP'), (u'movie', u'NOUN'), (u'director', u'NOUN'), (u',', u'.'), (u'producer', u'NOUN'), (u'and', u'CONJ'), (u'writer', u'NOUN'), (u',', u'.'), (u'who', u'PRON'), (u'*T*-65', u'X'), (u'has', u'VERB'), (u'been', u'VERB'), (u'obsessed', u'VERB'), (u'with', u'ADP'), (u'*', u'X'), (u'refitting', u'VERB'), (u'Chaplin', u'NOUN'), (u"'s", u'PRT'), (u'Little', u'NOUN'), (u'Tramp', u'NOUN'), (u'in', u'ADP'), (u'a', u'DET'), (u'contemporary', u'ADJ'), (u'way', u'NOUN'), (u'.', u'.')]


In [7]:
# Distinct tags of the treebank corpus in its default tagset.
# NOTE(review): `sentences` was loaded with tagset='universal', whereas this
# inventory uses the corpus' default tags -- confirm which one is wanted.
tags = set([tag for sentence in treebank.tagged_sents() for _, tag in sentence])
# '\n' puts the tag listing on its own line; the separator used to be a
# literal 'n', which glued the count and the label together ("46ntags").
print('nb_tags: %s\ntags: %s' % (len(tags), tags))

nb_tags: 46ntags: set([u'PRP$', u'VBG', u'VBD', u'``', u'VBN', u'POS', u"''", u'VBP', u'WDT', u'JJ', u'WP', u'VBZ', u'DT', u'#', u'RP', u'$', u'NN', u'FW', u',', u'.', u'TO', u'PRP', u'RB', u'-LRB-', u':', u'NNS', u'NNP', u'VB', u'WRB', u'CC', u'LS', u'PDT', u'RBS', u'RBR', u'CD', u'-NONE-', u'EX', u'IN', u'WP$', u'MD', u'NNPS', u'-RRB-', u'JJS', u'JJR', u'SYM', u'UH'])


In [9]:
# First tagged sentence of the corpus. The call form of print with a single
# argument behaves the same on Python 2 and Python 3.
print(sentences[0])

[(u'Pierre', u'NOUN'), (u'Vinken', u'NOUN'), (u',', u'.'), (u'61', u'NUM'), (u'years', u'NOUN'), (u'old', u'ADJ'), (u',', u'.'), (u'will', u'VERB'), (u'join', u'VERB'), (u'the', u'DET'), (u'board', u'NOUN'), (u'as', u'ADP'), (u'a', u'DET'), (u'nonexecutive', u'ADJ'), (u'director', u'NOUN'), (u'Nov.', u'NOUN'), (u'29', u'NUM'), (u'.', u'.')]


In [8]:
# Hold out the last 20% of the corpus for testing.
train_test_cutoff = int(.80 * len(sentences))
testing_sentences = sentences[train_test_cutoff:]
train_pool = sentences[:train_test_cutoff]

# The first 25% of the remaining sentences become validation data,
# everything after that is used for training.
train_val_cutoff = int(.25 * len(train_pool))
validation_sentences, training_sentences = (
    train_pool[:train_val_cutoff],
    train_pool[train_val_cutoff:],
)

In [None]:
# Call form of print so the cell runs on both Python 2 and Python 3.
print(training_sentences)

In [None]:
def add_basic_features(sentence_terms, index):
    """ Compute some very basic word features.

        The features describe the term itself (casing, prefixes and suffixes
        of up to three characters), its position in the sentence and its
        immediate neighbours. Slices are used instead of single-character
        indexing so that an empty term yields empty prefix/suffix features
        rather than raising IndexError.

        :param sentence_terms: [w1, w2, ...]
        :type sentence_terms: list
        :param index: the index of the word
        :type index: int
        :return: dict containing features
        :rtype: dict
        :raises IndexError: if index is out of range for sentence_terms
    """
    term = sentence_terms[index]
    last_index = len(sentence_terms) - 1
    # term[:1] is '' for an empty term, where term[0] would raise.
    first_char = term[:1]
    return {
        'nb_terms': len(sentence_terms),
        'term': term,
        'is_first': index == 0,
        'is_last': index == last_index,
        # True whenever upper-casing the first character leaves it unchanged,
        # so digits and punctuation count as "capitalized" too.
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': term.upper() == term,
        'is_all_lower': term.lower() == term,
        'prefix-1': first_char,
        'prefix-2': term[:2],
        'prefix-3': term[:3],
        'suffix-1': term[-1:],
        'suffix-2': term[-2:],
        'suffix-3': term[-3:],
        # Sentence boundaries are encoded as an empty neighbour.
        'prev_word': '' if index == 0 else sentence_terms[index - 1],
        'next_word': '' if index == last_index else sentence_terms[index + 1]
    }

In [None]:
def untag(tagged_sentence):
    """
    Strip the tags from a tagged sentence, keeping only the terms.

    :param tagged_sentence: a POS tagged sentence
    :type tagged_sentence: list
    :return: the terms of the sentence, in their original order
    :rtype: list of strings
    """
    terms = []
    for term, _tag in tagged_sentence:
        terms.append(term)
    return terms
 
def transform_to_dataset(tagged_sentences):
    """
    Split tagged sentences to X and y datasets and append some basic features.

    :param tagged_sentences: a list of POS tagged sentences
    :type tagged_sentences: list of list of tuples (term_i, tag_i)
    :return: (X, y) where X holds one feature dict per term and y the
        matching tags, both in the order the terms are encountered
    :rtype: tuple of (list of dict, list)
    """
    X, y = [], []

    for pos_tags in tagged_sentences:
        # Strip the tags once per sentence instead of once per term; the
        # untagged sentence is the same for every index.
        sentence_terms = untag(pos_tags)
        for index, (_, class_) in enumerate(pos_tags):
            # Add basic NLP features for each sentence term
            X.append(add_basic_features(sentence_terms, index))
            y.append(class_)
    return X, y

In [None]:
# Build the per-term feature dicts and tag lists for each of the three splits.
(X_train, y_train), (X_test, y_test), (X_val, y_val) = (
    transform_to_dataset(split)
    for split in (training_sentences, testing_sentences, validation_sentences)
)

In [None]:
# Preview the first few feature dicts and tags rather than dumping one entry
# per training token. Single-argument print calls work on Python 2 and 3.
print(X_train[:3])
print(y_train[:3])

In [None]:
from sklearn.feature_extraction import DictVectorizer

# The vectorizer is fitted on the feature dicts of all three splits together.
# NOTE(review): this also exposes test/validation features to the fit --
# confirm that is intended.
all_feature_dicts = X_train + X_test + X_val
dict_vectorizer = DictVectorizer(sparse=False)
dict_vectorizer.fit(all_feature_dicts)

In [None]:
# Turn the feature dicts of every split into vectors with the fitted vectorizer.
X_train, X_test, X_val = (
    dict_vectorizer.transform(feature_dicts)
    for feature_dicts in (X_train, X_test, X_val)
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# The encoder is fitted on the tags of all three splits together.
all_tags = y_train + y_test + y_val
label_encoder = LabelEncoder()
label_encoder.fit(all_tags)

In [None]:
# Map the tags of every split to their integer codes with the fitted encoder.
y_train, y_test, y_val = (
    label_encoder.transform(split_tags)
    for split_tags in (y_train, y_test, y_val)
)

In [None]:
# Call form of print so the cell runs on both Python 2 and Python 3.
print(X_train)

In [None]:
# Call form of print so the cell runs on both Python 2 and Python 3.
print(y_train)