In [22]:
import numpy as np
import nltk
from nltk import DefaultTagger as df
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from nltk import TrigramTagger as tg

# load the tagged sentences of treebank data
treebank_tagged_sentences = nltk.corpus.treebank.tagged_sents()

# Split the dataset for training and testing
cutoff = int(.80 * len(treebank_tagged_sentences))
treebank_train_sents = treebank_tagged_sentences[:cutoff]
treebank_test_sents = treebank_tagged_sentences[cutoff:]
print len(treebank_train_sents)
print len(treebank_test_sents)

# load the tagged sentences of brown data
brown_tagged_sentences = nltk.corpus.brown.tagged_sents()[:len(treebank_tagged_sentences)]

# Split the dataset for training and testing
cutoff = int(.80 * len(brown_tagged_sentences))
brown_train_sents = brown_tagged_sentences[:cutoff]
brown_test_sents = brown_tagged_sentences[cutoff:]
print len(brown_train_sents)
print len(brown_test_sents)


3131
783
3131
783


In [23]:
#function to strip the tags from our tagged corpus and feed it to our classifier
def untag(tagged_sentence):
    return [w for w,t in tagged_sentence]

#define what features to use
def features(sentence, index):
    """ sentence:[w1, w2, ...], index: th index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

#build the training set by a trasformation operation
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    """ X stores the features state of each word in target sentences set, y stores the coresponding tags """
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y

X_treebank_train, y_treebank_train = transform_to_dataset(treebank_train_sents)
X_brown_train,y_brown_train = transform_to_dataset(brown_train_sents) 

In [24]:
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
 
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', GaussianNB())
])

X_treebank_test, y_treebank_test = transform_to_dataset(treebank_test_sents)
X_brown_test, y_brown_test = transform_to_dataset(brown_test_sents)

print('training ...')
clf.fit(X_treebank_train[:10000], y_treebank_train[:10000])  
print('Done')
gaussian_treebank = clf.score(X_treebank_test, y_treebank_test)
print "Accuracy of treebank data:", gaussian_treebank

print('training ...')
clf.fit(X_brown_train[:10000], y_brown_train[:10000])
print('Done')
gaussian_brown = clf.score(X_brown_test, y_brown_test)
print "Accuracy of brown data:", gaussian_brown

training ...
Done
Accuracy of treebank data: 0.852088427566
training ...
Done
Accuracy of brown data: 0.709691629956


In [32]:
#Pretrained Pos tagger model accuracy
Pretrained_tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')

pre_tree = Pretrained_tagger.evaluate(treebank_tagged_sentences)
print pre_tree
pre_brown = Pretrained_tagger.evaluate(brown_tagged_sentences)
print pre_brown

0.995689141404
0.570638842267


In [26]:
##Rule-based POS taggers
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'),               
             (r'.*s$', 'NNS'), (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]

#Training model for treebank
def_model_t = nltk.DefaultTagger('NN')
uni_model_t = nltk.UnigramTagger(treebank_train_sents)
bi_model_t = nltk.BigramTagger(treebank_train_sents)
tri_model_t = nltk.TrigramTagger(treebank_train_sents)
regexp_model_t = nltk.RegexpTagger(patterns)

print "Performance of treebank"
# performance of Default Tagger
model3_1_t = def_model_t.evaluate(treebank_test_sents)
print(model3_1_t)
print()
# performance of Unigram Tagger
model3_2_t = uni_model_t.evaluate(treebank_test_sents)
print(model3_2_t)
print()
# performance of Bigram Tagger
model3_3_t = bi_model_t.evaluate(treebank_test_sents)
print(model3_3_t)
print()
# performance of Trigram Tagger
model3_4_t = tri_model_t.evaluate(treebank_test_sents)
print(model3_4_t)
print()
# performance of Regex Tagger
model3_5_t = regexp_model_t.evaluate(treebank_test_sents)
print(model3_5_t)
print()




Performance of treebank
0.144767702979
()
0.862617895105
()
0.113329008434
()
0.0670692150307
()
0.24232746145
()


In [27]:
#Training model for brown
def_model_b = nltk.DefaultTagger('NN')
uni_model_b = nltk.UnigramTagger(brown_train_sents)
bi_model_b = nltk.BigramTagger(brown_train_sents)
tri_model_b = nltk.TrigramTagger(brown_train_sents)
regexp_model_b = nltk.RegexpTagger(patterns)

print "Performance of brown"
# performance of Default Tagger
model3_1_b = def_model_b.evaluate(brown_test_sents)
print(model3_1_b)
print()
# performance of Unigram Tagger
model3_2_b = uni_model_b.evaluate(brown_test_sents)
print(model3_2_b)
print()
# performance of Bigram Tagger
model3_3_b = bi_model_b.evaluate(brown_test_sents)
print(model3_3_b)
print()
# performance of Trigram Tagger
model3_4_b = tri_model_b.evaluate(brown_test_sents)
print(model3_4_b)
print()
# performance of Regex Tagger
model3_5_b = regexp_model_b.evaluate(brown_test_sents)
print(model3_5_b)
print()


Performance of brown
0.127422907489
()
0.791960352423
()
0.0883259911894
()
0.0574339207048
()
0.184691629956
()


In [28]:
import plotly
plotly.__version__

'2.2.1'

In [33]:
import plotly.plotly as py
import plotly.graph_objs as go

#Plot the result for model1 and model2 performanced in treebank and brown data
trace1 = go.Bar(
            x=['Model1: My Pos tagger', 'Model2: Pretrained Pos tagger'],
            y=[gaussian_treebank, pre_tree],
            name = 'TreeBank'
    )
trace2 = go.Bar(
            x=['Model1: My Pos tagger', 'Model2: Pretrained Pos tagger'],
            y=[gaussian_brown, pre_brown],
            name = 'Brown'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Performance of model1&2 in Treebank and Brown',
    barmode = 'group'
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename='Performance of Treebank and Brown on model1&2 ')

In [34]:
trace1 = go.Bar(
            x=['Model3.1', 'Model3.2','Model3.3','Model3.4','Model3.5'],
            y=[model3_1_t, model3_2_t,model3_3_t,model3_4_t,model3_5_t],
            name = 'TreeBank'
    )
trace2 = go.Bar(
            x=['Model3.1', 'Model3.2','Model3.3','Model3.4','Model3.5'],
            y=[model3_1_b, model3_2_b,model3_3_b,model3_4_b,model3_5_b],
            name = 'Brown'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Performance of Rule-based model in Treebank and Brown',
    barmode = 'group'
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename='Performance of model3 ')