In [6]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk import DefaultTagger as df
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from nltk import TrigramTagger as tg
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
nltk.download('treebank') 

clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

def features(sentence, index):
    return {
        'word': sentence[index],
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'prefix-1': sentence[index][0],
        'suffix-1': sentence[index][-1],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1]
    }

def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y


def training (training_sentences):

            #this function considers each sentences in the training set, then saves the features of each word in each        //sentences in X,
    #and saves the tag of all words in all sentences into y. So, we have the features of all words appeard in training //set in x and all tags in y.
    X, y = transform_to_dataset(training_sentences)
    size=100
    return clf.fit(X[:size], y[:size])

    
    #print(list(pos_tag(word_tokenize('Hello world, lets do something awesome today!'))))


def pos_tag(sentence, classifier):
    print('checking...')
    tagged_sentence = []
    tags = classifier.predict([features(sentence, index) for index in range(len(sentence))])    
    return zip(sentence, tags)


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\samin\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [12]:
x1 = treebank
x2 = brown 
#x3 = farsi

size = 100
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'),               
             (r'.*s$', 'NNS'), (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

annotated_sent1 = x1.tagged_sents()
cutoff1 = int(.75 * len(annotated_sent1)) #devide part of the corpus as training set and another part as test set
training_sentences1 = annotated_sent1[:cutoff1] 
test_sentences1 = annotated_sent1[cutoff1:]
X1,y1 = transform_to_dataset(test_sentences1)

model1 = training(training_sentences1) #clf_eng_treebank
model31 = nltk.DefaultTagger('NN')
model32 = nltk.UnigramTagger(train_sents)
model33 = nltk.BigramTagger(train_sents)
model34 = nltk.TrigramTagger(train_sents)
model35 = nltk.RegexpTagger(patterns)
performance11 = model1.score (X1,y1)
performance131 = model31.evaluate(test_sentences1)
performance132 = model32.evaluate(test_sentences1)
performance133 = model33.evaluate(test_sentences1)
performance134 = model34.evaluate(test_sentences1)
performance135 = model35.evaluate(test_sentences1)
print (performance131)
print (performance132)
print (performance133)
print (performance134)
print (performance135)

0.14325887835449141
0.29989554877068936
0.006990197653864695
0.005865338261288768
0.2421259842519685
