In [23]:
import string

import nltk
from nltk.corpus import brown
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [24]:
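# One-time setup: uncomment and run these lines if the Brown corpus or the
# universal tagset mapping has not been downloaded to this machine yet.
# nltk.download('brown')
# nltk.download('universal_tagset')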

In [25]:
# Brown corpus sentences as lists of (word, tag) pairs, requested with the
# 'universal' tagset.
tagged_sentences = brown.tagged_sents(tagset='universal')

# Build two parallel lists: sentences[k] holds the words of sentence k and
# tags[k] holds the tag of each of those words, in the same order.
sentences, tags = [], []
for tagged_sent in tagged_sentences:
    # zip(*pairs) transposes [(word, tag), ...] into (words...), (tags...).
    words, labels = map(list, zip(*tagged_sent))
    sentences.append(words)
    tags.append(labels)

In [26]:
def word2features(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Only the token itself is inspected; neighbouring tokens are not used.

    Args:
        sent: Sequence of token strings.
        i: Index of the token to describe.

    Returns:
        dict with the keys ``'word'`` (lower-cased token), ``'is_capitalized'``
        (first character is upper case), ``'is_digit'`` (``str.isdigit`` of the
        token) and ``'suffix-1'`` / ``'suffix-2'`` / ``'suffix-3'`` (the last
        one, two and three characters; shorter tokens yield the whole token).

    Raises:
        IndexError: If ``i`` is outside ``sent``.
    """
    word = sent[i]
    return {
        'word': word.lower(),
        # Slice rather than index so an empty token gives False instead of
        # raising IndexError; identical to word[0].isupper() otherwise.
        'is_capitalized': word[:1].isupper(),
        'is_digit': word.isdigit(),
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
    }


def extract_features(sentences):
    """Compute token-level feature dictionaries for a batch of sentences.

    Args:
        sentences: Iterable of tokenized sentences (each a sequence of tokens).

    Returns:
        A list with one entry per sentence; each entry is the list of
        ``word2features`` dictionaries for that sentence's tokens, in order.
    """
    return [
        [word2features(tokens, idx) for idx in range(len(tokens))]
        for tokens in sentences
    ]

In [27]:
# Per-sentence feature dictionaries and the parallel per-sentence tag lists.
X = extract_features(sentences)
y = tags

vectorizer = DictVectorizer(sparse=True)
# Flatten to one sample per token: the classifier below sees tokens
# individually, without sentence boundaries.
X_flat = [feat for sentence in X for feat in sentence]
y_flat = [tag for sentence_tags in y for tag in sentence_tags]

# NOTE(review): the vectorizer is fitted on every token *before* the
# train/test split below, so its feature vocabulary also covers tokens that
# end up in the test set — confirm this is acceptable for the reported scores.
X_vec = vectorizer.fit_transform(X_flat)

# Hold out 20% of the tokens for evaluation; the fixed random_state keeps the
# split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(X_vec, y_flat, test_size=0.2, random_state=42)

# `vectorizer` and `nb_model` are reused by predict_pos further down.
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

# Per-tag precision / recall / F1 on the held-out tokens.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           .       1.00      1.00      1.00     29719
         ADJ       0.80      0.84      0.82     16729
         ADP       0.93      0.88      0.91     29131
         ADV       0.81      0.84      0.83     11301
        CONJ       0.94      0.99      0.96      7618
         DET       0.97      0.98      0.98     27273
        NOUN       0.93      0.88      0.91     54909
         NUM       0.85      0.94      0.89      2964
        PRON       0.98      0.93      0.95      9825
         PRT       0.64      0.91      0.75      5991
        VERB       0.91      0.91      0.91     36503
           X       0.67      0.12      0.21       276

    accuracy                           0.91    232239
   macro avg       0.87      0.85      0.84    232239
weighted avg       0.92      0.91      0.92    232239



In [28]:
def predict_pos(sentence):
    """Tag every token of an already-tokenized sentence.

    Uses the module-level ``vectorizer`` and ``nb_model`` fitted earlier in
    the notebook.

    Args:
        sentence: Sequence of token strings.

    Returns:
        A list of ``(token, predicted_tag)`` pairs in input order; an empty
        list when ``sentence`` has no tokens.
    """
    # DictVectorizer.transform rejects an empty sample sequence, so there is
    # nothing to predict (and nothing to crash on) for an empty sentence.
    if len(sentence) == 0:
        return []
    sentence_features = [word2features(sentence, i) for i in range(len(sentence))]
    X_sentence = vectorizer.transform(sentence_features)
    y_pred = nb_model.predict(X_sentence)
    return list(zip(sentence, y_pred))


def clear_sentence(sentence):
    """Surround punctuation with spaces so ``str.split`` yields it as tokens.

    Every character listed in ``punc`` is padded with a space on both sides.
    Padding only the left side leaves punctuation glued to whatever follows
    it (``"(hello)"`` would split into ``"(hello"`` and ``")"``).

    Note that word-internal punctuation is separated as well, e.g.
    ``"don't"`` becomes the tokens ``don``, ``'``, ``t``.

    Args:
        sentence: Raw text.

    Returns:
        The text with spaces inserted around punctuation. Runs of spaces may
        appear; they are harmless for a following ``split()``.
    """
    # Raw string: the backslash before the quote is kept, so a literal
    # backslash is part of the punctuation set too.
    punc = r'!()-[]{};:\'",<>./?@#$%^&*_~'
    # Single pass over the text instead of one str.replace per character.
    padded = {ord(character): ' ' + character + ' ' for character in punc}
    return sentence.translate(padded)

In [29]:
# Demo: space out the punctuation of a sample sentence, split it on
# whitespace and print the (token, tag) pairs predicted by the model.
sentence = clear_sentence("SAm is our teacher, he gives us funny lessons")
sentence_list = sentence.split()
print(predict_pos(sentence_list))


[('SAm', np.str_('NOUN')), ('is', np.str_('VERB')), ('our', np.str_('DET')), ('teacher', np.str_('NOUN')), (',', np.str_('.')), ('he', np.str_('PRON')), ('gives', np.str_('VERB')), ('us', np.str_('PRON')), ('funny', np.str_('ADJ')), ('lessons', np.str_('NOUN'))]
