## Inbuilt PoS Tagging

In [1]:
import nltk
import re
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sejbp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sejbp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## English

In [2]:
# Tokenize the text
text = "I am going to the park with my friends tomorrow in the evening and we will play football together"
tokens = nltk.word_tokenize(text)

In [3]:
# PoS tagging
tagged_tokens = nltk.pos_tag(tokens)
tagged_tokens

[('I', 'PRP'),
 ('am', 'VBP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('park', 'NN'),
 ('with', 'IN'),
 ('my', 'PRP$'),
 ('friends', 'NNS'),
 ('tomorrow', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('evening', 'NN'),
 ('and', 'CC'),
 ('we', 'PRP'),
 ('will', 'MD'),
 ('play', 'VB'),
 ('football', 'NN'),
 ('together', 'RB')]

In [4]:
nltk.download('indian')
from nltk.tag import tnt

[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\sejbp\AppData\Roaming\nltk_data...
[nltk_data]   Package indian is already up-to-date!


## Hindi

In [5]:
# Tokenize the text
text = "ऑनलाइन शिक्षा में शिक्षक अपने छात्रों से संपर्क साधने के लिए स्काइप, ज़ूम आदि ऐप्प के माध्यम से जुड़ते है"
tokens = nltk.word_tokenize(text)

In [6]:
train_data = nltk.corpus.indian.tagged_sents('hindi.pos')
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)

In [7]:
tagged_tokens = tnt_pos_tagger.tag(tokens)
tagged_tokens

[('ऑनलाइन', 'Unk'),
 ('शिक्षा', 'Unk'),
 ('में', 'PREP'),
 ('शिक्षक', 'Unk'),
 ('अपने', 'PRP'),
 ('छात्रों', 'Unk'),
 ('से', 'PREP'),
 ('संपर्क', 'Unk'),
 ('साधने', 'Unk'),
 ('के', 'PREP'),
 ('लिए', 'PREP'),
 ('स्काइप', 'Unk'),
 (',', 'PUNC'),
 ('ज़ूम', 'Unk'),
 ('आदि', 'RP'),
 ('ऐप्प', 'Unk'),
 ('के', 'PREP'),
 ('माध्यम', 'NN'),
 ('से', 'PREP'),
 ('जुड़ते', 'Unk'),
 ('है', 'VAUX')]

## Regular Expression based PoS Tagging

In [8]:
def pos_tag(sentence):
    tags = []
    words = sentence.split()
    for word in words:
        if re.match(r'^(I|me|you|he|him|she|her|it|we|us|they|them)$', word):  # Pronoun
            tags.append('PRP')
        elif re.match(r'^[A-Z][a-z]*$', word):  # Proper Noun
            tags.append('NNP')
        elif re.match(r'^to$', word):  # to
            tags.append('TO')
        elif re.match(r'^[A-Z][a-z]*$', word):  # Noun
            tags.append('NN')
        elif re.match(r'^(on|in|at|by|for|with|of|to)$', word):  # Preposition
            tags.append('IN')
        elif re.match(r'^(and|or|but|while|if|then)$', word):  # Conjunction
            tags.append('CC')
        elif re.match(r'^(a|an|the)$', word):  # Determiner
            tags.append('DT')
        elif re.match(r'^[a-z]+(ing)$', word):  # Verb (Gerund)
            tags.append('VBG')
        elif re.match(r'^[a-z]+$', word):  # Verb (Base Form)
            tags.append('VB')
        elif re.match(r'^[a-z]+ly$', word):  # Adverb
            tags.append('RB')
        elif re.match(r'^[a-z]+(ous|ful|ive|ish|able)$', word):  # Adjective
            tags.append('JJ')
        elif re.match(r'^[.,?!]$', word):  # Punctuation
            tags.append('PUNC')
        elif re.match(r'^[0-9]+$', word):  # Cardinal Number
            tags.append('CD')
        elif re.match(r'^[A-Z]+$', word):  # Foreign Word or Abbreviation
            tags.append('FW')
        else:
            tags.append('UNKNOWN')
    return list(zip(words, tags))

In [9]:
sentence = "I am going to the park with my friends tomorrow in the evening and we will play football together"
pos_tag_regex = pos_tag(sentence)
pos_tag_regex

[('I', 'PRP'),
 ('am', 'VB'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('park', 'VB'),
 ('with', 'IN'),
 ('my', 'VB'),
 ('friends', 'VB'),
 ('tomorrow', 'VB'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('evening', 'VBG'),
 ('and', 'CC'),
 ('we', 'PRP'),
 ('will', 'VB'),
 ('play', 'VB'),
 ('football', 'VB'),
 ('together', 'VB')]

## Dictionary-based PoS tagging

In [17]:
pos_dict = {
    "I": "PRP", "am": "VBP", "going": "VBG", "to": "TO",
    "the": "DT", "park": "NN", "with": "IN", "my": "PRP$",
    "friends": "NNS", "tomorrow": "NN", "in": "IN", "the": "DT",
    "evening": "NN", "and": "CC", "we": "PRP", "will": "MD",
    "play": "VB", "football": "NN", "together": "RB"
}

def pos_tag(sentence):
    words = sentence.split()
    tagged_sentence = []
    for word in words:
        if word in pos_dict:
            tagged_sentence.append((word, pos_dict[word]))
        else:
            tagged_sentence.append((word, "UNKNOWN"))
    return tagged_sentence

In [18]:
sentence = "I am going to the park with my friends tomorrow in the evening and we will play football together."
tagged_sentence = pos_tag(sentence)
tagged_sentence

[('I', 'PRP'),
 ('am', 'VBP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('park', 'NN'),
 ('with', 'IN'),
 ('my', 'PRP$'),
 ('friends', 'NNS'),
 ('tomorrow', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('evening', 'NN'),
 ('and', 'CC'),
 ('we', 'PRP'),
 ('will', 'MD'),
 ('play', 'VB'),
 ('football', 'NN'),
 ('together.', 'UNKNOWN')]

## N-gram model based PoS tagging

In [19]:
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

# Load the Treebank corpus
nltk.download('treebank')
tagged_sentences = treebank.tagged_sents()

# Split the tagged sentences into training and testing sets
train_size = int(0.8 * len(tagged_sentences))
train_sents = tagged_sentences[:train_size]
test_sents = tagged_sentences[train_size:]

unigram_tagger = UnigramTagger(train_sents)
bigram_tagger = BigramTagger(train_sents)
trigram_tagger = TrigramTagger(train_sents)

# Evaluate taggers on test data
print("Unigram Tagger Accuracy:", unigram_tagger.evaluate(test_sents))
print("Bigram Tagger Accuracy:", bigram_tagger.evaluate(test_sents))
print("Trigram Tagger Accuracy:", trigram_tagger.evaluate(test_sents))

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\sejbp\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Unigram Tagger Accuracy:", unigram_tagger.evaluate(test_sents))


Unigram Tagger Accuracy: 0.8608213982733669


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Bigram Tagger Accuracy:", bigram_tagger.evaluate(test_sents))


Bigram Tagger Accuracy: 0.1132791057437996


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Trigram Tagger Accuracy:", trigram_tagger.evaluate(test_sents))


Trigram Tagger Accuracy: 0.06736863116922003


In [20]:
sentence = "I am going to the park with my friends tomorrow in the evening and we will play football together"

# Tag the sample sentence using each tagger
print("\nTagging the sample sentence using each tagger:")
print("Unigram Tagger:", unigram_tagger.tag(nltk.word_tokenize(sentence)))
print("Bigram Tagger:", bigram_tagger.tag(nltk.word_tokenize(sentence)))
print("Trigram Tagger:", trigram_tagger.tag(nltk.word_tokenize(sentence)))


Tagging the sample sentence using each tagger:
Unigram Tagger: [('I', 'PRP'), ('am', None), ('going', 'VBG'), ('to', 'TO'), ('the', 'DT'), ('park', 'NN'), ('with', 'IN'), ('my', 'PRP$'), ('friends', 'NNS'), ('tomorrow', 'NN'), ('in', 'IN'), ('the', 'DT'), ('evening', 'NN'), ('and', 'CC'), ('we', 'PRP'), ('will', 'MD'), ('play', 'VB'), ('football', 'NN'), ('together', 'RB')]
Bigram Tagger: [('I', 'PRP'), ('am', None), ('going', None), ('to', None), ('the', None), ('park', None), ('with', None), ('my', None), ('friends', None), ('tomorrow', None), ('in', None), ('the', None), ('evening', None), ('and', None), ('we', None), ('will', None), ('play', None), ('football', None), ('together', None)]
Trigram Tagger: [('I', 'PRP'), ('am', None), ('going', None), ('to', None), ('the', None), ('park', None), ('with', None), ('my', None), ('friends', None), ('tomorrow', None), ('in', None), ('the', None), ('evening', None), ('and', None), ('we', None), ('will', None), ('play', None), ('football', 