In [None]:
import pandas as pd
import re
import nltk
from collections import defaultdict, Counter

# Fetch the NLTK resources used further down. The calls run at cell execution
# time and log their progress; the trailing `True` in the cell output is the
# value of the last call, displayed because it is the cell's final expression.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# The download log shows this one being unpacked under tokenizers/.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Load the tweet dataset from the current working directory and preview it.
# The preview shows two columns: `clean_text` (the tweet) and `category`
# (-1.0 / 0.0 / 1.0 in the rows displayed).
# NOTE(review): the first preview row contains mojibake ("â€œ"), which suggests
# the text was mis-decoded somewhere upstream -- confirm the CSV's encoding.
df = pd.read_csv("Twitter_Data.csv")
print(df.head())

                                          clean_text  category
0  when modi promised â€œminimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


In [None]:
def clean_tweet(text):
    """Normalise a tweet to lowercase text made of ASCII letters and whitespace.

    Parameters
    ----------
    text : object
        The raw tweet. Missing values (``None`` or a float NaN) produce an
        empty string instead of being stringified, so they no longer turn into
        the literal words "none" / "nan". Any other non-string value is
        converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The tweet with URLs, @mentions and #hashtags removed, every remaining
        character that is not an ASCII letter or whitespace dropped, and the
        result lowercased. Whitespace is not collapsed, so removed items can
        leave runs of spaces behind.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Order matters: URLs, mentions and hashtags are recognised by their marker
    # characters ("://", "@", "#"), which the final letters-only filter deletes.
    text = re.sub(r"http\S+", "", text)   # remove URLs
    text = re.sub(r"@\w+", "", text)      # remove mentions
    text = re.sub(r"#\w+", "", text)      # remove hashtags (marker and word)
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # keep ASCII letters and whitespace
    return text.lower()

# Run clean_tweet over every entry of the column, element by element, and
# store the result back under the same column name.
df["clean_text"] = df["clean_text"].map(clean_tweet)

In [None]:
# nltk was already imported in the first cell; repeating the import is harmless.
import nltk

# These two repeat downloads issued in the first cell; the log below reports
# both packages as already up-to-date.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# NOTE(review): presumably the tag-set help data; nothing in the visible cells
# reads it -- confirm whether it is still needed.
nltk.download('tagsets')
# Unpacked under taggers/ according to the log; presumably the model that
# nltk.pos_tag loads in the tagging cell -- confirm for the installed NLTK.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
# Number of tweets, taken from the top of the frame, to tokenize and POS-tag.
TAGGING_SAMPLE_SIZE = 500

# Tokenize first and drop tweets that produced no tokens (e.g. text that was
# cleaned down to an empty string); an empty token list would tag to an empty
# list anyway, so filtering here keeps the same sentences as filtering on tags.
tokenized_tweets = [
    tweet_tokens
    for tweet_tokens in map(
        nltk.word_tokenize, df["clean_text"].head(TAGGING_SAMPLE_SIZE)
    )
    if tweet_tokens
]

# Tag the whole batch with a single call instead of calling nltk.pos_tag once
# per tweet, so the tagger is set up once rather than on every iteration.
# Result: one list of (word, tag) tuples per retained tweet.
tagged_sentences = nltk.pos_tag_sents(tokenized_tweets)

print(tagged_sentences[0])


[('when', 'WRB'), ('modi', 'NN'), ('promised', 'VBD'), ('minimum', 'JJ'), ('government', 'NN'), ('maximum', 'JJ'), ('governance', 'NN'), ('expected', 'VBD'), ('him', 'PRP'), ('begin', 'VB'), ('the', 'DT'), ('difficult', 'JJ'), ('job', 'NN'), ('reforming', 'VBG'), ('the', 'DT'), ('state', 'NN'), ('why', 'WRB'), ('does', 'VBZ'), ('take', 'VB'), ('years', 'NNS'), ('get', 'VB'), ('justice', 'NN'), ('state', 'NN'), ('should', 'MD'), ('and', 'CC'), ('not', 'RB'), ('business', 'NN'), ('and', 'CC'), ('should', 'MD'), ('exit', 'VB'), ('psus', 'NN'), ('and', 'CC'), ('temples', 'NNS')]


In [None]:
# Extra directory NLTK should search when looking up resources.
# NOTE(review): the path is machine-specific, and this cell sits after the
# tokenizing/tagging cell, so it can only influence lookups made later on.
# NLTK's default search path on Unix-like systems may already contain this
# directory -- confirm whether the cell is needed at all.
EXTRA_NLTK_DATA_DIR = '/usr/local/share/nltk_data'

# Guarded so that re-running the cell does not keep appending duplicates.
if EXTRA_NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(EXTRA_NLTK_DATA_DIR)


In [None]:
# Tag-bigram counts: transition[prev_tag][next_tag] is how many times next_tag
# directly follows prev_tag inside a tagged sentence.
transition = defaultdict(Counter)

for tagged in tagged_sentences:
    # Pair every (word, tag) entry with its successor; a sentence with a
    # single token yields no pairs and therefore contributes nothing.
    for (_, prev_tag), (_, next_tag) in zip(tagged, tagged[1:]):
        transition[prev_tag][next_tag] += 1


In [None]:
# Turn each row of bigram counts into relative frequencies, so that
# transition_prob[prev_tag][next_tag] is count / (total count for prev_tag).
transition_prob = {}

for prev_tag, next_counts in transition.items():
    row_total = sum(next_counts.values())
    transition_prob[prev_tag] = {
        next_tag: count / row_total for next_tag, count in next_counts.items()
    }

# Show the first three rows as a quick sanity check.
print(list(transition_prob.items())[:3])


[('WRB', {'NN': 0.23958333333333334, 'VBZ': 0.010416666666666666, 'JJS': 0.010416666666666666, 'JJ': 0.20833333333333334, 'VBN': 0.052083333333333336, 'VBP': 0.08333333333333333, 'PRP': 0.0625, 'DT': 0.0625, 'RB': 0.09375, 'VB': 0.020833333333333332, 'NNS': 0.052083333333333336, 'MD': 0.0625, 'PRP$': 0.03125, 'EX': 0.010416666666666666}), ('NN', {'VBD': 0.061129090255303845, 'JJ': 0.05537576411362819, 'VBG': 0.02481121898597627, 'WRB': 0.011506652283351312, 'NN': 0.37648327939590076, 'MD': 0.03128371089536138, 'CC': 0.04782452355267889, 'PDT': 0.0010787486515641855, 'IN': 0.10176195613088818, 'VBP': 0.028047464940668825, 'RB': 0.05034160373966199, 'WP': 0.011147069399496584, 'WDT': 0.006112909025530385, 'VBZ': 0.042790363178712695, 'DT': 0.02624955052139518, 'NNS': 0.061848256023013304, 'VB': 0.015102481121898598, 'FW': 0.003595828838547285, 'PRP': 0.016181229773462782, 'PRP$': 0.005034160373966199, 'CD': 0.0025170801869830997, 'VBN': 0.010068320747932399, 'RP': 0.0010787486515641855, 

In [None]:
# Word counts per tag: emission[tag][word] is how many times the word was
# given that tag across all tagged sentences.
emission = defaultdict(Counter)

for token, pos in (pair for tagged in tagged_sentences for pair in tagged):
    emission[pos][token] += 1

# Normalise each tag's counts so that emission_prob[tag][word] is the share of
# that tag's occurrences accounted for by the word.
emission_prob = {}
for pos, token_counts in emission.items():
    tag_total = sum(token_counts.values())
    emission_prob[pos] = {
        token: count / tag_total for token, count in token_counts.items()
    }


In [None]:
# Frequency of every word across the tagged sentences (tags are ignored).
word_freq = Counter(
    token for tagged in tagged_sentences for token, _ in tagged
)

# Words seen exactly once, in order of first appearance.
rare = [token for token, occurrences in word_freq.items() if occurrences == 1]
print("Rare words:", rare[:10])


Rare words: ['maximum', 'difficult', 'reforming', 'temples', 'drama', 'main', 'relax', 'prefix', 'names', 'service']


In [None]:
# Print the first three tags together with their next-tag distributions.
for tag, next_tag_probs in list(transition_prob.items())[:3]:
    print(tag, next_tag_probs)


WRB {'NN': 0.23958333333333334, 'VBZ': 0.010416666666666666, 'JJS': 0.010416666666666666, 'JJ': 0.20833333333333334, 'VBN': 0.052083333333333336, 'VBP': 0.08333333333333333, 'PRP': 0.0625, 'DT': 0.0625, 'RB': 0.09375, 'VB': 0.020833333333333332, 'NNS': 0.052083333333333336, 'MD': 0.0625, 'PRP$': 0.03125, 'EX': 0.010416666666666666}
NN {'VBD': 0.061129090255303845, 'JJ': 0.05537576411362819, 'VBG': 0.02481121898597627, 'WRB': 0.011506652283351312, 'NN': 0.37648327939590076, 'MD': 0.03128371089536138, 'CC': 0.04782452355267889, 'PDT': 0.0010787486515641855, 'IN': 0.10176195613088818, 'VBP': 0.028047464940668825, 'RB': 0.05034160373966199, 'WP': 0.011147069399496584, 'WDT': 0.006112909025530385, 'VBZ': 0.042790363178712695, 'DT': 0.02624955052139518, 'NNS': 0.061848256023013304, 'VB': 0.015102481121898598, 'FW': 0.003595828838547285, 'PRP': 0.016181229773462782, 'PRP$': 0.005034160373966199, 'CD': 0.0025170801869830997, 'VBN': 0.010068320747932399, 'RP': 0.0010787486515641855, 'RBR': 0.00

In [None]:
# Tokenize a short sample sentence; the output below shows three tokens.
# NOTE(review): presumably the input for the Viterbi steps sketched in the
# following cells -- confirm against the rest of the notebook.
test = "love this movie"
tokens = nltk.word_tokenize(test)
print(tokens)


['love', 'this', 'movie']


In [None]:
# Viterbi initialisation, for the first word w1 and every candidate tag:
#   V1(tag) = P(tag) * P(w1 | tag)

In [None]:
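# Viterbi recursion, for the second word w2 (the max ranges over every possible previous tag tag1):
#   V2(tag2) = max over tag1 of [ V1(tag1) * P(tag2 | tag1) * P(w2 | tag2) ]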

In [None]:
# NOTE(review): a notebook cell displays only its final expression, so the bare
# `emission_prob` on the next line is evaluated and discarded; the output under
# this cell is transition_prob alone.
emission_prob
transition_prob


{'WRB': {'NN': 0.23958333333333334,
  'VBZ': 0.010416666666666666,
  'JJS': 0.010416666666666666,
  'JJ': 0.20833333333333334,
  'VBN': 0.052083333333333336,
  'VBP': 0.08333333333333333,
  'PRP': 0.0625,
  'DT': 0.0625,
  'RB': 0.09375,
  'VB': 0.020833333333333332,
  'NNS': 0.052083333333333336,
  'MD': 0.0625,
  'PRP$': 0.03125,
  'EX': 0.010416666666666666},
 'NN': {'VBD': 0.061129090255303845,
  'JJ': 0.05537576411362819,
  'VBG': 0.02481121898597627,
  'WRB': 0.011506652283351312,
  'NN': 0.37648327939590076,
  'MD': 0.03128371089536138,
  'CC': 0.04782452355267889,
  'PDT': 0.0010787486515641855,
  'IN': 0.10176195613088818,
  'VBP': 0.028047464940668825,
  'RB': 0.05034160373966199,
  'WP': 0.011147069399496584,
  'WDT': 0.006112909025530385,
  'VBZ': 0.042790363178712695,
  'DT': 0.02624955052139518,
  'NNS': 0.061848256023013304,
  'VB': 0.015102481121898598,
  'FW': 0.003595828838547285,
  'PRP': 0.016181229773462782,
  'PRP$': 0.005034160373966199,
  'CD': 0.002517080186983