<a href="https://colab.research.google.com/github/Sammodi0711/NLP-Sem-1/blob/main/S25MCAG0019_lab4_b1_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Q1. Preprocessing and Tokenization
# •	Load the news category from the Brown corpus.
# •	Clean the text (lowercasing, removing punctuation using re).
# •	Tokenize the text into sentences and words.
# •	Report:
# 1.	Total number of sentences.
# 2.	Total number of words.
# 3.	Vocabulary size (unique words).

import nltk
import re
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the corpus and the Punkt sentence-tokenizer models used below.
nltk.download('brown')
nltk.download('punkt_tab')

# Brown ships pre-tokenized; re-join the tokens so the text can be cleaned
# and re-tokenized as one string.
news_text = brown.words(categories='news')
news_raw = " ".join(news_text)

# Keep only ASCII letters and whitespace (this also drops digits).
non_letter_pattern = r'[^a-zA-Z\s]'

# Sentence-split BEFORE stripping punctuation: once '.', '?' and '!' are
# removed the sentence tokenizer has no boundaries left to find and returns
# the whole corpus as a single "sentence".
raw_sentences = sent_tokenize(news_raw)
sentences = [re.sub(non_letter_pattern, '', s.lower()).strip() for s in raw_sentences]
# Discard sentences that consisted solely of punctuation/digits.
sentences = [s for s in sentences if s]

# Word-level statistics are computed on the cleaned, lower-cased text.
clean_text = re.sub(non_letter_pattern, '', news_raw.lower())
words = word_tokenize(clean_text)

total_sentences = len(sentences)
total_words = len(words)
vocab_size = len(set(words))

print("Total number of sentences:", total_sentences)
print("Total number of words:", total_words)
print("Vocabulary size (unique words):", vocab_size)

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Total number of sentences: 1
Total number of words: 87019
Vocabulary size (unique words): 12131


In [None]:
# Q2. Building N-grams
# •	Write Python functions to generate unigrams, bigrams, and trigrams from the corpus tokens.
# •	Print the 10 most frequent bigrams and trigrams along with their counts.

from nltk.util import ngrams
from nltk import FreqDist

# Rebuild the cleaned token stream for the n-gram cells:
# corpus words -> one string -> lower-case -> letters/whitespace only -> tokens.
news_text = brown.words(categories='news')
news_raw = " ".join(news_text)
clean_text = re.compile(r'[^a-zA-Z\s]').sub('', news_raw.lower())
tokens = word_tokenize(clean_text)

def generate_ngrams(tokens, n):
    """Return every contiguous window of ``n`` tokens as a list of tuples.

    Args:
        tokens: Any finite iterable of tokens (it is materialised once).
        n: Window size; must be a positive integer.

    Returns:
        A list of ``n``-tuples in corpus order. The list is empty when the
        input holds fewer than ``n`` tokens. No padding symbols are added.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    seq = list(tokens)
    # zip stops at the shortest shifted copy, which yields exactly the
    # len(seq) - n + 1 sliding windows.
    return list(zip(*(seq[offset:] for offset in range(n))))


def generate_unigrams(tokens):
    """Return the tokens as a list of 1-tuples."""
    return generate_ngrams(tokens, 1)


def generate_bigrams(tokens):
    """Return all adjacent token pairs as a list of 2-tuples."""
    return generate_ngrams(tokens, 2)


def generate_trigrams(tokens):
    """Return all adjacent token triples as a list of 3-tuples."""
    return generate_ngrams(tokens, 3)

# Materialise the three n-gram lists from the cleaned token stream.
unigrams, bigrams, trigrams = (
    generate_unigrams(tokens),
    generate_bigrams(tokens),
    generate_trigrams(tokens),
)

# Frequency tables keyed by n-gram tuple.
bigram_freq, trigram_freq = FreqDist(bigrams), FreqDist(trigrams)

# Report the ten most frequent entries of each table, bigrams first.
for heading, freq_table in (
    ("Top 10 Bigrams:", bigram_freq),
    ("\nTop 10 Trigrams:", trigram_freq),
):
    print(heading)
    for gram, occurrences in freq_table.most_common(10):
        print(gram, ":", occurrences)

Top 10 Bigrams:
('of', 'the') : 850
('in', 'the') : 610
('to', 'the') : 279
('on', 'the') : 254
('for', 'the') : 223
('at', 'the') : 199
('will', 'be') : 157
('that', 'the') : 149
('with', 'the') : 142
('and', 'the') : 141

Top 10 Trigrams:
('one', 'of', 'the') : 44
('mr', 'and', 'mrs') : 42
('the', 'united', 'states') : 37
('members', 'of', 'the') : 28
('president', 'of', 'the') : 22
('a', 'number', 'of') : 19
('the', 'white', 'house') : 19
('as', 'a', 'result') : 18
('some', 'of', 'the') : 18
('the', 'u', 's') : 17


In [None]:
# Q3. Calculating Conditional Probabilities
# •	Using the Markov Chain assumption and conditional probability, calculate:
# o	P (word₂ | word₁) for bigrams
# o	P (word₃ | word₁, word₂) for trigrams
# •	Write a function:
# def bigram_prob (w1, w2, corpus):
#      # returns P(w2|w1)

# N-gram lists over the cleaned token stream (orders 2 and 3).
bigrams, trigrams = (list(ngrams(tokens, order)) for order in (2, 3))

# Count tables used as numerators/denominators by the probability functions.
unigram_freq, bigram_freq, trigram_freq = map(FreqDist, (tokens, bigrams, trigrams))

def bigram_prob(w1, w2, corpus=None):
    """Maximum-likelihood estimate of P(w2 | w1) = Count(w1, w2) / Count(w1).

    Args:
        w1: The conditioning (previous) word.
        w2: The word whose conditional probability is wanted.
        corpus: Optional iterable of tokens to count over. When omitted, the
            module-level ``bigram_freq`` and ``unigram_freq`` tables are used.

    Returns:
        The estimate as a float; ``0.0`` when ``w1`` never occurs.
    """
    if corpus is None:
        pair_count = bigram_freq[(w1, w2)]
        context_count = unigram_freq[w1]
    else:
        # One pass over the supplied tokens; same counting scheme as the
        # global tables (adjacent pairs / single-token occurrences).
        seq = list(corpus)
        pair_count = sum(1 for a, b in zip(seq, seq[1:]) if a == w1 and b == w2)
        context_count = seq.count(w1)
    if context_count == 0:
        # Unseen context: avoid dividing by zero and keep the return type float.
        return 0.0
    return pair_count / context_count

def trigram_prob(w1, w2, w3, corpus=None):
    """Maximum-likelihood estimate of P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2).

    Args:
        w1: First word of the conditioning context.
        w2: Second word of the conditioning context.
        w3: The word whose conditional probability is wanted.
        corpus: Optional iterable of tokens to count over. When omitted, the
            module-level ``trigram_freq`` and ``bigram_freq`` tables are used.

    Returns:
        The estimate as a float; ``0.0`` when the pair (w1, w2) never occurs.
    """
    if corpus is None:
        triple_count = trigram_freq[(w1, w2, w3)]
        context_count = bigram_freq[(w1, w2)]
    else:
        # One pass over the supplied tokens; same counting scheme as the
        # global tables (adjacent triples / adjacent pairs).
        seq = list(corpus)
        triple_count = sum(
            1 for a, b, c in zip(seq, seq[1:], seq[2:])
            if a == w1 and b == w2 and c == w3
        )
        context_count = sum(1 for a, b in zip(seq, seq[1:]) if a == w1 and b == w2)
    if context_count == 0:
        # Unseen context: avoid dividing by zero and keep the return type float.
        return 0.0
    return triple_count / context_count

# Worked examples: one bigram and one trigram conditional probability.
p_the_after_in = bigram_prob("in", "the")
print("P('the' | 'in') =", p_the_after_in)
p_states_after_the_united = trigram_prob("the", "united", "states")
print("P('states' | 'the','united') =", p_states_after_the_united)

P('the' | 'in') = 0.30198019801980197
P('states' | 'the','united') = 0.74


In [None]:
# Q4. Sentence Probability
# •	Write a function to calculate the probability of a given sentence using:
# o	Bigram model
# •	Example sentence: "the president of the company"
# •	Compare both results.

# Count tables backing the bigram sentence model: adjacent-pair counts and
# single-token counts over the cleaned token stream.
bigram_freq = FreqDist(ngrams(tokens, 2))
unigram_freq = FreqDist(tokens)

def bigram_prob(w1, w2, corpus=None):
    """Maximum-likelihood estimate of P(w2 | w1) = Count(w1, w2) / Count(w1).

    Args:
        w1: The conditioning (previous) word.
        w2: The word whose conditional probability is wanted.
        corpus: Optional iterable of tokens to count over. When omitted, the
            module-level ``bigram_freq`` and ``unigram_freq`` tables are used.

    Returns:
        The estimate as a float; ``0.0`` when ``w1`` never occurs.
    """
    if corpus is None:
        pair_count = bigram_freq[(w1, w2)]
        context_count = unigram_freq[w1]
    else:
        # One pass over the supplied tokens; same counting scheme as the
        # global tables (adjacent pairs / single-token occurrences).
        seq = list(corpus)
        pair_count = sum(1 for a, b in zip(seq, seq[1:]) if a == w1 and b == w2)
        context_count = seq.count(w1)
    if context_count == 0:
        # Unseen context: avoid dividing by zero and keep the return type float.
        return 0.0
    return pair_count / context_count

def sentence_prob_bigram(sentence):
    """Probability of ``sentence`` under the bigram (first-order Markov) model.

    The result is the product of P(w_i | w_{i-1}) over consecutive token
    pairs; no start-of-sentence term is included.

    Args:
        sentence: Raw sentence text.

    Returns:
        The product as a float. ``0.0`` as soon as any transition has zero
        probability. A sentence with fewer than two tokens has no transitions
        and therefore yields the empty product ``1.0``.
    """
    # Normalise the input the same way the model tokens were built
    # (lower-case, letters and whitespace only); otherwise punctuation or
    # digits in the sentence become tokens the model never saw and force 0.
    normalized = re.sub(r'[^a-zA-Z\s]', '', sentence.lower())
    words = word_tokenize(normalized)
    prob = 1.0
    for prev_word, word in zip(words, words[1:]):
        p = bigram_prob(prev_word, word)
        if p == 0:
            # One impossible transition zeroes the whole product; stop early
            # and keep the return type float.
            return 0.0
        prob *= p
    return prob

# Score the assignment's example sentence with the bigram model.
sent = "the president of the company"
print("Sentence:", sent)
bigram_sentence_probability = sentence_prob_bigram(sent)
print("Bigram model probability:", bigram_sentence_probability)

Sentence: the president of the company
Bigram model probability: 9.05010204014703e-07
