# Task 1

## Installation

In [1]:
!pip install nltk



## Imports

In [2]:
import nltk
import re
from collections import Counter
import pandas as pd
import string
from nltk.tokenize import TreebankWordTokenizer

## Extraction and Cleaning

In [3]:
tokenizer = TreebankWordTokenizer()

file_path = "transcript.txt"

# Pull every transcript line into memory before cleaning.
with open(file_path, "r", encoding="utf-8") as file:
    lines = file.readlines()


def _clean_line(raw_line):
    """Return one transcript line normalised for n-gram counting.

    Steps, in order: drop the leading ``<digits>_<digits>_<digits>``
    identifier, drop ``[...]`` and ``<...>`` spans, trim surrounding
    whitespace, lower-case, then delete every character that is neither a
    word character nor whitespace. That last step removes all punctuation
    (not only full stops); underscores count as word characters and stay.
    """
    without_id = re.sub(r"^\d+_\d+_\d+\s+", "", raw_line)
    without_markup = re.sub(r"\[.*?\]|\<.*?\>", "", without_id)
    normalised = without_markup.strip().lower()
    return re.sub(r'[^\w\s]','',normalised)


cleaned_sentences = [_clean_line(raw_line) for raw_line in lines]

cleaned_sentences[:5]



['okay lets see i want to go to a thai restaurant   with less than ten dollars per person',
 'i like to eat at lunch time  so that would be eleven a__m to one p__m',
 'i dont want to walk for more than five minutes',
 'tell me more about the  na nakapan  restaurant on martin luther king',
 'i like to go to a hamburger restaurant']

## Tokenisation and Frequency Counting

In [4]:
# Tokenise sentence by sentence, collecting everything into one flat stream
tokens = [
    token
    for sentence in cleaned_sentences
    for token in tokenizer.tokenize(sentence)
]

# Uni-gram frequencies
unigram_counts = Counter(tokens)

# Bi-gram frequencies.
# NOTE(review): the pairs are taken over the flattened stream, so the last
# token of one sentence and the first token of the next are counted as a
# bi-gram as well — confirm this is intended.
bigrams = list(nltk.bigrams(tokens))
bigram_counts = Counter(bigrams)

# Show the five most frequent entries of each table
print("Sample Uni-gram Counts:", unigram_counts.most_common(5))
print("Sample Bi-gram Counts:", bigram_counts.most_common(5))


Sample Uni-gram Counts: [('i', 2817), ('to', 2716), ('like', 1528), ('food', 1252), ('about', 1157)]
Sample Bi-gram Counts: [(('like', 'to'), 1186), (('i', 'want'), 911), (('to', 'eat'), 762), (('i', 'would'), 741), (('would', 'like'), 715)]


## Bi-gram Probabilities

In [5]:
# Relative-frequency estimate:
#   P(w_n | w_{n-1}) = Count(w_{n-1}, w_n) / Count(w_{n-1})
bigram_probabilities = {
    pair: pair_count / unigram_counts[pair[0]]
    for pair, pair_count in bigram_counts.items()
}

# Preview the first five entries (dict insertion order)
for bigram, prob in list(bigram_probabilities.items())[:5]:
    print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")


P(lets | okay) = 0.1193
P(see | lets) = 0.0473
P(i | see) = 0.0588
P(want | i) = 0.3234
P(to | want) = 0.6586


## Saving

In [6]:
# One row per bi-gram: (first word, second word, conditional probability)
rows = [(*pair, prob) for pair, prob in bigram_probabilities.items()]
df = pd.DataFrame(rows, columns=["First_Word", "Second_Word", "Probability"])

# Write the table out without the positional index column
df.to_csv("bigram_probabilities.csv", index=False)

print("Bi-gram probabilities saved to 'bigram_probabilities.csv'.")


Bi-gram probabilities saved to 'bigram_probabilities.csv'.


# Task 2

In [7]:
import numpy as np
def calculate_sentence_probability(sentence, bigram_probabilities, unseen_prob=1e-6, tokenize=None):
    """Score a sentence as the product of its bi-gram conditional probabilities.

    The sentence is lower-cased and stripped of punctuation before it is
    tokenised, mirroring the normalisation applied to the training text, so
    that e.g. a trailing full stop does not create a spurious unseen bi-gram.

    Parameters
    ----------
    sentence : str
        Raw input sentence.
    bigram_probabilities : dict
        Maps ``(previous_word, word)`` tuples to ``P(word | previous_word)``.
    unseen_prob : float, optional
        Probability substituted for any bi-gram missing from
        ``bigram_probabilities`` (default ``1e-6``).
    tokenize : callable, optional
        Function turning a string into a sequence of tokens. Defaults to the
        ``tokenize`` method of the notebook-level ``tokenizer``.

    Returns
    -------
    tuple
        ``(sentence_probability, bigram_probs)`` where ``bigram_probs`` is the
        list of per-bi-gram probabilities in sentence order and
        ``sentence_probability`` is their product. A sentence with fewer than
        two tokens has no bi-grams, so the product is the empty product 1.0.
    """
    if tokenize is None:
        # Fall back to the tokenizer instance created earlier in the notebook.
        tokenize = tokenizer.tokenize

    # Same punctuation rule as the training text: keep word chars and whitespace.
    normalised = re.sub(r"[^\w\s]", "", sentence.lower())
    tokens = list(tokenize(normalised))

    # Consecutive token pairs; bi-grams absent from the model get unseen_prob.
    bigram_probs = [
        bigram_probabilities.get(pair, unseen_prob)
        for pair in zip(tokens, tokens[1:])
    ]

    sentence_probability = np.prod(bigram_probs)

    return sentence_probability, bigram_probs


In [None]:
sentence_1 = "show me all the Arabic food restaurants"
sentence_2 = "I am learning mathematics"

# Score each example sentence with the bi-gram model and report it straight away
prob_1, bigram_probs_1 = calculate_sentence_probability(sentence_1, bigram_probabilities)
print(f"P({sentence_1}) = {prob_1}")

prob_2, bigram_probs_2 = calculate_sentence_probability(sentence_2, bigram_probabilities)
print(f"P({sentence_2}) = {prob_2}")

P(show me all the Arabic food restaurants) = 1.3310537523662384e-11
P(I am learning mathematics) = 9.939652112176074e-15
