In [3]:
pip install spacy


Note: you may need to restart the kernel to use updated packages.


In [5]:
# Download the small English spaCy model that spacy.load("en_core_web_sm") needs.
# NOTE(review): "!python" runs whichever python is first on PATH, which may not
# be the interpreter backing this kernel -- confirm.
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     -------------- ------------------------- 4.7/12.8 MB 19.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 32.1 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [29]:
pip install nltk


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 4.8 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.6 MB/s eta 0:00:00
Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl (274 kB)
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: regex, joblib, nltk
Successfully installed joblib-1.4.2 nltk-3.9.1 regex-2024.11.6
Note: you may need to restart the kernel to use updated packages.


# Task 1: Estimating bigram probabilities from a transcript

In [3]:
import re
import spacy
from collections import defaultdict


# Load spaCy's small English pipeline; below it is used only to split the
# transcript into sentences.
nlp = spacy.load("en_core_web_sm")

# Read the whole transcript into memory.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm that matches how transcript.txt was saved.
with open("transcript.txt", "r") as transcript_file:
    text = transcript_file.read()

# Clean the data: drop [word]-style bracketed tags first, then <...> tags.
# NOTE(review): the printed output still contains a token that looks like an
# utterance id (e.g. "33_1_0001") -- confirm whether those should be removed too.
for tag_pattern in (r"\[\w+\]", r"<.*?>"):
    text = re.sub(tag_pattern, "", text)

# Segment the cleaned text into sentences and lowercase each one.
doc = nlp(text)
sentences = [span.text.lower() for span in doc.sents]

# bigram_counts[(w1, w2)]: how often w2 directly follows w1 inside a sentence.
# word_counts[w1]: how often w1 occurs as the FIRST word of a bigram, so the
# last word of each sentence is not counted here.
bigram_counts = defaultdict(int)
word_counts = defaultdict(int)

for sentence_text in sentences:
    words = sentence_text.split()
    # Pair every whitespace-separated word with its right-hand neighbour.
    for first_word, second_word in zip(words, words[1:]):
        bigram_counts[(first_word, second_word)] += 1
        word_counts[first_word] += 1

# P(w2 | w1) = count(w1, w2) / count(w1 as the first word of any bigram)
bigram_probs = {}
for bigram, count in bigram_counts.items():
    bigram_probs[bigram] = count / word_counts[bigram[0]]

# Show the first ten bigrams in first-seen order.
first_ten = list(bigram_probs.items())[:10]
for bigram, prob in first_ten:
    print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")


P(okay | 33_1_0001) = 1.0000
P(let's | okay) = 0.1189
P(see | let's) = 0.0446
P(i | see) = 0.0595
P(want | i) = 0.3230
P(to | want) = 0.6534
P(go | to) = 0.1355
P(to | go) = 0.3505
P(a | to) = 0.0284
P(thai | a) = 0.0228


# Task 2: Calculating the probabilities of given sentences

In [11]:
import re
from collections import Counter

# Toy training corpus: the unigram and bigram counts in this cell are computed
# from these four sentences only.
corpus = [
    "Show me all the Arabic food restaurants",
    "I am learning mathematics",
    "I like Arabic food",
    "Find Arabic restaurants near me"
]


def preprocess(text):
    """Lowercase ``text``, strip punctuation, and split it into word tokens.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is deleted, so ``"let's"`` becomes ``"lets"``.

    Args:
        text: The sentence to normalise.

    Returns:
        The list of whitespace-separated tokens; empty for blank input.
    """
    return re.sub(r"[^\w\s]", "", text.lower()).split()

# Tokenize every corpus sentence with the shared preprocessing step.
tokenized_sentences = list(map(preprocess, corpus))

# Collect each adjacent word pair, sentence by sentence (pairs never span two
# sentences).
bigram_list = [
    pair
    for tokens in tokenized_sentences
    for pair in zip(tokens, tokens[1:])
]

# Count bigram occurrences and occurrences of every individual word.
bigram_counts = Counter(bigram_list)
unigram_counts = Counter(
    token for tokens in tokenized_sentences for token in tokens
)

# P(w2 | w1) = count(w1, w2) / count(w1)
bigram_probs = {
    (first, second): pair_count / unigram_counts[first]
    for (first, second), pair_count in bigram_counts.items()
}

def sentence_probability(sentence, bigram_probs):
    """Score ``sentence`` as the product of its bigram probabilities.

    The sentence is cleaned and tokenized with ``preprocess``; each adjacent
    token pair is then looked up in ``bigram_probs``.

    Args:
        sentence: Raw sentence text.
        bigram_probs: Mapping from ``(w1, w2)`` tuples to ``P(w2 | w1)``.

    Returns:
        The product of the looked-up probabilities. A pair that is missing
        from ``bigram_probs`` contributes 0, which zeroes the whole product.
        A sentence with fewer than two tokens has no pairs and yields 1.0.
    """
    tokens = preprocess(sentence)
    probability = 1.0  # neutral element for the running product

    for pair in zip(tokens, tokens[1:]):
        probability *= bigram_probs.get(pair, 0)

    return probability

# Sentences whose bigram-model probability should be reported.
sentences_to_check = [
    "show me all the Arabic food restaurants",
    "I am learning mathematics",
]

print("\n Sentence Probabilities:")

# Lazily pair each sentence with its score so scoring and printing stay
# interleaved, one sentence at a time.
scored_sentences = (
    (candidate, sentence_probability(candidate, bigram_probs))
    for candidate in sentences_to_check
)
for sentence, prob in scored_sentences:
    print(f"P(\"{sentence}\") = {prob:.10f}")





 Sentence Probabilities:
P("show me all the Arabic food restaurants") = 0.1666666667
P("I am learning mathematics") = 0.5000000000
