In [7]:
import random
import re
from collections import defaultdict

# Step 1: Read text from "train.txt" file
with open('train.txt', 'r') as file:
    corpus = file.read()

# Preprocessing: Remove special characters except '.', ',', '!', and numbers
corpus = re.sub(r'[^a-zA-Z.,! ]', '', corpus)

# Step 2: Convert to lowercase and tokenize the text into words
corpus = corpus.lower()
words = corpus.split()

print(words)

# Step 3.1: Calculate unigram frequencies
word_freq = defaultdict(int)
for word in words:
    word_freq[word] += 1

# Step 3: Calculate bigram frequencies
bigram_freq = defaultdict(int)
for i in range(len(words) - 1):
    bigram = (words[i], words[i + 1])
    bigram_freq[bigram] += 1

print(bigram_freq)

# Step 4.1: Compute the probability distribution of words (unigram model)
total_words = len(words)
word_probs = {word: freq / total_words for word, freq in word_freq.items()}

# Step 4.2: Compute the probability distribution of bigrams (bigram model)
total_bigrams = len(words) - 1
bigram_probs = {(prev_word, word): freq / word_freq[prev_word] for (prev_word, word), freq in bigram_freq.items()}

#print(bigram_probs)

print("P(the|like) =", bigram_probs.get(("like", "the"), 0))
print("P(students|the) =", bigram_probs.get(("the", "students"), 0))

# Step 5.1: Generate random sentences using the unigram model
def generate_unigram_sentence():
    sentence = []
    while True:
        word = random.choices(list(word_probs.keys()), list(word_probs.values()))[0]
        if word == '.':
            if len(sentence) >= 2:
                break
            else:
                continue
        sentence.append(word)
    return ' '.join(sentence)

# Step 5.2: Generate random sentences using the bigram model
def generate_bigram_sentence():
    sentence = []
    while True:
        if not sentence:
            # Start with a random word as the first word
            word = random.choice(words)
            sentence.append(word)
        else:
            # Select the next word based on the previous word (bigram)
            prev_word = sentence[-1]
            next_word_candidates = [bigram[1] for bigram in bigram_probs if bigram[0] == prev_word]
            if not next_word_candidates:
                break  # If there are no valid next words, end the sentence
            word = random.choices(next_word_candidates, [bigram_probs[(prev_word, w)] for w in next_word_candidates])[0]
            sentence.append(word)

        if word == '.':
            break

    return ' '.join(sentence)

for _ in range(5):
    print(generate_unigram_sentence())
    print(generate_bigram_sentence())



P(the|like) = 0.09722222222222222
P(students|the) = 0
, walk saying for
different hotel is kind and couples or just what we had .
, large the the ! mile now got right and location my one was which complimentary , has at rooms a in no been
is the room .
for they the the of dripped day lead might ! had
as i love to remember being not happy to have been restored the staff was great hotel .
hotel , area
told me and service is comfortable , we had to the sign that it was one thing , they treated like for my tv reception in there again !in the cd purchased it will say about it would not impressed with clean and site either midway airport is a free breakfast in the noise will receive a nice .
was they to and spa matching wine when bare even really but of pushed never expect hotels delightful walking by absolutely or are , the good the
found it as bottled water possible gangrelated shooting took in this unpleasant experience with the beds and has a large but did nt think it was alright , chees