# Čtvrté cvičení - vyhlazování jazykového modelu
- Vytvořte bigramový jazykový model vyhlazený metodou Witten-Bell pro soubor TEXTEN1
- Soubor přeuspořádejte tak, aby každá věta byla na jednom řádku (jako oddělovač vět použijte tečku, otazník a vykřičník).
- Všechny slova převeďte na lowercase
- Uvažujte na začátku věty startovací symbol <s>

In [21]:
import re, string
from collections import Counter

def text_to_lines(text):
    text_lined = []
    line = ["<s>"]
    for word in text:
        word_stripped = word.strip()
        if word_stripped in (".", "?", "!"):
            text_lined.append(line)
            line = ["<s>"]
        elif word_stripped not in string.punctuation:
            line.append(word_stripped.translate(str.maketrans('', '', string.punctuation)).lower())
    return text_lined

text_en = []
with open('TEXTEN1.txt', 'r', encoding='UTF-8') as file_en:
    text_en = file_en.readlines()

text_en_lined = text_to_lines(text_en)

pairs_counts = Counter()
words_counts = Counter()

for words in text_en_lined:
    words_counts.update(words)
    pairs = [f"{words[i]} {words[i+1]}" for i in range(len(words)-1)]
    pairs_counts.update(pairs)

vocab_size = len(words_counts)

en_bigram_model = {}
for word, total_word_occurrences in words_counts.items():
    possible_next_words = [pair.split()[1] for pair in pairs_counts if pair.startswith(word)]
    total_different_pairs = len(set(possible_next_words))
    total_unseen_pairs = vocab_size - total_different_pairs

    for next_word in set(possible_next_words):
        bigram_str = f"{word} {next_word}"
        bigram_count = pairs_counts[bigram_str]

        # Check if the current bigram count is greater than 0
        if bigram_count > 0:
            smoothed_count = bigram_count / (total_word_occurrences + total_different_pairs)
        else:
            smoothed_count = total_different_pairs / (total_word_occurrences + total_different_pairs)

        en_bigram_model[bigram_str] = smoothed_count

print(en_bigram_model["of the"])
# 0.258387066629287
print(en_bigram_model["of of"])
# 1.6608937523330386e-05

0.25493269408076713
0.0


In [None]:
import re, string
from collections import Counter
text_en = []
with open('TEXTEN1.txt', 'r', encoding='UTF-8') as file_en:
    while line := file_en.readline():
        text_en.append(line)
        
def text_to_lines(text):
    text_lined = []
    line = ["<s>"]
    for word in text:
        if word.strip() in ".?!":
            text_lined.append(line)
            line = ["<s>"]
            continue
        if word.strip() in string.punctuation:
            continue
        line.append(word.strip().translate(str.maketrans('', '', string.punctuation)).lower())
    return text_lined

text_en_lined = text_to_lines(text_en)

pairs_counts = Counter()
words_counts = Counter()

for words in text_en_lined:
    words_counts.update(Counter(words))
    pairs = [f'{words[i]} {words[i+1]}' for i in range(len(words)-1)]
    pairs_counts.update(Counter(pairs))
words_count = sum(words_counts.values())
pairs_count = sum(pairs_counts.values())

vocab_size = len(words_counts)

en_bigram_model = {}
for i, word in enumerate(list(words_counts.keys())[0:-1]):
    for j, word_2 in enumerate(list(words_counts.keys())[0:-1]):
        bigram_str = f"{word} {word_2}"
        if bigram_str in pairs_counts.keys():
            en_bigram_model[bigram_str] = pairs_counts[bigram_str]
        else:
            en_bigram_model[bigram_str] = 0
        bigram_str = f"{word_2} {word}"
        if bigram_str in pairs_counts.keys():
            en_bigram_model[bigram_str] = pairs_counts[bigram_str]
        else:
            en_bigram_model[bigram_str] = 0

for word in words_counts.keys():
    total_word_occurrences = words_counts[word]
    possible_next_words = [pair.split()[1] for pair in en_bigram_model if pair.startswith(word)]
    unseen_count = vocab_size - len(set(possible_next_words))
    lambda_factor = unseen_count / (unseen_count + total_word_occurrences)

    for next_word in set(possible_next_words):
        bigram_str = f"{word} {next_word}"
        smoothed_count = lambda_factor * (1/vocab_size)
        if bigram_str in pairs_counts:
            smoothed_count += (1 - lambda_factor) * pairs_counts[bigram_str] / total_word_occurrences
        en_bigram_model[bigram_str] = smoothed_count

print(en_bigram_model["of the"])
#0.258387066629287
print(en_bigram_model["of of"])
#1.6608937523330386e-05