In [1]:
from collections import Counter, defaultdict

# ingest and preprocess train/dev/test data
def ingest(filename):
    data = []  # Initialize list to store data
    words = Counter()  # Initialize counter to count word frequency

    with open(filename, 'r') as file:
        for line in file:
            # remove leading/trailing whitespaces, convert to lowercase, and wrap with <s> and </s> tokens
            sentence = f"<s> {line.lower()} </s>".split()
            data.append(sentence) # append sentence to data list as a list of words
            words.update(sentence) # update word frequency counter

    return data, words

def preprocess_train(training_file):
    data, words = ingest(training_file)
    # replace all words that appear only once with <unk> token
    data = [[word if words[word] > 1 else "<unk>" for word in sentence] for sentence in data]
    # update word frequency counter with <unk> token and remove words that appear only once
    words = Counter(word for sentence in data for word in sentence)

    return data, words

def preprocess_dev_test(dev_test_file, words):
    data, _ = ingest(dev_test_file)
    # replace all words that are not in training data with <unk> token
    data = [[word if word in words else "<unk>" for word in sentence] for sentence in data]

    return data

train, train_words = preprocess_train('data/brown.train.txt')
dev = preprocess_dev_test('data/brown.dev.txt',train_words)
test = preprocess_dev_test('data/brown.test.txt', train_words)

In [2]:
import pickle
import os

def save_model(model, filename):
    with open(filename, 'wb') as file:  # 'wb' denotes write-binary mode
        pickle.dump(model, file)
    print(f"Model saved to {filename}")

def load_model(filename):
    with open(filename, 'rb') as file:  # 'rb' denotes read-binary mode
        model = pickle.load(file)
    return model

In [3]:
# unigram maximum likelihood model
def unigram(data, word_counts):
    words = [word for sentence in data for word in sentence] # Flatten the list of sentences into a list of words
    total_words = len(words)    # Calculate the total number of words
    model = {word: count / total_words for word, count in word_counts.items()} # Calculate the probability of each word

    return model

if os.path.exists("unigram_model.pkl"):
    unigram_model = load_model("unigram_model.pkl")
else:
    unigram_model = unigram(train, train_words)
    save_model(unigram_model, "unigram_model.pkl")

In [4]:
# bigram maximum likelihood model
def bigram(data):
    model = defaultdict(Counter)  # Initialize a default dictionary to store bigram counts

    for sentence in data:
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            model[w1][w2] += 1

    # Convert counts to probabilities
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count

    return model
    
if os.path.exists("bigram_model.pkl"):
    bigram_model = load_model("bigram_model.pkl")
else:
    bigram_model = bigram(train)
    save_model(bigram_model, "bigram_model.pkl")

In [5]:
def bigram_add1(data, unigram_model):
    V = len(unigram_model)  # Vocabulary size
    model = defaultdict(Counter)  # Initialize a default dictionary to store bigram probabilities

    for sentence in data:
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            model[w1][w2] += 1

    # Apply add-one smoothing
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] = (model[w1][w2] + 1) / (total_count + V)

    return model

if os.path.exists("bigram_model_with_smoothing.pkl"):
    bigram_model_with_smoothing = load_model("bigram_model_with_smoothing.pkl")
else:
    bigram_model_with_smoothing = bigram_add1(train, unigram_model)
    save_model(bigram_model_with_smoothing, "bigram_model_with_smoothing.pkl")

In [6]:
def bigram_katz_backoff_discounting(data, unigram_model, add1_bigram_model, discount=0.5):
    model = defaultdict(Counter)  # Initialize a default dictionary to store bigram counts

    for sentence in data:
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            if w1 in add1_bigram_model and w2 in add1_bigram_model[w1]:
                # If the bigram is present in the add1_bigram_model, use it
                model[w1][w2] = add1_bigram_model[w1][w2]
            else:
                # If not, back off to unigram probabilities
                model[w1][w2] = unigram_model[w2]

    # Apply Katz backoff and discounting
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        unseen_count = len(unigram_model) - len(model[w1])  # Number of unseen bigrams
        for w2 in model[w1]:
            model[w1][w2] = (max(model[w1][w2] - discount, 0) / total_count) + (discount * (unseen_count / total_count))

    return model

if os.path.exists("katz_backoff_model.pkl"):
    katz_backoff_model = load_model("katz_backoff_model.pkl")
else:
    katz_backoff_model = bigram_katz_backoff_discounting(train, unigram_model, bigram_model_with_smoothing)
    save_model(katz_backoff_model, "katz_backoff_model.pkl")

In [7]:
print(bigram_model == bigram_model_with_smoothing)
print(bigram_model == katz_backoff_model)
print(bigram_model_with_smoothing == katz_backoff_model)

False
False
False


In [8]:
def count_types_tokens(data):
    # Flatten list to count all word occurrences
    all_words = [word for sentence in data for word in sentence]
    word_types = set(all_words)
    word_tokens = len(all_words)

    return len(word_types), word_tokens

train_types, train_tokens = count_types_tokens(train)
print(f"Unique word types (including '<unk>', '<s>', '</s>'): {train_types}")
print(f"Total word tokens: {train_tokens}")

Unique word types (including '<unk>', '<s>', '</s>'): 24796
Total word tokens: 1018784


In [9]:
def unseen_percentage(train_data, test_data):
    # Flatten the lists of words in both training and test data
    train_words = set(word for sentence in train_data for word in sentence)
    test_words = [word for sentence in test_data for word in sentence]

    # Count occurrences of "<unk>" in the test data
    unk_count = sum(1 for word in test_words if word == "<unk>")

    # Calculate the percentage of unseen words
    total_train_words = sum(len(sentence) for sentence in train_data)
    percentage_unseen_words = (unk_count / total_train_words) * 100

    # Count the number of unseen types
    unseen_types = set(word for word in test_words if word not in train_words)
    total_train_types = len(set(word for sentence in train_data for word in sentence))
    percentage_unseen_types = (len(unseen_types) / total_train_types) * 100

    return percentage_unseen_words, percentage_unseen_types

# Example usage:
percentage_unseen_words, percentage_unseen_types = unseen_percentage(train, test)
print(f"Percentage of unseen words in test data: {percentage_unseen_words:.2f}%")
print(f"Percentage of unseen types in test data: {percentage_unseen_types:.2f}%")

Percentage of unseen words in test data: 0.40%
Percentage of unseen types in test data: 0.00%


In [10]:
import math

def log_probability(sentence, model):
    tokens = sentence.lower().split()
    log_prob = 0
    for i in range(len(tokens) - 1):
        pair = (tokens[i], tokens[i+1])
        if isinstance(model, dict):
            # For simple bigram model
            if pair in model:
                prob = model[pair]
                log_prob += math.log(prob)
            else:
                log_prob += math.log(1e-10)  # Small constant to avoid log(0)
        elif isinstance(model, defaultdict):
            # For bigram models with smoothing or Katz backoff
            w1, w2 = pair
            if w1 in model and w2 in model[w1]:
                prob = model[w1][w2]
                log_prob += math.log(prob)
            else:
                # Handle unseen bigrams
                log_prob += math.log(1e-10)  # Small constant to avoid log(0)
        else:
            raise ValueError("Unsupported model type")

    return log_prob


sentence = "<s> he was laughed off the screen . </s>"
log_prob = log_probability(sentence, bigram_model)
print(f"Log probability of the sentence: {log_prob}")

log_prob = log_probability(sentence, bigram_model_with_smoothing)
print(f"Log probability of the sentence: {log_prob}")

log_prob = log_probability(sentence, katz_backoff_model)
print(f"Log probability of the sentence: {log_prob}")

Log probability of the sentence: -184.20680743952363
Log probability of the sentence: -184.20680743952363
Log probability of the sentence: -184.20680743952363


In [11]:
def perplexity(sentence, model):
    log_prob = log_probability(sentence, model)
    num_words = len(sentence.split()) - 1  # subtract one for the start symbol
    return math.exp(-log_prob / num_words)

sentence_perplexity = perplexity(sentence, bigram_model)
print(f"Perplexity of the sentence: {sentence_perplexity}")

sentence_perplexity = perplexity(sentence, bigram_model_with_smoothing)
print(f"Perplexity of the sentence: {sentence_perplexity}")

sentence_perplexity = perplexity(sentence, katz_backoff_model)
print(f"Perplexity of the sentence: {sentence_perplexity}")

Perplexity of the sentence: 9999999999.999968
Perplexity of the sentence: 9999999999.999968
Perplexity of the sentence: 9999999999.999968
