In [2]:
from collections import Counter, defaultdict

# ingest and preprocess train/dev/test data
def ingest(filename):
    data = []  # Initialize list to store data
    words = Counter()  # Initialize counter to count word frequency

    with open(filename, 'r') as file:
        for line in file:
            # remove leading/trailing whitespaces, convert to lowercase, and wrap with <s> and </s> tokens
            sentence = f"<s> {line.lower()} </s>".split()
            data.append(sentence) # append sentence to data list as a list of words
            words.update(sentence) # update word frequency counter

    return data, words

def preprocess_train(training_file):
    data, words = ingest(training_file)
    # replace all words that appear only once with <unk> token
    data = [[word if words[word] > 1 else "<unk>" for word in sentence] for sentence in data]
    # update word frequency counter with <unk> token and remove words that appear only once
    words = Counter(word for sentence in data for word in sentence)

    return data, words

def preprocess_dev_test(dev_test_file, words):
    data, _ = ingest(dev_test_file)
    # replace all words that are not in training data with <unk> token
    data = [[word if word in words else "<unk>" for word in sentence] for sentence in data]

    return data

train, train_words = preprocess_train('data/brown.train.txt')
dev = preprocess_dev_test('data/brown.dev.txt',train_words)
test = preprocess_dev_test('data/brown.test.txt', train_words)

In [6]:
import pickle
import os

def save_model(model, filename):
    with open(filename, 'wb') as file:  # 'wb' denotes write-binary mode
        pickle.dump(model, file)
    print(f"Model saved to {filename}")

def load_model(filename):
    with open(filename, 'rb') as file:  # 'rb' denotes read-binary mode
        model = pickle.load(file)
    return model

In [7]:
# unigram maximum likelihood model
def unigram(data, word_counts):
    words = [word for sentence in data for word in sentence] # Flatten the list of sentences into a list of words
    total_words = len(words)    # Calculate the total number of words
    model = {word: count / total_words for word, count in word_counts.items()} # Calculate the probability of each word

    return model

if os.path.exists("unigram_model.pkl"):
    unigram_model = load_model("unigram_model.pkl")
else:
    unigram_model = unigram(train, train_words)
    save_model(unigram_model, "unigram_model.pkl")

Model saved to unigram_model.pkl


In [8]:
# bigram maximum likelihood model
def bigram(data):
    model = defaultdict(Counter)  # Initialize a default dictionary to store bigram counts

    for sentence in data:
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            model[w1][w2] += 1

    # Convert counts to probabilities
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count

    return model
    
if os.path.exists("bigram_model.pkl"):
    bigram_model = load_model("bigram_model.pkl")
else:
    bigram_model = bigram(train)
    save_model(bigram_model, "bigram_model.pkl")

In [6]:
def bigram_add1(data):
    model = defaultdict(Counter)  # Initialize a default dictionary to store bigram counts

    for sentence in data:
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            model[w1][w2] += 1

    # Collect all unique words in data to form the vocabulary
    vocabulary = set(word for sentence in data for word in sentence)
    vocab_size = len(vocabulary)  # Size of the vocabulary

    # Apply add-one smoothing
    for w1 in model.keys():
        total_count = sum(model[w1].values()) + vocab_size  # Adjust total count for smoothing
        for w2 in vocabulary:
            model[w1][w2] = (model[w1].get(w2, 0) + 1) / total_count

    return model

if os.path.exists("bigram_model_with_smoothing.pkl"):
    bigram_model_with_smoothing = load_model("bigram_model_with_smoothing.pkl")
else:
    bigram_model_with_smoothing = bigram_add1(train)
    save_model(bigram_model_with_smoothing, "bigram_model_with_smoothing.pkl")

MemoryError: 

In [None]:
def bigram_discount(data):
    model = defaultdict(Counter)
    total_unigrams = Counter()

    # Count bigrams and unigrams
    for sentence in data:
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            model[w1][w2] += 1
            total_unigrams[w1] += 1

    # Total count of all words in the dataset for backoff unigram probabilities
    total_words = sum(total_unigrams.values())

    # Apply discounting with a backoff to unigram probabilities
    bigram_discounted = defaultdict(dict)
    for w1 in model.keys():
        total_count_w1 = total_unigrams[w1]
        bigram_discounted[w1] = defaultdict(float)

        # Calculate bigram probabilities with discounting
        for w2 in model[w1]:
            bigram_discounted[w1][w2] = (model[w1][w2] - 0.5) / total_count_w1

        # Add backoff to unigrams if necessary
        sum_prob = sum(bigram_discounted[w1].values())
        if sum_prob < 1.0:
            # Distribute remaining probability based on unigram distribution
            remaining_prob = 1.0 - sum_prob
            for w2 in total_unigrams:
                bigram_discounted[w1][w2] += remaining_prob * (total_unigrams[w2] / total_words)

    return bigram_discounted

if os.path.exists("bigram_model_discounted.pkl"):
    bigram_model_discounted = load_model("bigram_model_discounted.pkl")
else:
    bigram_model_discounted = bigram_discount(train)
    save_model(bigram_model_discounted, "bigram_model_discounted.pkl")

In [None]:
def count_types_tokens(data):
    # Flatten list to count all word occurrences
    all_words = [word for sentence in data for word in sentence]
    word_types = set(all_words)
    word_tokens = len(all_words)

    return len(word_types), word_tokens

train_types, train_tokens = count_types_tokens(train)
print(f"Unique word types (including '<unk>', '<s>', '</s>'): {train_types}")
print(f"Total word tokens: {train_tokens}")

In [None]:
def unseen_percentage(train_data, test_data):
    train_words = set(word for sentence in train_data for word in sentence)
    test_words = [word for sentence in test_data for word in sentence]

    unseen_tokens = sum(1 for word in test_words if word not in train_words)
    unseen_types = len(set(test_words) - train_words)

    token_percentage = unseen_tokens / len(test_words) * 100
    type_percentage = unseen_types / len(set(test_words)) * 100

    return token_percentage, type_percentage

test_token_percentage, test_type_percentage = unseen_percentage(train, test)
print(f"Percentage of unseen tokens in test data: {test_token_percentage:.2f}%")
print(f"Percentage of unseen types in test data: {test_type_percentage:.2f}%")

In [None]:
import math

def log_probability(sentence, model):
    tokens = sentence.lower().split()
    log_prob = 0
    for i in range(len(tokens) - 1):
        pair = (tokens[i], tokens[i+1])
        if pair in model:
            prob = model[pair]
            log_prob += math.log(prob)
        else:
            log_prob += math.log(1e-10)  # Small constant to avoid log(0)

    return log_prob

sentence = "<s> he was laughed off the screen . </s>"
log_prob = log_probability(sentence, bigram_model_with_smoothing)
print(f"Log probability of the sentence: {log_prob}")

In [None]:
def perplexity(sentence, model):
    log_prob = log_probability(sentence, model)
    num_words = len(sentence.split()) - 1  # subtract one for the start symbol
    return math.exp(-log_prob / num_words)

sentence_perplexity = perplexity(sentence, bigram_model_with_smoothing)
print(f"Perplexity of the sentence: {sentence_perplexity}")