In [1]:
from collections import Counter, defaultdict

# ingest and preprocess train/dev/test data
def ingest(filename):
    """Read a one-sentence-per-line text file into tokenized sentences.

    Every line is lowercased, wrapped in ``<s>`` / ``</s>`` boundary markers
    and split on whitespace, so the trailing newline and any surrounding
    padding are discarded. A blank line therefore still produces the
    two-token sentence ``['<s>', '</s>']``.

    Args:
        filename: Path of the text file to read. It is opened in text mode
            with the platform's default encoding.

    Returns:
        A ``(data, words)`` tuple where ``data`` is a list of token lists
        (one per line, in file order) and ``words`` is a ``Counter`` of token
        frequencies over the whole file, boundary markers included.
    """
    with open(filename, 'r') as handle:
        # str.split() without arguments collapses runs of whitespace and
        # drops the newline, so no explicit strip() is needed.
        data = [f"<s> {raw_line.lower()} </s>".split() for raw_line in handle]

    # Tally tokens sentence by sentence, in file order.
    words = Counter()
    for tokens in data:
        words.update(tokens)

    return data, words

def preprocess_train(training_file):
    """Load the training corpus and collapse singleton words into ``<unk>``.

    The file is read with ``ingest``; every token that occurs exactly once in
    the whole file is then replaced by ``<unk>``. The ``<s>`` and ``</s>``
    boundary markers are never replaced, even when the corpus has a single
    line and each marker therefore occurs only once.

    Args:
        training_file: Path of the training text file, passed to ``ingest``.

    Returns:
        A ``(data, words)`` tuple: ``data`` is the list of token lists after
        replacement, and ``words`` is a ``Counter`` recomputed over those
        replaced sentences (so it counts ``<unk>`` and no longer contains the
        replaced singleton words).
    """
    data, words = ingest(training_file)

    # Boundary markers are structural, not vocabulary: without this exemption
    # a one-line corpus would rewrite its own <s>/</s> markers to <unk>.
    boundary_tokens = ("<s>", "</s>")

    # Replace all words that appear only once with the <unk> token.
    data = [
        [word if words[word] > 1 or word in boundary_tokens else "<unk>" for word in sentence]
        for sentence in data
    ]

    # Recount so the returned vocabulary reflects the <unk> substitution.
    words = Counter(word for sentence in data for word in sentence)

    return data, words

def preprocess_dev_test(dev_test_file, words):
    """Load a dev/test corpus and map out-of-vocabulary tokens to ``<unk>``.

    Args:
        dev_test_file: Path of the text file, passed to ``ingest``.
        words: Container supporting ``in`` that holds the known vocabulary
            (the notebook passes the ``Counter`` returned by
            ``preprocess_train``).

    Returns:
        A list of token lists in which every token not found in ``words`` has
        been replaced by ``<unk>``.
    """
    # The frequency counter that ingest also returns is not needed here.
    sentences, _ = ingest(dev_test_file)

    mapped = []
    for sentence in sentences:
        mapped.append([token if token in words else "<unk>" for token in sentence])

    return mapped

# Build the training split first: its vocabulary decides which dev/test tokens become <unk>.
train, train_words = preprocess_train(training_file='data/brown.train.txt')
# Dev and test are both mapped against the training vocabulary.
dev = preprocess_dev_test(dev_test_file='data/brown.dev.txt', words=train_words)
test = preprocess_dev_test(dev_test_file='data/brown.test.txt', words=train_words)

In [4]:
import pickle
import os

def save_model(model, filename):
    """Pickle ``model`` to ``filename`` and report the destination on stdout.

    Args:
        model: Any picklable object.
        filename: Path of the file to write; an existing file is overwritten.

    Returns:
        None.
    """
    # Pickle streams are bytes, so the file must be opened in binary mode.
    with open(filename, mode='wb') as sink:
        pickle.dump(model, sink)

    print(f"Model saved to {filename}")

def load_model(filename):
    """Load and return an object previously pickled to ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) wrote.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # Binary mode mirrors how save_model writes the file.
    with open(filename, mode='rb') as source:
        return pickle.load(source)

In [5]:
# bigram maximum likelihood model
def bigram(data):
    """Estimate an unsmoothed (maximum-likelihood) bigram model.

    Args:
        data: Iterable of sentences, each a sliceable sequence of tokens.

    Returns:
        A ``defaultdict(Counter)`` mapping each context word ``w1`` to a
        ``Counter`` of ``{w2: P(w2 | w1)}``, where the probability is the
        count of the pair divided by the number of bigrams starting with
        ``w1``. Only observed pairs are stored; because of the container
        types, looking up anything else yields 0 (and looking up an unseen
        context inserts an empty ``Counter`` for it).
    """
    model = defaultdict(Counter)

    # Pass 1: raw counts of adjacent token pairs. zip stops at the shorter
    # argument, so the last token is never used as a context.
    for sentence in data:
        for previous, current in zip(sentence, sentence[1:]):
            model[previous][current] += 1

    # Pass 2: normalise each context's counts so they sum to one.
    for followers in model.values():
        context_total = float(sum(followers.values()))
        for word in followers:
            followers[word] /= context_total

    return model
    
# Reuse the pickled model when one exists; otherwise train it once and cache it.
# The cache is keyed only by file name, so delete the file to force retraining.
if not os.path.exists("bigram_model.pkl"):
    bigram_model = bigram(train)
    save_model(bigram_model, "bigram_model.pkl")
else:
    bigram_model = load_model("bigram_model.pkl")

Model saved to bigram_model.pkl


In [None]:
def bigram_add1(data):
    """Estimate a bigram model with add-one (Laplace) smoothing.

    For every context ``w1`` that is followed by at least one token in
    ``data`` and every vocabulary word ``w2``::

        P(w2 | w1) = (count(w1, w2) + 1) / (count(w1, *) + V)

    where the vocabulary is every distinct token in ``data`` and ``V`` is its
    size.

    Args:
        data: Collection of sentences, each a sliceable sequence of tokens.
            It is traversed twice (once for counts, once for the vocabulary).

    Returns:
        A ``defaultdict(Counter)`` mapping each context to a ``Counter`` with
        one probability per vocabulary word, so the result holds
        (number of contexts x vocabulary size) entries.
    """
    model = defaultdict(Counter)

    # Raw counts of adjacent token pairs; zip stops at the shorter argument,
    # so the last token of a sentence is never used as a context.
    for sentence in data:
        for previous, current in zip(sentence, sentence[1:]):
            model[previous][current] += 1

    # The vocabulary is every distinct token seen anywhere in the data.
    vocabulary = {token for sentence in data for token in sentence}
    vocab_size = len(vocabulary)

    for followers in model.values():
        # The denominator must be taken from the raw counts, before any entry
        # of this row is overwritten with a probability.
        denominator = sum(followers.values()) + vocab_size
        for candidate in vocabulary:
            followers[candidate] = (followers.get(candidate, 0) + 1) / denominator

    return model

# Reuse the pickled smoothed model when one exists; otherwise train it once and cache it.
# The cache is keyed only by file name, so delete the file to force retraining.
if not os.path.exists("bigram_model_with_smoothing.pkl"):
    bigram_model_with_smoothing = bigram_add1(train)
    save_model(bigram_model_with_smoothing, "bigram_model_with_smoothing.pkl")
else:
    bigram_model_with_smoothing = load_model("bigram_model_with_smoothing.pkl")