In [7]:
import collections

# ingest and preprocess train/dev/test data
def ingest(filename, encoding='utf-8'):
    """Read a corpus file with one whitespace-tokenised sentence per line.

    Every line is lower-cased and split on whitespace. Lines that contain
    no tokens (empty or whitespace-only) are skipped.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        A ``(data, words)`` tuple. ``data`` is a list of sentences, each a
        list of lower-cased tokens. ``words`` is a ``collections.Counter``
        mapping every token to its number of occurrences in the file.
    """
    data = []
    words = collections.Counter()

    with open(filename, 'r', encoding=encoding) as file:
        for line in file:
            # split() without a separator drops leading/trailing whitespace
            # and never produces '' tokens, so a blank line yields [].
            sentence = line.lower().split()
            if not sentence:
                continue
            data.append(sentence)
            words.update(sentence)

    return data, words

def preprocess_train(training_file):
    """Load the training corpus and mask words that occur exactly once.

    Args:
        training_file: Path handed to ``ingest``.

    Returns:
        A ``(data, words)`` tuple. ``data`` is the list of sentences with
        every token whose count is 1 replaced by ``'<unk>'``. ``words`` is
        the counter returned by ``ingest``, unchanged: it still holds the
        masked words and has no ``'<unk>'`` entry.
    """
    sentences, counts = ingest(training_file)

    def mask_rare(token):
        # A count of exactly one marks the token as rare.
        if counts[token] == 1:
            return '<unk>'
        return token

    masked = []
    for sentence in sentences:
        masked.append([mask_rare(token) for token in sentence])

    return masked, counts

def preprocess_dev_test(dev_test_file, words):
    """Load a dev/test corpus and mask words outside the training vocabulary.

    Training replaces every word seen exactly once with ``'<unk>'``, so such
    a word never appears as a token in the processed training data. When
    ``words`` carries counts, those words are therefore treated as unknown
    here too, keeping the held-out data consistent with the training data.

    Args:
        dev_test_file: Path handed to ``ingest``.
        words: Training vocabulary. Either a mapping of word -> training
            count (e.g. the counter returned by ``preprocess_train``), in
            which case only words with a count above 1 are kept, or any
            other iterable of known words, which are all kept.

    Returns:
        The list of sentences with every out-of-vocabulary token replaced
        by ``'<unk>'``.
    """
    data, _ = ingest(dev_test_file)

    # Build the lookup set once so every membership test is O(1).
    if isinstance(words, dict):
        vocab = {word for word, count in words.items() if count > 1}
    else:
        vocab = set(words)

    return [[word if word in vocab else '<unk>' for word in sentence]
            for sentence in data]

# Process the training split first: its word counts are passed on to mask
# unknown words in the dev and test splits.
train, train_words = preprocess_train('data/brown.train.txt')
dev, test = (
    preprocess_dev_test(split_path, train_words)
    for split_path in ('data/brown.dev.txt', 'data/brown.test.txt')
)