In [1]:
import re
import math
import numpy as np
import copy

# used for unseen words in training vocabularies
UNK = None
# sentence start and end
SENTENCE_START = "<s>"
SENTENCE_END = "</s>"

In [2]:
def read_sentences_from_file(file_path):
    '''
        read the files.
    '''
    with open(file_path, "r") as f:
        return [re.split("\s+", line.rstrip('\n')) for line in f]

In [3]:
class UnigramLanguageModel:
    def __init__(self, sentences, mode="collection", smoothing=False):

        '''
        if term not in vocab, just assign it as 'unknown', add it to vocab and normalize accordingly
            sentences: sentences of the dataset
            mode: whether this language model is for the whole corpus/collection or just a single document
            smoothing: add-one smoothing
        '''
        self.valid_modes = ['document', 'collection']
        self.smoothing = smoothing
        self.mode = None
        
        if mode not in self.valid_modes:
            print("Please use these modes:", self.valid_modes)
        else:
            self.mode = mode
            
        vocab = read_sentences_from_file('./train.vocab.txt')
        self.size_of_vocab = len(vocab)
        self.vocab = {term[0]: 0 for term in vocab}
        #self.vocab[UNK] = 0
        self.total_terms = 0
        
        for sentence in sentences:
            for term in sentence:
                if term == SENTENCE_START or term == SENTENCE_END:
                    continue
                else:
                    self.vocab[term] += 1
                    self.total_terms += 1
            
    def calculate_unigram_probability(self, word):
        '''
            calculate unigram probability of a word
        '''
        numerator = self.vocab.get(word, 0)
        denominator = self.total_terms
            
        if self.smoothing and numerator==0:
            numerator += 1
            denominator += (self.size_of_vocab + 1)
            
        if not self.smoothing and numerator==0:
            numerator += 1
            denominator += 1
            
        return numerator/denominator
    
    def calculate_sentence_probability(self, sentence, normalize_probability=True):
        '''
            calculate score/probability of a sentence or query using the unigram language model.
            sentence: input sentence or query
            normalize_probability: If true then log of probability is not computed. Otherwise take log2 of the probability score.
        '''
        if not self.mode:
            print('Wrong mode used')
            return
        
        result = 1
        for word in sentence:
            if word == SENTENCE_START or word == SENTENCE_END:
                continue
            result *= self.calculate_unigram_probability(word)
            
        if not normalize_probability:
            result = math.log2(result)
            
        return result

In [4]:
def calculate_interpolated_sentence_probability(sentence, doc, collection, alpha=0.75, normalize_probability=True):
    '''
        calculate interpolated sentence/query probability using both sentence and collection unigram models.
        sentence: input sentence/query
        doc: unigram language model a doc. HINT: this can be an instance of the UnigramLanguageModel class
        collection: unigram language model a collection. HINT: this can be an instance of the UnigramLanguageModel class
        alpha: the hyperparameter to combine the two probability scores coming from the document and collection language models.
        normalize_probability: If true then log of probability is not computed. Otherwise take log2 of the probability score.
    '''
    result = alpha*doc.calculate_sentence_probability(sentence, normalize_probability) + (1-alpha)*collection.calculate_sentence_probability(sentence, normalize_probability)
    
    return result

In [5]:
# first read the datasets
actual_dataset = read_sentences_from_file("./train.txt")
doc1_dataset = read_sentences_from_file("./doc1.txt")
doc2_dataset = read_sentences_from_file("./doc2.txt")
doc3_dataset = read_sentences_from_file("./doc3.txt")
actual_dataset_test = read_sentences_from_file("./test.txt")

'''
    Question: for each of the test queries given in test.txt, find out best matching document/doc
    according to their interpolated sentence probability.
    Optional: Extend the model to bigram language modeling.
'''
DOC_MODE = 'document'
COL_MODE = 'collection'

# with smoothing
doc1_model = UnigramLanguageModel(doc1_dataset, mode=DOC_MODE, smoothing=True)
doc2_model = UnigramLanguageModel(doc2_dataset, mode=DOC_MODE, smoothing=True)
doc3_model = UnigramLanguageModel(doc3_dataset, mode=DOC_MODE, smoothing=True)
models = [doc1_model, doc2_model, doc3_model]
collection = UnigramLanguageModel(actual_dataset, mode=COL_MODE, smoothing=True)

for query in actual_dataset_test:
    print('Query:', query)
    best_doc = None
    current_score = -np.inf
    for model in models:
        new_score = model.calculate_sentence_probability(query)
        print(new_score)
        if new_score > current_score:
            current_score = new_score
            best_doc = models.index(model)
    print('The best matching document is document', best_doc+1, 'with a probability of', current_score, '\n')

Query: ['<s>', 'the', 'website', 'and', 'monthly', 'newsletter', 'is', 'run', 'by', 'a', 'sub-committee', 'that', 'is', 'independent', 'to', 'the', 'parish', 'council', 'and', 'is', 'financed', 'through', 'selling', 'advertisement', 'space', 'to', 'local', 'businesses', '</s>']
1.4561353824759197e-81
2.0662569510312752e-75
2.0710835876900276e-74
The best matching document is document 3 with a probability of 2.0710835876900276e-74 

Query: ['<s>', 'uk', 'was', 'designed', 'and', 'built', 'by', 'chris', 'chambers', 'the', 'site', 'is', 'designed', 'in', 'a', 'way', 'that', 'when', 'content', 'is', 'added', 'to', 'the', 'site', 'no', 'previous', 'content', 'needs', 'to', 'be', 'edited', 'therefore', 'creating', 'a', 'archive', 'over', 'time', 'meaning', 'that', 'every', 'single', 'article', 'that', 'is', 'added', 'is', 'stored', 'and', 'never', 'deleted', 'allowing', 'users', 'to', 'search', 'and', 'read', 'articles', 'going', 'far', 'back', 'as', 'the', 'website', 'launch', 'date', '</s>

In [6]:
# no smoothing
doc1_model_nosmooth = UnigramLanguageModel(doc1_dataset, mode=DOC_MODE)
doc2_model_nosmooth = UnigramLanguageModel(doc2_dataset, mode=DOC_MODE)
doc3_model_nosmooth = UnigramLanguageModel(doc3_dataset, mode=DOC_MODE)
models_nosmooth = [doc1_model_nosmooth, doc2_model_nosmooth, doc3_model_nosmooth]
collection_nosmooth = UnigramLanguageModel(actual_dataset, mode=COL_MODE)

for query in actual_dataset_test:
    print('Query:', query)
    best_doc = None
    current_score = -np.inf
    for model in models_nosmooth:
        new_score = calculate_interpolated_sentence_probability(query, model, collection_nosmooth)
        print(new_score)
        if new_score > current_score:
            current_score = new_score
            best_doc = models_nosmooth.index(model)
    print('The best matching document is document', best_doc+1, 'with a probability of', current_score, '\n')

Query: ['<s>', 'the', 'website', 'and', 'monthly', 'newsletter', 'is', 'run', 'by', 'a', 'sub-committee', 'that', 'is', 'independent', 'to', 'the', 'parish', 'council', 'and', 'is', 'financed', 'through', 'selling', 'advertisement', 'space', 'to', 'local', 'businesses', '</s>']
9.921584418895225e-50
4.937726617351616e-53
2.851629716983247e-52
The best matching document is document 1 with a probability of 9.921584418895225e-50 

Query: ['<s>', 'uk', 'was', 'designed', 'and', 'built', 'by', 'chris', 'chambers', 'the', 'site', 'is', 'designed', 'in', 'a', 'way', 'that', 'when', 'content', 'is', 'added', 'to', 'the', 'site', 'no', 'previous', 'content', 'needs', 'to', 'be', 'edited', 'therefore', 'creating', 'a', 'archive', 'over', 'time', 'meaning', 'that', 'every', 'single', 'article', 'that', 'is', 'added', 'is', 'stored', 'and', 'never', 'deleted', 'allowing', 'users', 'to', 'search', 'and', 'read', 'articles', 'going', 'far', 'back', 'as', 'the', 'website', 'launch', 'date', '</s>']
4

4.743101317987167e-21
3.8175527820454026e-25
4.534620387233408e-25
The best matching document is document 1 with a probability of 4.743101317987167e-21 

Query: ['<s>', 'num', 'go', 'towards', 'lady', 'windsor', 'hospital', '</s>']
1.0754894965479627e-11
5.408428272995213e-14
4.333231590891463e-14
The best matching document is document 1 with a probability of 1.0754894965479627e-11 

Query: ['<s>', 'there', 'is', 'a', 'national', 'school', 'for', 'both', 'sexes', '</s>']
4.14926509528639e-16
1.8362454995847377e-17
2.0312029309687357e-17
The best matching document is document 1 with a probability of 4.14926509528639e-16 

Query: ['<s>', 'tendring', 'hall', 'is', 'the', 'principal', 'residence', '</s>']
2.722332788226139e-11
3.506493217023305e-12
3.488251430936962e-12
The best matching document is document 1 with a probability of 2.722332788226139e-11 

Query: ['<s>', 'the', 'birthstone', 'of', 'april', 'is', 'the', 'diamond', 'and', 'the', 'birth', 'flower', 'is', 'typically', 'listed',