In [4]:
# imports go here
import collections
import random
import sys

def read_sentences(filepath):
    """
    Reads contents of a file line by line.
    Parameters:
      filepath (str): file to read from
    Return:
      list of strings
    """
    f = open(filepath, "r")
    sentences = f.readlines()
    f.close()
    return sentences

In [2]:
def get_data_by_character(filepath):
    """
    Reads contents of a script file line by line and sorts into 
    buckets based on speaker name.
    Parameters:
      filepath (str): file to read from
    Return:
      dict of strings to list of strings, the dialogue that speaker speaks
    """
    char_data = {}
    script_file = open(filepath, "r", encoding="utf-8")
    for line in script_file:
        # extract the part between <speaker> tags
        speakers = line[line.index(
            "<speakers>") + len("<speakers>"): line.index("</speakers>")].strip()
        if not speakers in char_data:
            char_data[speakers] = []
        char_data[speakers].append(line)
    return char_data


In [302]:
class LanguageModel:
    # constants to define pseudo-word tokens
    # access via self.UNK, for instance
    UNK = "<UNK>"

    def __init__(self, n_gram, is_laplace_smoothing, line_begin="<line>", line_end="</line>"):
        """Initializes an untrained LanguageModel
        Parameters:
          n_gram (int): the n-gram order of the language model to create
          is_laplace_smoothing (bool): whether or not to use Laplace smoothing
          line_begin (str): the token designating the beginning of a line
          line_end (str): the token designating the end of a line
        """
        self.line_begin = line_begin
        self.line_end = line_end
        # your other code here
        self.n_gram = n_gram
        self.is_laplace_smoothing = is_laplace_smoothing
        self.ngram_counts = {}
        self.n_minus_one_counts = {}  # REMOVE: or None?
        self.vocab = set()
        self.model = None
        self.tokenizer = lambda x: x.split()
        self.total_count = {}

    def train(self, sentences):
        """Trains the language model on the given data. Assumes that the given data
        has tokens that are white-space separated, has one sentence per line, and
        that the sentences begin with line_begin and end with line_end
        Parameters:
          sentences (list): list of strings, one string per line in the training file

        Returns:
        None
        """

        ngram_minus_one_list = []
        ngram_list = []
        all_tokens = []

        for sentence in sentences:
            tokens = self.tokenizer(sentence)
            all_tokens += tokens
            count = collections.Counter(tokens)
            count_dict = dict(count)

            n_minus_1_gram = self.n_gram - 1
            for i in range(len(tokens) - n_minus_1_gram + 1):
                # get the tokens from current to next
                tokens_list_minus_one = tokens[i:i+n_minus_1_gram]
                n_minus_one_gram_word = " ".join(tokens_list_minus_one)
                ngram_minus_one_list.append(n_minus_one_gram_word)

            for i in range(len(tokens) - self.n_gram + 1):
                # get the tokens from current to next
                tokens_list = tokens[i:i+self.n_gram]
                ngram_word = " ".join(tokens_list)
                ngram_list.append(ngram_word)

            n_minus_one_num = dict(collections.Counter(ngram_minus_one_list))
            ngram_num = dict(collections.Counter(ngram_list))

        # for k in count_dict:
        #     if count_dict[k] == 1 and k != self.line_begin and k != self.line_end:
        #         indx = tokens.index(k)
        #         tokens[indx] = self.UNK

        print('this is tokens')
        print(tokens)

        print('this is n-1')
        print(n_minus_one_num)
        # going through the dictionary and removing the tokens that only appear once
        # get all keys that appear once
        num_iter = 0
        remove_list = []
        for k,v in n_minus_one_num.items():
            if v == 1 and k != self.line_begin and k != self.line_end:
                remove_list.append(k)
                # remove k from ngram_num
                # n_minus_one_num.pop(k)
                num_iter+=1
                # add UNK to ngram_num
        n_minus_one_num[self.UNK] = num_iter
        for k in remove_list:
            n_minus_one_num.pop(k)
    
        
        # for k in n_minus_one_num:
        #     print(k)
        #     if n_minus_one_num[k] == 1 and k != self.line_begin and k != self.line_end:
        #         # remove k from ngram_num
        #         n_minus_one_num.pop(k)
        #         # add UNK to ngram_num
        #         n_minus_one_num[self.UNK] = 1

        print('this is n')

        num_iter_ngram = 0
        remove_list_ngram = []
        print(ngram_num)
        for k,v in n_minus_one_num.items():
            print(k)
            if v == 1 and k != self.line_begin and k != self.line_end:
                print(f'tokens{tokens}')
                remove_list_ngram.append(k)
                # indx = tokens.index(k)

                # remove k from ngram_num
                # ngram_num.pop(k)
                # add UNK to ngram_num
                num_iter_ngram+=1
        ngram_num[self.UNK] = num_iter_ngram
        for k in remove_list_ngram:
            ngram_num.pop(k)

        print('updated ngram')
        print(ngram_num)
        self.n_minus_one_counts = n_minus_one_num
        self.ngram_counts = ngram_num
        self.total_count = count_dict

        # the vocab is the set of all tokens
        print(f'--------- all tokens: {all_tokens}---------')
        self.vocab = all_tokens

        print(self.ngram_counts)

    def score(self, sentence):
        """Calculates the probability score for a given string representing a single sentence.
        Parameters:
          sentence (str): a sentence with tokens separated by whitespace to calculate the score of

        Returns:
          float: the probability value of the given string for this model
        """

        score = 1
        n_minus_one_count = self.n_minus_one_counts
        ngram_count = self.ngram_counts
        tokens = sentence.split()
        total_words = len(self.vocab)
        vocab_size = len(self.total_count)

        if self.n_gram == 1:
            for word in tokens:

                if word not in ngram_count:
                    word = self.UNK
                    
                if self.is_laplace_smoothing:
                    prob = (ngram_count[word] + 1) / \
                        (total_words + vocab_size)
                else:
                    prob = ngram_count[word] / total_words
                score = score * prob
            return score

        # if self.n_gram == 2:
        else:
            print('in 2')
            for i in range(0, len(tokens)-1):
                tokens_list = tokens[i: i+self.n_gram]
                tokens_list_word = " ".join(tokens_list)

                for token in tokens_list:
                    if token not in ngram_count:
                        token = self.UNK
                        
                if tokens_list_word not in ngram_count:
                    ngram_count[tokens_list_word] = self.UNK
                    
                if tokens_list_word not in n_minus_one_count:
                    n_minus_one_count[tokens_list_word] = self.UNK

                if self.is_laplace_smoothing:
                    if (tokens_list_word) not in ngram_count:  # list of things
                        print('here')
                        
                        print(f'token list {tokens_list_word}')
                        print(f'ngrammm {ngram_count}')
                        print(f'numerator: {ngram_count[tokens_list_word] +1}')
                        print(f'denominator: {n_minus_one_count[tokens_list_word]+vocab_size}')
                        # calculate the probability of the ngram
                        prob = (ngram_count[tokens_list_word] + 1) / \
                            (n_minus_one_count[tokens_list_word] +
                             vocab_size)
                    else:
                        print(f'numerator: {ngram_count[tokens_list_word]}')
                        print(f'denominator: {n_minus_one_count[tokens_list_word]+vocab_size}')
                        prob = (ngram_count[tokens_list_word] + 1) / \
                            (n_minus_one_count[tokens_list_word] +
                             vocab_size)  # ngram + 1/ngram_mius_1+ 1+ vocab size
                else:
                    if (tokens_list_word) not in ngram_count:
                        prob = 0
                    else:
                        prob = ngram_count[tokens_list_word] / \
                            n_minus_one_count[tokens_list_word]
                score = score * prob
            return score

In [384]:
class LanguageModel:
    # constants to define pseudo-word tokens
    # access via self.UNK, for instance
    UNK = "<UNK>"

    def __init__(self, n_gram, is_laplace_smoothing, line_begin="<line>", line_end="</line>"):
        """Initializes an untrained LanguageModel
        Parameters:
          n_gram (int): the n-gram order of the language model to create
          is_laplace_smoothing (bool): whether or not to use Laplace smoothing
          line_begin (str): the token designating the beginning of a line
          line_end (str): the token designating the end of a line
        """
        self.line_begin = line_begin
        self.line_end = line_end
        # your other code here
        self.n_gram = n_gram
        self.is_laplace_smoothing = is_laplace_smoothing
        self.ngram_counts = {}
        self.n_minus_one_counts = {}  # REMOVE: or None?
        self.vocab = set()
        self.model = None
        # tokenize without nltk
        self.tokenizer = lambda x: x.split()

    def train(self, sentences):
        """Trains the language model on the given data. Assumes that the given data
        has tokens that are white-space separated, has one sentence per line, and
        that the sentences begin with line_begin and end with line_end
        Parameters:
          sentences (list): list of strings, one string per line in the training file

        Returns:
        None
        """

        ngram_minus_one_list = []
        ngram_list = []
        all_tokens = []

        for sentence in sentences:
            tokens = self.tokenizer(sentence)
            all_tokens += tokens
            count = collections.Counter(tokens)
            count_dict = dict(count)

            n_minus_1_gram = self.n_gram - 1
            for i in range(len(tokens) - n_minus_1_gram + 1):
                # get the tokens from current to next
                tokens_list_minus_one = tokens[i:i+n_minus_1_gram]
                # print('ooooooooooooooo')
                # print(f'tokens minus one {tokens_list_minus_one}')
                n_minus_one_gram_word = " ".join(tokens_list_minus_one)
                ngram_minus_one_list.append(n_minus_one_gram_word)

            for i in range(len(tokens) - self.n_gram + 1):
                # get the tokens from current to next
                tokens_list = tokens[i:i+self.n_gram]
                ngram_word = " ".join(tokens_list)
                ngram_list.append(ngram_word)

            n_minus_one_num = dict(collections.Counter(ngram_minus_one_list))
            ngram_num = dict(collections.Counter(ngram_list))

        # for k in count_dict:
        #     if count_dict[k] == 1 and k != self.line_begin and k != self.line_end:
        #         indx = tokens.index(k)
        #         tokens[indx] = self.UNK

        # print('this is tokens')
        # print(tokens)

        print('this is n-1')
        print(n_minus_one_num)
        num_keys = 0
        for k in list(n_minus_one_num.keys()):
            if n_minus_one_num[k] == 1 and k != self.line_begin and k != self.line_end:
                # remove k from ngram_num
                n_minus_one_num.pop(k)
                # add UNK to ngram_num
                num_keys += 1
        n_minus_one_num[self.UNK] = num_keys

        print('this is n')

        num_ngram_keys = 0
        print(ngram_num)
        for k in list(ngram_num.keys()):
            if ngram_num[k] == 1 and k != self.line_begin and k != self.line_end:
                print(f'tokens{tokens}')
                # indx = tokens.index(k)

                # remove k from ngram_num
                ngram_num.pop(k)
                # add UNK to ngram_num
                num_ngram_keys += 1
        ngram_num[self.UNK] = num_ngram_keys

        print('updated ngram')
        print(ngram_num)
        self.n_minus_one_counts = n_minus_one_num
        self.ngram_counts = ngram_num
        self.total_count = count_dict

        # the vocab is the set of all tokens
        print(f'--------- all tokens: {all_tokens}---------')
        self.vocab = all_tokens

        print(self.ngram_counts)

    def score(self, sentence):
        """Calculates the probability score for a given string representing a single sentence.
        Parameters:
          sentence (str): a sentence with tokens separated by whitespace to calculate the score of

        Returns:
          float: the probability value of the given string for this model
        """

        score = 1
        n_minus_one_count = self.n_minus_one_counts
        ngram_count = self.ngram_counts
        tokens = sentence.split()
        total_words = len(self.vocab)
        vocab_size = len(self.total_count)

        print(f' n minus one count {n_minus_one_count}')
        print('+++++++++++++++++++++++++')
        if self.n_gram == 1:
            for word in tokens:

                if word not in ngram_count:
                    word = self.UNK

                if self.is_laplace_smoothing:
                    prob = (ngram_count[word] + 1) / \
                        (total_words + vocab_size)
                else:
                    prob = ngram_count[word] / total_words
                score = score * prob
            return score

        # if self.n_gram == 2:
        else:
            print('in 2')
            for i in range(0, len(tokens)-1):
                tokens_list = tokens[i: i+self.n_gram]
                tokens_list_word = " ".join(tokens_list)
                tokens_list_new = tokens[i: i+self.n_gram-1][0]
                print(f'tokens lisT NEW: {tokens_list_new}')

                for token in tokens_list:
                    if token not in ngram_count:
                        token = self.UNK
                
                if tokens_list_word not in ngram_count:
                    tokens_list_word = self.UNK
                    
                if tokens_list_word not in n_minus_one_count:
                    tokens_list_word = self.UNK

                if self.is_laplace_smoothing:
                    print('in laplace')
                    if (tokens_list_word) not in ngram_count:  # list of things
                        # calculate the probability of the ngram
                        prob = (ngram_count[tokens_list_word] + 1) / \
                            (n_minus_one_count[tokens_list_word] +
                             vocab_size)
                    # if the ngram is not in the ngram count
                    else:
                        print('laplace else')
                        print(f'numerator: {ngram_count[tokens_list_word]}')
                        print(f'denomanator: {n_minus_one_count[tokens_list_word] + vocab_size}')
                        print(ngram_count[tokens_list_word])
                        print(vocab_size)
                        prob = (ngram_count[tokens_list_word] + 1) / \
                            (n_minus_one_count[tokens_list_new] +
                             vocab_size)  # ngram + 1/ngram_mius_1+ 1+ vocab size
                        print(f' n minus one count {n_minus_one_count}')
                        print(f' n count {ngram_count}')
                        print('+++++++++++++++++++++++++')  
                else:
                    if (tokens_list_word) not in ngram_count:
                        print('not laplace')
                        prob = 0
                    else:
                        print('not laplace else')
                        prob = ngram_count[tokens_list_word] / \
                            n_minus_one_count[tokens_list_word]
                score = score * prob
            return score

In [368]:
class LanguageModel:
    # constants to define pseudo-word tokens
    # access via self.UNK, for instance
    UNK = "<UNK>"

    def __init__(self, n_gram, is_laplace_smoothing, line_begin="<line>", line_end="</line>"):
        """Initializes an untrained LanguageModel
        Parameters:
          n_gram (int): the n-gram order of the language model to create
          is_laplace_smoothing (bool): whether or not to use Laplace smoothing
          line_begin (str): the token designating the beginning of a line
          line_end (str): the token designating the end of a line
        """
        self.line_begin = line_begin
        self.line_end = line_end
        # your other code here
        self.n_gram = n_gram
        self.is_laplace_smoothing = is_laplace_smoothing
        self.ngram_counts = {}
        self.n_minus_one_counts = {}  # REMOVE: or None?
        self.vocab = set()
        self.model = None
        # tokenize without nltk
        self.tokenizer = lambda x: x.split()
    def train(self, sentences):
        """Trains the language model on the given data. Assumes that the given data
        has tokens that are white-space separated, has one sentence per line, and
        that the sentences begin with line_begin and end with line_end
        Parameters:
            sentences (list): list of strings, one string per line in the training file

        Returns:
        None
        """

        ngram_minus_one_list = []
        ngram_list = []
        all_tokens = []

        for sentence in sentences:
            tokens = self.tokenizer(sentence)
            all_tokens += tokens
            count = collections.Counter(tokens)
            count_dict = dict(count)

            n_minus_1_gram = self.n_gram - 1
            for i in range(len(tokens) - n_minus_1_gram + 1):
                # get the tokens from current to next
                tokens_list_minus_one = tokens[i:i+n_minus_1_gram]
                n_minus_one_gram_word = " ".join(tokens_list_minus_one)
                ngram_minus_one_list.append(n_minus_one_gram_word)

            for i in range(len(tokens) - self.n_gram + 1):
                # get the tokens from current to next
                tokens_list = tokens[i:i+self.n_gram]
                ngram_word = " ".join(tokens_list)
                ngram_list.append(ngram_word)

            n_minus_one_num = dict(collections.Counter(ngram_minus_one_list))
            ngram_num = dict(collections.Counter(ngram_list))

        # for k in count_dict:
        #     if count_dict[k] == 1 and k != self.line_begin and k != self.line_end:
        #         indx = tokens.index(k)
        #         tokens[indx] = self.UNK

        print('this is tokens')
        print(tokens)

        print('this is n-1')
        print(n_minus_one_num)
        for k in list(n_minus_one_num.keys()):
            if n_minus_one_num[k] == 1 and k != self.line_begin and k != self.line_end:
                # remove k from ngram_num
                n_minus_one_num.pop(k)
                # add UNK to ngram_num
                n_minus_one_num[self.UNK] = 1

        print('this is n')

        print(ngram_num)
        for k in list(ngram_num.keys()):
            if ngram_num[k] == 1 and k != self.line_begin and k != self.line_end:
                print(f'tokens{tokens}')
                # indx = tokens.index(k)

                # remove k from ngram_num
                ngram_num.pop(k)
                # add UNK to ngram_num
                ngram_num[self.UNK] = 1

        print('updated ngram')
        print(ngram_num)
        self.n_minus_one_counts = n_minus_one_num
        self.ngram_counts = ngram_num
        self.total_count = count_dict

        # the vocab is the set of all tokens
        print(f'--------- all tokens: {all_tokens}---------')
        self.vocab = all_tokens

        print(self.ngram_counts)

    def score(self, sentence):
        """Calculates the probability score for a given string representing a single sentence.
        Parameters:
            sentence (str): a sentence with tokens separated by whitespace to calculate the score of

        Returns:
            float: the probability value of the given string for this model
        """

        score = 1
        n_minus_one_count = self.n_minus_one_counts
        ngram_count = self.ngram_counts
        tokens = sentence.split()
        total_words = len(self.vocab)
        vocab_size = len(self.total_count)

        if self.n_gram == 1:
            for word in tokens:

                if word not in ngram_count:
                    word = self.UNK

                if self.is_laplace_smoothing:
                    prob = (ngram_count[word] + 1) / \
                        (total_words + vocab_size)
                else:
                    prob = ngram_count[word] / total_words
                score = score * prob
            return score

        # if self.n_gram == 2:
        else:
            print('in 2')
            for i in range(0, len(tokens)-1):
                tokens_list = tokens[i: i+self.n_gram]
                tokens_list_word = " ".join(tokens_list)

                # for token in tokens_list:
                #     if token not in ngram_count:
                #         token = self.UNK

                if tokens_list_word not in ngram_count:
                    tokens_list_word = self.UNK

                # if tokens_list_word not in n_minus_one_count:
                #     tokens_list_word = self.UNK

                if self.is_laplace_smoothing:
                    if (tokens_list_word) not in ngram_count:  # list of things
                        # calculate the probability of the ngram
                        prob = (ngram_count[tokens_list_word] + 1) / \
                            (n_minus_one_count[tokens_list_word] +
                                vocab_size)
                    else:
                        prob = (ngram_count[tokens_list_word] + 1) / \
                            (n_minus_one_count[tokens_list_word] +
                                vocab_size)  # ngram + 1/ngram_mius_1+ 1+ vocab size
                else:
                    if (tokens_list_word) not in ngram_count:
                        prob = 0
                    else:
                        prob = ngram_count[tokens_list_word] / \
                            n_minus_one_count[tokens_list_word]
                score = score * prob
            return score

In [385]:
lm = LanguageModel(2, True, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/unknowns_mixed.txt")
lm.train(sentences)
# (0 + 1) / (2 + 6)
lm.score("<s> flamingo")

this is n-1
{'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2, 'today': 1}
this is n
{'<s> i': 1, 'i am': 2, 'am sam': 1, 'sam </s>': 1, '<s> sam': 1, 'sam i': 1, 'am today': 1, 'today </s>': 1}
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
updated ngram
{'i am': 2, '<UNK>': 7}
--------- all tokens: ['<s>', 'i', 'am', 'sam', '</s>', '<s>', 'sam', 'i', 'am', 'today', '</s>']---------
{'i am': 2, '<UNK>': 7}
 n minus one count {'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2, '<UNK>': 1}
+++++++++++++++++++++++++
in 2
tokens lisT NEW: <s>
in laplace
laplace else
numerator: 7
denomanator: 7
7
6
 n minus one count {'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2, '<UNK>': 1}
 n count {'i am': 2, '<UNK>

1.0

In [388]:
lm = LanguageModel(2, True, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/unknowns_mixed.txt")
lm.train(sentences)
# (0 + 1) / (2 + 6)
lm.score("<s> flamingo")

this is n-1
{'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2, 'today': 1}
this is n
{'<s> i': 1, 'i am': 2, 'am sam': 1, 'sam </s>': 1, '<s> sam': 1, 'sam i': 1, 'am today': 1, 'today </s>': 1}
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
tokens['<s>', 'sam', 'i', 'am', 'today', '</s>']
updated ngram
{'i am': 2, '<UNK>': 7}
--------- all tokens: ['<s>', 'i', 'am', 'sam', '</s>', '<s>', 'sam', 'i', 'am', 'today', '</s>']---------
{'i am': 2, '<UNK>': 7}
 n minus one count {'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2, '<UNK>': 1}
+++++++++++++++++++++++++
in 2
tokens lisT NEW: <s>
in laplace
laplace else
numerator: 7
denomanator: 7
7
6
 n minus one count {'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2, '<UNK>': 1}
 n count {'i am': 2, '<UNK>

1.0

In [73]:
def train(self, sentences):
        """Trains the language model on the given data. Assumes that the given data
        has tokens that are white-space separated, has one sentence per line, and
        that the sentences begin with line_begin and end with line_end
        Parameters:
          sentences (list): list of strings, one string per line in the training file

        Returns:
        None
        """

        for sentence in sentences:
            tokens = self.tokenizer(sentence)
            ngram_list = []
            for i in range(0, len(tokens) - self.n_gram):
                # get the tokens from current to next
                tokens_list = tokens[i:i+self.n_gram]
                ngram_word = sentence = ' '.join(tokens_list)
                ngram_list.append(ngram_word)
                

            unigram_num = collections.Counter(dict(tokens))
            ngram_num = collections.Counter(dict(ngram_list))

            self.unigram_counts = unigram_num
            self.ngram_counts = ngram_num

In [3]:
def score(self, sentence):
        """Calculates the probability score for a given string representing a single sentence.
        Parameters:
          sentence (str): a sentence with tokens separated by whitespace to calculate the score of

        Returns:
          float: the probability value of the given string for this model
        """

        probability_so_far = 1
        word_probabilities = self.unigram_counts
        bigram_probabilities = self.ngram_counts
        tokens = sentence.split()
        types = len(word_probabilities)

        for i in range(0, len(tokens)-1):
            tokens_list = tokens[i+self.n_gram, i]
            current_word = tokens[i+self.n_gram]
            prev_word = tokens[i]

            if current_word not in word_probabilities:
                current_word = self.UNK
            if prev_word not in word_probabilities:
                prev_word = self.UNK
            if self.is_laplace_smoothing:
                if (tokens_list) not in bigram_probabilities:
                    prob = 1/(word_probabilities[prev_word] + types)
                else:
                    prob = (bigram_probabilities[tokens_list] + 1) / (
                        word_probabilities[prev_word] + types)
            else:
                if (tokens_list) not in bigram_probabilities:
                    prob = 0
                else:
                    prob = bigram_probabilities[tokens_list] / \
                        word_probabilities[prev_word]
            probability_so_far = probability_so_far * prob
        return probability_so_far

In [79]:
sentences = read_sentences("training_files/iamsam.txt")
LanguageModel(3, True, line_begin="<s>", line_end="</s>").train(sentences)

In [63]:
tokens

['<s>', 'sam', 'i', 'am', '</s>']

In [64]:
tokens_list

['am', '</s>']

In [65]:
ngram_list

[['<s>', 'sam', 'i'], ['sam', 'i', 'am'], ['i', 'am', '</s>'], ['am', '</s>']]

In [66]:
for sentence in sentences:
    tokens = sentence.split()

    ngram_list = []
    for i in range(0, len(tokens) - 1):
        # get the tokens from current to next
        tokens_list = tokens[i:i+3]
        ngram_list.append(tokens_list)

    unigram_num = dict(collections.Counter(tokens))
unigram_num

{'<s>': 1, 'sam': 1, 'i': 1, 'am': 1, '</s>': 1}

In [1]:
import numpy as np

class LanguageModel:
    # constants to define pseudo-word tokens
    # access via self.UNK, for instance
    UNK = "<UNK>"

    def __init__(self, n_gram, is_laplace_smoothing, line_begin="<line>", line_end="</line>"):
        """Initializes an untrained LanguageModel
        Parameters:
          n_gram (int): the n-gram order of the language model to create
          is_laplace_smoothing (bool): whether or not to use Laplace smoothing
          line_begin (str): the token designating the beginning of a line
          line_end (str): the token designating the end of a line
        """
        self.line_begin = line_begin
        self.line_end = line_end
        # your other code here
        self.n_gram = n_gram
        self.is_laplace_smoothing = is_laplace_smoothing
        self.ngram_counts = {}
        self.n_minus_one_counts = {}  # REMOVE: or None?
        self.vocab = set()
        self.model = None
        # tokenize without nltk
        self.tokenizer = lambda x: x.split()

    def train(self, sentences):
        """Trains the language model on the given data. Assumes that the given data
        has tokens that are white-space separated, has one sentence per line, and
        that the sentences begin with line_begin and end with line_end
        Parameters:
          sentences (list): list of strings, one string per line in the training file

        Returns:
        None
        """

        ngram_minus_one_list = []
        ngram_list = []
        all_tokens = []
        

        for sentence in sentences:
            tokens = self.tokenizer(sentence)
            all_tokens += tokens
            

            n_minus_1_gram = self.n_gram - 1
            for i in range(len(tokens) - n_minus_1_gram + 1):
                # get the tokens from current to next
                tokens_list_minus_one = tokens[i:i+n_minus_1_gram]
                n_minus_one_gram_word = " ".join(tokens_list_minus_one)
                ngram_minus_one_list.append(n_minus_one_gram_word)

            for i in range(len(tokens) - self.n_gram + 1):
                # get the tokens from current to next
                tokens_list = tokens[i:i+self.n_gram]
                ngram_word = " ".join(tokens_list)
                ngram_list.append(ngram_word)

            n_minus_one_num = dict(collections.Counter(ngram_minus_one_list))
            # print(f'n-1 num: `{n_minus_one_num}`')
            ngram_num = dict(collections.Counter(ngram_list))
            # print(f'n num: `{ngram_num}`')

        n_1_num_unk = 0
        
        for k in list(n_minus_one_num.keys()):
            if n_minus_one_num[k] == 1 and k != self.line_begin and k != self.line_end:
                # remove k from ngram_num
                n_minus_one_num.pop(k)
                # add UNK to ngram_num
                n_1_num_unk += 1
            n_minus_one_num[self.UNK] = n_1_num_unk

        num_unk = 0
        for k in list(ngram_num.keys()):
            if ngram_num[k] == 1 and k != self.line_begin and k != self.line_end:
                # remove k from ngram_num
                ngram_num.pop(k)
                # add UNK to ngram_num
                num_unk += 1
            ngram_num[self.UNK] = num_unk
        
        if ngram_num[self.UNK] == 0:
            ngram_num.pop(self.UNK)
            
        if n_minus_one_num[self.UNK] == 0:
            n_minus_one_num.pop(self.UNK)

        count = collections.Counter(tokens)
        count_dict = dict(count)
        self.n_minus_one_counts = n_minus_one_num
        self.ngram_counts = ngram_num
        self.total_count = count_dict

        # the vocab is the set of all tokens
        self.vocab = all_tokens

    def score(self, sentence):
        """Calculates the probability score for a given string representing a single sentence.
        Parameters:
          sentence (str): a sentence with tokens separated by whitespace to calculate the score of

        Returns:
          float: the probability value of the given string for this model
        """

        score = 1
        n_minus_one_count = self.n_minus_one_counts
        ngram_count = self.ngram_counts
        tokens = sentence.split()
        total_words = sum(self.ngram_counts.values())
        vocab_size = len(self.n_minus_one_counts)
        # print(f'total count is : {self.total_count}')
        print(f'ngram count is : {self.ngram_counts}')
        
        if self.n_gram == 1:
            # vocab_size = len(self.vocab)
            vocab_size = len(self.ngram_counts)
            
            for word in tokens:

                if word not in ngram_count:
                    word = self.UNK

                if self.is_laplace_smoothing:
                    # print(f'numerator: {ngram_count[word] + 1}')
                    # print(f'denominator: {total_words + vocab_size}')
                    # print(f'total word: {total_words}')
                    # print(f'vocab size: {vocab_size}')
                    
                    prob = (ngram_count[word] + 1) / \
                        (total_words + vocab_size)
                else:
                    # print(f'numerator: {ngram_count[word]}')
                    # print(f'deno: {total_words}')
                    prob = ngram_count[word] / total_words
                score = score * prob
            return score

        else:
            print('in 2')
            for i in range(0, len(tokens)-1):
                tokens_list = tokens[i: i+self.n_gram]
                tokens_list_new = tokens[i: i+self.n_gram-1][0]
                tokens_list_word = " ".join(tokens_list)

                # if tokens_list_word not in ngram_count:
                #     tokens_list_word = self.UNK

                if self.is_laplace_smoothing:
                    if (tokens_list_word) not in ngram_count:
                        # calculate the probability of the ngram
                        # print('in laplace if')
                        # print(f'numerator: {ngram_count.get(tokens_list_word, 0) + 1}')
                        # print(f'denominator: {n_minus_one_count.get(tokens_list_word, 0) +vocab_size}')
                        # print(f'nminusonecount tokenlistword: {n_minus_one_count.get(tokens_list_word, 0)}')
                        # print(f'vocab size: {vocab_size}')
                        prob = (ngram_count.get(tokens_list_word, 0) + 1) / \
                            (n_minus_one_count.get(tokens_list_new, 0) +
                             vocab_size)
                        print(f'the probability is: {prob}')
                    else:
                        print('in laplace else')
                        # print(f'numerator: {ngram_count[tokens_list_word] + 1}')
                        # print(f'denominator: {n_minus_one_count[tokens_list_new] + vocab_size}')
                        # print(f'{n_minus_one_count[tokens_list_new] } n-1 token list')
                        # print(f'vocab size: {vocab_size}')
                        # print(f'ngram count: {ngram_count}')
                        # print(f'ngram_minus_1 count: {n_minus_one_count}')
                        # this works if I remove the +1 -- but i need to keep it because of laplace smoothing
                        prob = (ngram_count.get(tokens_list_word, 0) + 1) / \
                            (n_minus_one_count.get(tokens_list_new, 0) +
                             vocab_size)
                    print(prob)
                    # score = score * prob
                else:
                    if (tokens_list_word) not in ngram_count:
                        prob = 0
                    else:
                        print(f'{ n_minus_one_count.get(tokens_list_new)} : ')
                        
                        print('in else')
                        prob = ngram_count[tokens_list_word] / \
                            n_minus_one_count[tokens_list_new]
                score *= prob       
                
                # print(score)
            return score
        
    def generate_sentence(self):
        """Generates a single sentence from a trained language model using the Shannon technique.

        Returns:
          str: the generated sentence
        """


        # check if begin is in ngram_dict
        begin = self.line_begin
        if begin in ngram_dict:
        
            if self.n_gram == 1:
                
                # getting the probability list
                unigram_dict = self.ngram_counts
                # getting the begining value
                begin = self.line_begin
                # setting the current token to begin
                current_token = begin
                sentence = begin
                
                # storing the value to add back later
                beg_val = unigram_dict[begin]
                # remove line_begin from unigram_dict
                unigram_dict.pop(self.line_begin)
                
                # create a probability list
                prob_list = []

                # calculate the probability of each unigram
                for unigram in unigram_dict:
                    prob = self.score(unigram)
                    # storing it in the list
                    prob_list.append(prob)
                
                # converting the dictionary to a list to use in the random function
                unigram_list = list(unigram_dict.keys())
                
                # sampling the unigram list based on the probability list without the begining token
                while current_token != self.line_end:
                    # using np.random.choice as it is much faster than random.choices
                    current_token = np.random.choice(unigram_list, p=prob_list)
                    # adding the randomly picked current token to the sentence
                    sentence = sentence + " " + current_token
                
                # adding back the value of line_begin to the unigram_dict 
                # so it is not mutated for future calls            
                unigram_dict[self.line_begin] = beg_val
                return sentence

            else:
                # getting the probability list
                # n_minus_1_gram_dict = self.n_minus_one_counts
                ngram_dict = self.ngram_counts

                # getting the begining value and setting it as the starting value
                begin = self.line_begin
                
                sentence = begin

                current_token = self.line_begin
            
                beg_val = ngram_dict[begin]
                # remove line_begin from unigram_dict
                ngram_dict.pop(self.line_begin)
                
                # creating a probability list
                prob_list = []
                
                # calculating the probability of each n minus 1 gram
                for n_gram in ngram_dict:
                    prob = self.score(n_gram)
                    prob_list.append(prob)
                    
                # converting the dictionary to a list to use in the random function
                n_gram_list = list(ngram_dict.keys())
                
                # creating a counter for line end tokens
                line_end_tks = 0 
                
                while line_end_tks != (self.n_gram - 1):
                    current_token = np.random.choice(n_gram_list, p=prob_list)
                    if current_token == self.line_end:
                        line_end_tks += 1
                    sentence = sentence + " " + current_token
                    
                
                # adding back the value of line_begin to the unigram_dict 
                # so it is not mutated for future calls   
                ngram_dict[self.line_begin] = beg_val
                
                # tacking on the begin tokens to the beginning of the sentence
                for i in range(0, self.n_gram - 1):
                    line_begins = self.line_begin
                    sentence = line_begins + " " + sentence
                
                return sentence
        else:
            if self.n_gram == 1:
                sentence = '<s> </s>'
            else:
                line_begins = self.line_begin
                line_end = self.line_end
                sentence = line_begins * (self.n_gram - 1) + " " + line_end * (self.n_gram - 1)

    def generate(self, n):
        """Generates n sentences from a trained language model using the Shannon technique.
        Parameters:
            n (int): the number of sentences to generate

        Returns:
            list: a list containing strings, one per generated sentence
        """
        sentences = []

        for i in range(0, n):
            new_sentence = self.generate_sentence()
            print(f'thus the new sentence is: {new_sentence}')
            sentences.append(new_sentence)
        print(f'thus the senteces are: {sentences}')

        return sentences

In [5]:
lm = LanguageModel(1, True, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/unknowns.txt")
lm.train(sentences)

# sentences should only contain unk tokens
sents = lm.generate(5)

UnboundLocalError: local variable 'ngram_dict' referenced before assignment

# and then for bigrams
lm = LanguageModel(2, True, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/unknowns.txt")
lm.train(sentences)
score('UNK')

# sentences should only contain unk tokens
sents = lm.generate(5)

In [554]:
lm = LanguageModel(1, True, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/unknowns.txt")
lm.train(sentences)

# sentences should only contain unk tokens
# sents = lm.generate(5)
# print(sents)
# for sent in sents:
#     words = sent.split()
#     if len(words) > 2:
#         for word in words[1:-1]:
#             word.upper()

# # probability of unk should be v high
# score = lm.score("porcupine")
# # (6 + 1) / (10 + 3)
# score

# # and then for bigrams
# lm = LanguageModel(2, True, line_begin="<s>", line_end="</s>")
# sentences = read_sentences("training_files/unknowns.txt")
# lm.train(sentences)

# # sentences should only contain unk tokens
# sents = lm.generate(5)
# for sent in sents:
#     words = sent.split()
#     if len(words) > 2:
#         for word in words[1:-1]:
#             word.upper()

# # probability of unk should be v high
# score = lm.score("porcupine wombat")
# # (4 + 1) / (6 + 3)
# score

total count is : {'<s>': 1, 'goose': 1, 'or': 1, 'moose': 1, '</s>': 1}
in 2
in laplace if
numerator: 1
denominator: 3
nminusonecount tokenlistword: 0
vocab size: 3
the probability is: 0.3333333333333333
0.3333333333333333


0.3333333333333333

In [None]:
 python test_minitrainingprovided.py 
$ python lm_friends.py 1 training_files/friends_train.txt training_files/friends_test.txt line 
$ python lm_friends.py 2 training_files/friends_train.txt training_files/friends_test.txt line 
$ python lm_friends.py 1 training_files/friends_train.txt training_files/friends_test.txt line character

In [546]:
a = ((2 + 1) / (4 + 6)) * ((4 + 1) / (4 + 6)) * ((2 + 1) / (4 + 6))
a == b

True

In [488]:
lm.ngram_counts

{'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2, '<UNK>': 1}

In [486]:
lm = LanguageModel(1, True, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/iamsam2.txt")
lm.train(sentences)
sents = lm.generate(2)
len(sents)
print(sents)

n-1 num: `{'': 6}`
n num: `{'<s>': 1, 'i': 1, 'am': 1, 'sam': 1, '</s>': 1}`
n-1 num: `{'': 12}`
n num: `{'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2}`
n-1 num: `{'': 18}`
n num: `{'<s>': 3, 'i': 3, 'am': 3, 'sam': 2, '</s>': 3, 'ham': 1}`
n-1 num: `{'': 24}`
n num: `{'<s>': 4, 'i': 4, 'am': 4, 'sam': 2, '</s>': 4, 'ham': 2}`
total count is : {'<s>': 1, 'ham': 1, 'i': 1, 'am': 1, '</s>': 1}
numerator: 3
denominator: 25
total count is : {'<s>': 1, 'ham': 1, 'i': 1, 'am': 1, '</s>': 1}
numerator: 5
denominator: 25
total count is : {'<s>': 1, 'ham': 1, 'i': 1, 'am': 1, '</s>': 1}
numerator: 3
denominator: 25
total count is : {'<s>': 1, 'ham': 1, 'i': 1, 'am': 1, '</s>': 1}
numerator: 5
denominator: 25
total count is : {'<s>': 1, 'ham': 1, 'i': 1, 'am': 1, '</s>': 1}
numerator: 5
denominator: 25
total count is : {'<s>': 1, 'ham': 1, 'i': 1, 'am': 1, '</s>': 1}
numerator: 5
denominator: 25
['ham', 'i', 'sam', '<s>', '</s>', 'am']
[0.12, 0.2, 0.12, 0.2, 0.2, 0.2]


ValueError: probabilities do not sum to 1

In [469]:
lm = LanguageModel(2, True, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/iamsam2.txt")
lm.train(sentences)
# (2 + 1) / (4 + 6)
lm.score("<s> i")
# ((2 + 1) / (4 + 6)) * ((4 + 1) / (4 + 6)) * ((2 + 1) / (4 + 6))
lm.score("<s> i am </s>")

n-1 num: `{'<s>': 1, 'i': 1, 'am': 1, 'sam': 1, '</s>': 1}`
n num: `{'<s> i': 1, 'i am': 1, 'am sam': 1, 'sam </s>': 1}`
n-1 num: `{'<s>': 2, 'i': 2, 'am': 2, 'sam': 2, '</s>': 2}`
n num: `{'<s> i': 1, 'i am': 2, 'am sam': 1, 'sam </s>': 1, '<s> sam': 1, 'sam i': 1, 'am </s>': 1}`
n-1 num: `{'<s>': 3, 'i': 3, 'am': 3, 'sam': 2, '</s>': 3, 'ham': 1}`
n num: `{'<s> i': 2, 'i am': 3, 'am sam': 1, 'sam </s>': 1, '<s> sam': 1, 'sam i': 1, 'am </s>': 1, 'am ham': 1, 'ham </s>': 1}`
n-1 num: `{'<s>': 4, 'i': 4, 'am': 4, 'sam': 2, '</s>': 4, 'ham': 2}`
n num: `{'<s> i': 2, 'i am': 4, 'am sam': 1, 'sam </s>': 1, '<s> sam': 1, 'sam i': 1, 'am </s>': 2, 'am ham': 1, 'ham </s>': 1, '<s> ham': 1, 'ham i': 1}`
total count is : {'<s>': 1, 'ham': 1, 'i': 1, 'am': 1, '</s>': 1}
in 2
in laplace else
numerator: 3
denominator: 10
4 n-1 token list
vocab size: 6
ngram count: {'<s> i': 2, 'i am': 4, 'am </s>': 2, '<UNK>': 8}
ngram_minus_1 count: {'<s>': 4, 'i': 4, 'am': 4, 'sam': 2, '</s>': 4, 'ham': 2}
tota

0.004629629629629629

In [402]:
lm = LanguageModel(2, False, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/unknowns_mixed.txt")
lm.train(sentences)
# ((0) / (2))
lm.score("<s> flamingo")

in 2
in else


1.0

In [424]:
lm = LanguageModel(2, True, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/unknowns_mixed.txt")
lm.train(sentences)
# (0 + 1) / (2 + 6)
lm.score("<s> flamingo")

in 2
in laplace if
numerator: 1
denominator: 6


0.125

In [431]:
lm = LanguageModel(2, False, line_begin="<s>", line_end="</s>")
sentences = read_sentences("training_files/iamsam2.txt")
lm.train(sentences)
# (2) / (4)
lm.score("<s> i")
# (2 / 4) * (4 / 4) * (2 / 4)
lm.score("<s> i am </s>")

in 2
4 : 
in else
in 2
4 : 
in else
4 : 
in else
4 : 
in else


0.25