In [None]:
# import os
# os.kill(os.getpid(), 9)

In [None]:
# Get data
with open('/content/drive/MyDrive/Colab Notebooks/NLP/data/NLP_Dataset2.txt', 'r') as f:
  text = f.read().replace('\n', '')

print(text)

The weather was cold and gray as usual at this time of year. The trees were all leafless, with fall now just a memory. Christmas was just a few weeks away, and all the kids were looking forward to staying home from school for a few weeks and to the “big payoff” on Christmas morning. Not having to go to school was good, but usually by the time vacation was over, going to school was a big relief. Back to the friends to compare “loot” from Christmas and, to reestablish those fragile ties that hold kids together. At school, students were praised for doing good work, not belittled for each and every mistake. No one there was fighting, and being too loud was against the rules. Right now, the world outside of home was more safe and structured, not chaotic, scary and loud. Even when bad things did happen, it was always far away and nothing to be too concerned about. With Dad often having too much to drink, and Mom just mad at everyone all the time, being home was not usually a very pleasant ex

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [None]:
# Tokenize sentences
from nltk.tokenize import sent_tokenize, word_tokenize

sentences = sent_tokenize(text)
print(sentences)

['The weather was cold and gray as usual at this time of year.', 'The trees were all leafless, with fall now just a memory.', 'Christmas was just a few weeks away, and all the kids were looking forward to staying home from school for a few weeks and to the “big payoff” on Christmas morning.', 'Not having to go to school was good, but usually by the time vacation was over, going to school was a big relief.', 'Back to the friends to compare “loot” from Christmas and, to reestablish those fragile ties that hold kids together.', 'At school, students were praised for doing good work, not belittled for each and every mistake.', 'No one there was fighting, and being too loud was against the rules.', 'Right now, the world outside of home was more safe and structured, not chaotic, scary and loud.', 'Even when bad things did happen, it was always far away and nothing to be too concerned about.', 'With Dad often having too much to drink, and Mom just mad at everyone all the time, being home was n

In [None]:
# Process data
for i in range(len(sentences)):
  sentences[i] = sentences[i].replace(',',' ,')
  sentences[i] = sentences[i].replace('.',' .')
  sentences[i] = sentences[i].replace('“',' " ')
  sentences[i] = sentences[i].replace('”', ' " ')
  sentences[i] = sentences[i].lower()
  sentences[i] = '<s> ' + sentences[i] + ' </s>'

print(sentences)

['<s> the weather was cold and gray as usual at this time of year . </s>', '<s> the trees were all leafless , with fall now just a memory . </s>', '<s> christmas was just a few weeks away , and all the kids were looking forward to staying home from school for a few weeks and to the  " big payoff "  on christmas morning . </s>', '<s> not having to go to school was good , but usually by the time vacation was over , going to school was a big relief . </s>', '<s> back to the friends to compare  " loot "  from christmas and , to reestablish those fragile ties that hold kids together . </s>', '<s> at school , students were praised for doing good work , not belittled for each and every mistake . </s>', '<s> no one there was fighting , and being too loud was against the rules . </s>', '<s> right now , the world outside of home was more safe and structured , not chaotic , scary and loud . </s>', '<s> even when bad things did happen , it was always far away and nothing to be too concerned about 

In [None]:
# Function to tokenize words in sentences
def tokenize_word(sentences):
  sentences_tokenized = []
  for sentence in sentences:
    words = sentence.split()
    sentences_tokenized.append(words)
  return sentences_tokenized


In [None]:
# Tokenize words in sentences
sentences = tokenize_word(sentences)

In [None]:
# Function to count the amount of word appeared
from collections import defaultdict
def get_count(sentences):
  vocab = defaultdict(lambda:0)

  for sentence in sentences:
    for word in sentence:
      vocab[word] += 1

  return vocab

In [None]:
# Count word
word_count = get_count(sentences)

In [None]:
word_count['.it']

1

In [None]:
# Split data into 3
train_size = int(len(sentences)*0.7)
valid_size = int(len(sentences)*0.1)

train = sentences[:train_size]
valid = sentences[train_size:train_size+valid_size]
test = sentences[train_size+valid_size:]

print(len(train), len(valid), len(test))

237 34 69


In [None]:
def build_ngram_backoff(data, n):
    '''
    Parameters:
    data (list of lists): each list is a sentence of the text
    n (int): size of the n-gram

    Returns:
    proba (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }


    '''
    total_number_words = 0
    counts = defaultdict(lambda: defaultdict(lambda: 0.0))

    for sentence in data:
      sentence = tuple(sentence)
      ## FILL CODE
      # dict can be indexed by tuples
      # store in the same dict all the ngrams
      # by using the context as a key and the word as a value
      for i in range(len(sentence)):
        total_number_words +=1
        for k in range(n):
          if i-k < 0:
            break
          counts[sentence[i-k:i]][sentence[i]] +=1

    proba  = defaultdict(lambda: defaultdict(lambda: 0.0))
    # Build the probabilities from the counts
    # Be careful with how you normalize!

    for context in counts.keys():
    ## FILL CODE
      denom =0
      for w in counts[context].keys():
        denom += counts[context][w]
      for w in counts[context].keys():
        proba[context][w] = counts[context][w]/denom

    return proba

In [None]:
def build_ngram_interpolation(data, n, k, lambda_values):
    '''
    Parameters:
    data (list of lists): each list is a sentence of the text
    n (int): size of the n-gram
    k (float): smoothing parameter for add-k smoothing
    lambda_values (list): interpolation weights for different n-gram orders

    Returns:
    proba (dictionary of dictionary)
    {
        context: {word: probability of this word given context}
    }
    '''

    total_number_words = 0
    counts = defaultdict(lambda: defaultdict(lambda: 0.0))

    for sentence in data:
        sentence = tuple(sentence)
        for i in range(len(sentence)):
            total_number_words += 1
            for k in range(n):
                if i - k < 0:
                    break
                counts[sentence[i - k:i]][sentence[i]] += 1

    proba = defaultdict(lambda: defaultdict(lambda: 0.0))

    for context in counts.keys():
        denom = sum(counts[context].values()) + k * len(set([word for context in counts.keys() for word in context]))
        for w in counts[context].keys():
            proba[context][w] = (counts[context][w] + k) / denom

    # Interpolation
    for context in counts.keys():
        for i in range(1, n + 1):
            sub_context = context[-i:]
            lambda_i = lambda_values[i - 1]
            denom_i = sum(counts[sub_context].values()) + k * len(set([word for sub_context in counts.keys() for word in sub_context]))
            for w in counts[sub_context].keys():
                proba[context][w] += lambda_i * (counts[sub_context][w] + k) / denom_i

    return proba


In [None]:
n = 4
print("build ngram model with n = ", n)
model = build_ngram_backoff(train, n)

build ngram model with n =  4


In [None]:
n = 4
print("build ngram model with n = ", n)
model = build_ngram_interpolation(train, n, 0.2, [0.1, 0.2, 0.3, 0.4])

build ngram model with n =  4


In [None]:
def pretty_print_nested_dict(d, indent=0):
    for key, value in d.items():
        if isinstance(value, defaultdict):
            print("  " * indent + f"{key}:")
            pretty_print_nested_dict(value, indent + 1)
        else:
            print("  " * indent + f"{key}: {value}")

def print_all_key_value_pairs(d, indent=0, prefix=""):
    for key, value in d.items():
        if isinstance(value, defaultdict):
            print_all_key_value_pairs(value, indent + 1, prefix + f"{key}.")
        else:
            print("  " * indent + f"{prefix}{key}: {value}")

In [None]:
def get_prob(model, context, w):
    '''
    Parameters:
    model (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }
    context (list of strings): a sentence
    w(string): the word we need to find it's probability given the context

    Retunrs:
    prob(float): probability of this word given the context
    '''

    # code a recursive function over
    # smaller and smaller context
    # to compute the backoff model

    ## FILL CODE

    if context in model and w in model[context]:
        return model[context][w]
    else:
        return 0.4*get_prob(model, context[1:], w)

In [None]:
def perplexity(model, data, n):
    '''
    Parameters:
    model (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }
    data (list of lists): each list is a sentence of the text
    n(int): size of the n-gram

    Retunrs:
    prep(float): the preplexity of the model
    '''
    ## FILL CODE
    perp, T = 0.0, 0
    for sentence in data:
        sentence = tuple(sentence)
        for i in range(1, len(sentence)):
            nc = min(n-1, i)
            context = sentence[i-nc:i]
            perp += -math.log(get_prob(model, context, sentence[i]))
            T += 1
    perp = math.exp(perp/T)
    return perp

In [None]:
print("The perplexity is", perplexity(model, valid, n=2))

RecursionError: ignored

In [None]:
def get_proba_distrib(model, context):
    ## need to get the the words after the context and their probability of appearance
    ## after this context
    '''
    Parameters:
    model (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }
    context (list of strings): the sentence we need to find the words after it and
    thier probabilites

    Retunrs:
    words_and_probs(dic): {word: probability of word given context}

    '''
    # code a recursive function over context
    # to find the longest available ngram

    ## FILL CODE

    if context in model:
        return model[context]
    else:
        return get_proba_distrib(model, context[1:])

In [None]:
def generate(model, context):
    '''
    Parameters:
    model (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }

    Retunrs:
    sentence (list of strings): a sentence sampled according to the language model.


    '''
    # generate a sentence. A sentence starts with a <s> and ends with a </s>
    # Possiblly a use function is:
    # np.random.choice(x, 1, p = y)

    # where x is a list of things to sample from
    # and y is a list of probability (of the same length as x)
    sentence = ["<s>"]+context
    print(type(sentence))
    print(sentence)
    while sentence[-1] != "</s>" and len(sentence)<100:
        ## FILL CODE
        proba = get_proba_distrib(model, tuple(sentence))
        w = np.random.choice((list(proba.keys())), 1, p = list(proba.values()))
        sentence.append(w[0])

    result = ''
    for word in sentence:
        result = result + ' ' + word
    return result

In [None]:
print("Generated sentence: ",generate(model, ['i', 'was', 'just']))

<class 'list'>
['<s>', 'i', 'was', 'just']
Generated sentence:   <s> i was just a few weeks and to the " big payoff " on christmas morning . </s>


In [None]:
sesss = ["<s>", "i"]
ppppp = get_proba_distrib(model, tuple(sesss))
ppppp.keys()

dict_keys(['was', 'can', 'look', 'swallow', 'had', 'feel', 'am', 'want'])

#Break

In [None]:
tokens = word_tokenize(' '.join(train))
print(tokens)
print(len(tokens))

['The', 'weather', 'was', 'cold', 'and', 'gray', 'as', 'usual', 'at', 'this', 'time', 'of', 'year', '.', 'The', 'trees', 'were', 'all', 'leafless', ',', 'with', 'fall', 'now', 'just', 'a', 'memory', '.', 'Christmas', 'was', 'just', 'a', 'few', 'weeks', 'away', ',', 'and', 'all', 'the', 'kids', 'were', 'looking', 'forward', 'to', 'staying', 'home', 'from', 'school', 'for', 'a', 'few', 'weeks', 'and', 'to', 'the', '“', 'big', 'payoff', '”', 'on', 'Christmas', 'morning', '.', 'Not', 'having', 'to', 'go', 'to', 'school', 'was', 'good', ',', 'but', 'usually', 'by', 'the', 'time', 'vacation', 'was', 'over', ',', 'going', 'to', 'school', 'was', 'a', 'big', 'relief', '.', 'Back', 'to', 'the', 'friends', 'to', 'compare', '“', 'loot', '”', 'from', 'Christmas', 'and', ',', 'to', 'reestablish', 'those', 'fragile', 'ties', 'that', 'hold', 'kids', 'together', '.', 'At', 'school', ',', 'students', 'were', 'praised', 'for', 'doing', 'good', 'work', ',', 'not', 'belittled', 'for', 'each', 'and', 'every

In [None]:
from collections import Counter

word_freq = Counter(tokens).most_common(100)

print(len(word_freq))

vocab = [w for w, _ in word_freq]
print(vocab)

100
['the', ',', '.', 'of', 'a', 'to', 'and', 'was', 'in', 'I', 'on', 'that', 'at', 'his', 'is', 'as', 'with', 'by', 'The', 'their', 'out', 'my', 'or', 'for', 'from', 'have', 'one', 'it', 'her', 'be', 'like', 'are', 'he', 'all', '?', 'would', 'an', 'but', 'up', 'you', 'this', 'over', 'not', 'He', 'home', 'him', 'she', 'children', 'me', 'were', 'down', 'back', 'there', 'been', '``', 'ride', 'time', 'Mom', 'who', 'off', 'had', 'It', 'them', 'into', 'girl', 'too', 'about', 'house', 'our', 'people', 'small', 'other', 'even', 'door', 'around', 'A', "'s", 'away', '“', '”', 'being', 'Dad', 'room', 'its', 'through', 'As', 'face', 'corner', 'just', 'each', 'outside', 'more', 'day', 'along', 'little', 'where', 'almost', 'we', 'could', 'if']


In [None]:
new_tokens = [_ if _ in vocab else '<unk>' for _ in tokens]
print(new_tokens)

['The', '<unk>', 'was', '<unk>', 'and', '<unk>', 'as', '<unk>', 'at', 'this', 'time', 'of', '<unk>', '.', 'The', '<unk>', 'were', 'all', '<unk>', ',', 'with', '<unk>', '<unk>', 'just', 'a', '<unk>', '.', '<unk>', 'was', 'just', 'a', '<unk>', '<unk>', 'away', ',', 'and', 'all', 'the', '<unk>', 'were', '<unk>', '<unk>', 'to', '<unk>', 'home', 'from', '<unk>', 'for', 'a', '<unk>', '<unk>', 'and', 'to', 'the', '“', '<unk>', '<unk>', '”', 'on', '<unk>', '<unk>', '.', '<unk>', '<unk>', 'to', '<unk>', 'to', '<unk>', 'was', '<unk>', ',', 'but', '<unk>', 'by', 'the', 'time', '<unk>', 'was', 'over', ',', '<unk>', 'to', '<unk>', 'was', 'a', '<unk>', '<unk>', '.', '<unk>', 'to', 'the', '<unk>', 'to', '<unk>', '“', '<unk>', '”', 'from', '<unk>', 'and', ',', 'to', '<unk>', '<unk>', '<unk>', '<unk>', 'that', '<unk>', '<unk>', '<unk>', '.', '<unk>', '<unk>', ',', '<unk>', 'were', '<unk>', 'for', '<unk>', '<unk>', '<unk>', ',', 'not', '<unk>', 'for', 'each', 'and', '<unk>', '<unk>', '.', '<unk>', 'one'

In [None]:
from nltk.util import ngrams
fourgrams = list(ngrams(new_tokens, 4))

In [None]:
from collections import defaultdict, Counter

# Assuming trigrams is a list of 4-grams
fourgram_count = defaultdict(Counter)

for fourgram in fourgrams:
    # Assuming fourgram is a tuple (word1, word2, word3, word4)
    key = (fourgram[0], fourgram[1], fourgram[2])
    value = fourgram[3]
    fourgram_count[key][value] += 1


In [None]:
fourgram_count

defaultdict(collections.Counter,
            {('The', '<unk>', 'was'): Counter({'<unk>': 2, 'a': 1}),
             ('<unk>',
              'was',
              '<unk>'): Counter({'and': 3,
                      ',': 2,
                      'the': 1,
                      'to': 1,
                      'on': 1,
                      '.': 1,
                      'out': 3,
                      '<unk>': 4,
                      'for': 1,
                      'like': 1,
                      'in': 1,
                      'with': 1,
                      'into': 1,
                      'by': 1,
                      'a': 1}),
             ('was', '<unk>', 'and'): Counter({'<unk>': 2, 'he': 1}),
             ('<unk>',
              'and',
              '<unk>'): Counter({'as': 2,
                      ',': 6,
                      '.': 6,
                      'to': 5,
                      'in': 2,
                      'where': 1,
                      'the': 2,
                      

In [None]:
a = (3)
list(a[1:])

TypeError: ignored