1 - **process_data**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import io, sys, math, re

from collections import defaultdict
from nltk.probability import FreqDist
from typing import Sequence
from nltk.util import ngrams

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

`read text files`

In [44]:
# data_loader
def load_data(filename):
    '''
    parameters:
    filename (string): datafile

    Returns:
    data (list of lists): each list is a sentence of the text
    vocab (dictionary): {word: no of times it appears in the text}
    '''
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    vocab = defaultdict(lambda:0)
    for line in fin:
        sentence = line.split()
        data.append(sentence)
        for word in sentence:
            vocab[word] += 1
    return data, vocab

In [49]:
print("load training set..")
print("\n")
train_data, vocab = load_data("/content/drive/MyDrive/Master (MCS) 2023-2024/Course/Natural Language Processing/Mini project/shakespeare.txt")
print(train_data[0])
print("\n")
print("vocab :",vocab['Muse'])
# print("load validation set")
# valid_data, _ = load_data("valid1.txt")

load training set..


['O', 'for', 'a', 'Muse', 'of', 'fire,', 'that', 'would', 'ascend']


vocab : 4


In [50]:
def remove_rare_words(data, vocab, mincount = 1):
    '''
    Parameters:
    data (list of lists): each list is a sentence of the text
    vocab (dictionary): {word: no of times it appears in the text}
    mincount(int): the minimum count

    Returns:
    data_with_unk(list of lists): data after replacing rare words with <unk> token
    '''
    # replace words in data that are not in the vocab
    # or have a count that is below mincount
    data_with_unk = []
    ## FILL CODE
    for sentence in data:
        sentence_with_unk = []
        for word in sentence:
            if word in vocab and vocab[word]>mincount:
                sentence_with_unk.append(word)
            else:
                sentence_with_unk.append('<unk>')
        data_with_unk.append(sentence_with_unk)

    return data_with_unk


In [51]:
print("remove rare words")
train_data = remove_rare_words(train_data, vocab, mincount = 1)

remove rare words


In [52]:

#N-gram
#Model 1

def build_ngram(data, n):
  total_number_words = 0
  counts = defaultdict(lambda: defaultdict(lambda: 0.0))
  for sentence in data:
    sentence = tuple(sentence)
    for i in range(len(sentence)):
      total_number_words += 1
      for k in range(n):
        if i-k < 0:
          break
        counts[sentence[i-k:i]][sentence[i]] += 1
  pro = defaultdict(lambda: defaultdict(lambda: 0.0))
  for context in counts.keys():
    denom = 0
    for w in counts[context].keys():
      denom += counts[context][w]
    for w in counts[context].keys():
      pro[context][w] = counts[context][w]/denom
  return pro
n_gram = 4
print("build ngram model with n = ", n_gram)
LM1 = build_ngram(train_data, n_gram)
LM1

build ngram model with n =  4


defaultdict(<function __main__.build_ngram.<locals>.<lambda>()>,
            {(): defaultdict(<function __main__.build_ngram.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'O': 0.0014953701041007649,
                          'for': 0.006077337474358237,
                          'a': 0.012576446003719254,
                          'Muse': 7.668564636414179e-05,
                          'of': 0.018577097831713348,
                          'fire,': 0.00013419988113724815,
                          'that': 0.010122505320066717,
                          'would': 0.0023389122141063245,
                          '<unk>': 0.1323402542129177,
                          'The': 0.005042081248442323,
                          'brightest': 3.8342823182070895e-05,
                          'heaven': 0.00046011387818485074,
                          'invention,': 3.8342823182070895e-05,
                          'A': 0.0016103985736469777,
                          'kingdom': 9

In [53]:
#n-gram
#Model 2 with smoothing
def build_ngram_smooth(data, n):
    total_number_words = 0
    counts = defaultdict(lambda: defaultdict(lambda: 0.0))
    for sentence in data:
        sentence = tuple(sentence)
        for i in range(len(sentence)):
            total_number_words +=1
            for k in range(n):
                if i-k < 0:
                    break
                counts[sentence[i-k:i]][sentence[i]] +=1
    proba  = defaultdict(lambda: defaultdict(lambda: 0.0))
    for context in counts.keys():

        denom =0
        v_val = 0
        for w in counts[context].keys():
            denom += counts[context][w]
            v_val += 1
        for w in counts[context].keys():
            proba[context][w] = (counts[context][w] + 0.01) / (denom + 0.01* v_val)
    return proba

n_gram=4
print("build N-gram model with smooth n = ", n_gram)
LM2 = build_ngram_smooth(train_data, n_gram)
LM2

build N-gram model with smooth n =  4


defaultdict(<function __main__.build_ngram_smooth.<locals>.<lambda>()>,
            {(): defaultdict(<function __main__.build_ngram_smooth.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'O': 0.0014944613358656735,
                          'for': 0.006073057147580786,
                          'a': 0.012567383424448667,
                          'Muse': 7.682079165262595e-05,
                          'of': 0.018563619780376826,
                          'fire,': 0.00013429270560720897,
                          'that': 0.010115248429053125,
                          'would': 0.002337382740532891,
                          '<unk>': 0.132243065582542,
                          'The': 0.005038562696398292,
                          'brightest': 3.850618234957061e-05,
                          'heaven': 0.0004599668846831794,
                          'invention,': 3.850618234957061e-05,
                          'A': 0.0016094051637748396,
                          'ki

In [55]:
#perplexity LM1
def get_pro1(LM1, context, word):
  if context in LM1 and word in LM1[context]:
    return LM1[context][word]
  else:
    return 0.4*get_pro1(LM1, context[1:], word)

def perplexity1(LM1, data, n):
  pp1, T = 0.0, 0
  for sentence in data:
    sentence = tuple(sentence)
    for i in range(1, len(sentence)):
      nc = min(n-1, i)
      context = sentence[i-nc:i]
      pp1 += -math.log(get_pro1(LM1, context, sentence[i]))
      T += 1
  pp1 =  math.exp(pp1/T)
  return pp1


In [54]:
#perplexity LM2
def get_pro2(LM2, context, word):
  if context in LM2 and remove_rare_words in LM2[context]:
    return LM2[context][word]
  else:
    return 0.4*get_pro2(LM2, context[1:], word)

def perplexity2(LM2, data, n):
  pp2, T = 0.0, 0
  for sentence in data:
        sentence = tuple(sentence)
        for i in range(1, len(sentence)):
            nc = min(n-1, i)
            context = sentence[i-nc:i]
            perp += -math.log(get_pro2(LM2, context, sentence[i]))
            T += 1
  pp2 = math.exp(pp2/T)
  return pp2

In [None]:
# COMPUTE PERPLEXITY ON VALIDATION SET
print("The perplexity of model LM1 is : ", perplexity1(LM1 , train_data, n=30))
print("The perplexity of model LM2 is : ", perplexity1(LM2, train_data, n=30))

In [71]:
def get_proba_distrib(model, context):
    ## need to get the the words after the context and their probability of appearance
    ## after this context

    # code a recursive function over context
    # to find the longest available ngram

    ## FILL CODE

    if context in model:
        return model[context]
    else:
        return get_proba_distrib(model, context[1:])

In [77]:
def generate(model):

    # generate a sentence. A sentence starts with a <s> and ends with a </s>
    # Possiblly a use function is:
    # np.random.choice(x, 1, p = y)

    # where x is a list of things to sample from
    # and y is a list of probability (of the same length as x)
    sentence = ["<s>"]
    while sentence[-1] != "</s>" and len(sentence)<100:
        ## FILL CODE
        proba = get_proba_distrib(model, tuple(sentence))
        w = np.random.choice((list(proba.keys())), 1, p = list(proba.values()))
        sentence.append(w[0])
    return sentence

In [78]:
# GENERATE A SENTENCE FROM THE MODEL

print("Generated sentence: ",generate(LM1))
print("Generated sentence: ",generate(LM2))

Generated sentence:  ['<s>', 'BERTRAM,', 'and', 'the', 'two', 'FRENCH', 'LORDS,', 'with', 'a', '<unk>', '<unk>', 'I', 'throw', 'all', 'care', 'I', 'have', 'had', 'to', 'even', 'your', 'content', 'I', 'wish', 'well.', "'Tis", '<unk>', 'labour', 'on', 'death,', 'that', '<unk>', 'on', 'men,', 'monsieur?', 'A', 'word', 'with', 'you.', 'Our', 'Italy', 'for', '<unk>', 'a', '<unk>', 'for', '<unk>', 'name', 'strikes', 'more', 'can', '<unk>', 'in', 'which', 'I', '<unk>', 'thy', 'dear', 'love', 'to', '<unk>', 'thy', '<unk>', 'in', 'their', '<unk>', 'Yet', 'if', 'I', 'knew', 'him.', 'What', 'a', '<unk>', 'slave', 'is', 'this!', 'the', 'devil', 'should', 'move', 'me', 'to', 'undertake', 'the', '<unk>', 'very', 'poor', 'fellow.', 'his', 'noble', '<unk>', 'and,', 'believe', "'t,", 'Helen,', "that's", 'dead,', 'my', 'Queen.']
Generated sentence:  ['<s>', 'heel', 'of', 'that,', '<unk>', 'him.', 'You', 'shall', 'see', 'his', 'fall', 'to-night;', 'for', 'indeed', 'he', 'is', 'hath', 'reference', 'to', '