## Roudranil Das

#### Roll: MDS202227

#### Email: [roudranil@cmi.ac.in](mailto:roudranil@cmi.ac.in)


# Setup


In [None]:
# Mount Google Drive (Colab only); the assignment data and saved models live there.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Link the Drive folder into the working directory so that the relative
# "assignment-2/..." paths used throughout the notebook resolve.
! ln -s /content/drive/MyDrive/nlp-assignments/assignment-2/ assignment-2

Mounted at /content/drive


In [None]:
# ! pip install sentencepiece
# ! sudo apt-get install texlive-latex-recommended
# ! sudo apt-get install dvipng texlive-latex-extra texlive-fonts-recommended
# ! wget http://mirrors.ctan.org/macros/latex/contrib/type1cm.zip
# ! unzip type1cm.zip -d /tmp/type1cm
# ! cd /tmp/type1cm/type1cm/ && sudo latex type1cm.ins
# ! sudo mkdir /usr/share/texmf/tex/latex/type1cm
# ! sudo cp /tmp/type1cm/type1cm/type1cm.sty /usr/share/texmf/tex/latex/type1cm
# ! sudo texhash
# ! apt install cm-super

# Imports


In [None]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

In [None]:
import gc
from tqdm.auto import tqdm
from time import time
import glob
import os
from collections import defaultdict
import pickle
import random
import concurrent.futures

In [None]:
import nltk
from nltk import pad_sequence, ngrams

In [None]:
# "punkt" holds the tokenizer data; presumably needed by nltk.tokenize.word_tokenize below.
# "book" is a collection of corpora; presumably fetched for the treebank sample used in Part 3.
nltk.download("punkt")
nltk.download("book")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/d

True

In [None]:
# Notebook-wide matplotlib defaults: figure size, LaTeX text rendering and a serif font.
plt.rcParams["figure.figsize"] = (8, 5)
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "serif"

In [None]:
# Directory where the per-file 4-gram models are pickled (one .pkl per cleaned text file).
MODEL_DIR = "assignment-2/saved-models"

# Extraction and cleaning


In [None]:
# Read the whole corpus into memory as a list of lines (line endings are kept).
with open("assignment-2/data/wikisent.txt", "r") as corpus_file:
    fulltext = list(corpus_file)

In [None]:
# Split the corpus into files of CHUNK_SIZE sentences so that each piece can be
# processed within the available memory. The chunk boundaries are derived from
# len(fulltext) rather than a hard-coded line count, so the final, shorter chunk
# is exported as well instead of being dropped.
CHUNK_SIZE = 43000
loop = tqdm(range(0, len(fulltext), CHUNK_SIZE))
for file_idx, start in enumerate(loop):
    loop.set_postfix_str(f"Exporting file {file_idx}")
    subtext = fulltext[start : start + CHUNK_SIZE]
    # the with-block closes the file on exit; no explicit close() is needed
    with open(f"assignment-2/data/english-{file_idx}.txt", "w") as f:
        f.writelines(subtext)

100%|██████████| 181/181 [00:16<00:00, 10.98it/s, Exporting file 180]


In [None]:
def clean_raw_input(filename: str):
    """
    Clean one raw text file and save the result under ``assignment-2/cleaned-data``.

    Preprocessing includes:
    - converting upper case letters to lower case
    - removing everything except the letters a-z, the space character and the new line

    TODO
    ----
    Convert everything that uses numpy to use pytorch and use gpu to speed up the computation

    Parameters
    ----------
    filename:   str
                Name of the file to be processed. A bare file name or a path is
                accepted; only the base name is used and the file is always read
                from ``assignment-2/data``.

    Output
    ------
    Saves the processed file with the same base name in ``assignment-2/cleaned-data``
    """
    # Callers pass glob results (paths that already contain the directory);
    # keep only the base name so the directory prefix is not applied twice.
    name = os.path.basename(filename)
    with open(f"assignment-2/data/{name}", "r") as f:
        text = f.read().lower()
    # One array element per character; viewing the array as int32 exposes the
    # unicode code of every character so the filter below is fully vectorised.
    text_array = np.array(list(text))
    del text
    unicode_array = text_array.view(np.int32)
    keep = (
        ((unicode_array >= 97) & (unicode_array <= 122))  # lower case letters
        | (unicode_array == 32)  # white space
        | (unicode_array == 10)  # new line
    )
    cleaned_text = "".join(text_array[keep])
    # the with-block closes the file on exit; no explicit close() is needed
    with open(f"assignment-2/cleaned-data/{name}", "w") as f:
        f.write(cleaned_text)
    # release the large intermediates before the next file is processed
    del text_array, unicode_array, keep, cleaned_text
    gc.collect()

In [None]:
# Preprocess every split file. Only the english-*.txt pieces are matched so that
# the unsplit corpus (wikisent.txt, stored in the same folder) is not cleaned too.

loop = tqdm(glob.glob("assignment-2/data/english-*.txt"))
for filename in loop:
    basename = os.path.basename(filename)
    loop.set_postfix_str(f"Parsing file {basename}")
    # clean_raw_input prepends "assignment-2/data/" itself, so hand it the bare name
    clean_raw_input(basename)

# Part 1

## 1. Display the total sentence count.

Answer: Since each file in this corpus contains one sentence on each line, the total number of sentences is equal to the total number of lines.


In [None]:
# Count sentences as lines of the split corpus files (one sentence per line).
# The files are listed from the same directory they are opened from, and each
# file is streamed line by line instead of being loaded into memory.
loop = tqdm(glob.glob("assignment-2/data/english-*.txt"))
sentence_count = 0
for filename in loop:
    with open(filename, "r") as f:
        sentence_count += sum(1 for _line in f)

print(f"\nTotal sentence count is {sentence_count}")

100%|██████████| 181/181 [01:38<00:00,  1.85it/s]


Total sentence count is 7783000





## 2. Build a 4 gram language model


In [None]:
def build_4gram_model(model, text):
    """
    Add the 4-grams of `text` to `model` and return it.

    Every sentence is padded on both sides ("<S>" on the left, "<E>" on the
    right) before its 4-grams are taken.

    Parameters
    ----------
    model:  mapping of str -> list, e.g. ``defaultdict(list)``
            Updated in place. A key is the first three words of a 4-gram joined
            by single spaces; the value is the list of fourth words seen after
            that prefix (repeats are kept). Missing keys must default to a list.
    text:   iterable of sentences
            Each sentence is either a string (split on whitespace) or an
            already tokenized sequence of words.

    Returns
    -------
    The same `model` object that was passed in.
    """
    for line in text:
        # accept raw strings as well as token lists (lists have no .split())
        tokens = line.split() if isinstance(line, str) else list(line)
        grams = ngrams(
            tokens,
            4,
            pad_left=True,
            pad_right=True,
            left_pad_symbol="<S>",
            right_pad_symbol="<E>",
        )
        # consume the generator directly instead of first collecting every
        # 4-gram of the whole text in one large list
        for gram in grams:
            model[" ".join(gram[:-1])].append(gram[-1])
    return model


def save_model(model, filename):
    """Pickle `model` into the file at `filename`, replacing any existing file."""
    with open(filename, "wb") as sink:
        pickle.Pickler(sink).dump(model)


def load_model(filename):
    """Return the object unpickled from the file at `filename`.

    Unpickling can run arbitrary code, so only load files written by this notebook.
    """
    with open(filename, "rb") as source:
        return pickle.Unpickler(source).load()

In [None]:
# Build one 4-gram model per cleaned file and cache it as a pickle. Files that
# already have a cached model are skipped, so the cell can be resumed.
loop = tqdm(glob.glob("assignment-2/cleaned-data/*.txt"))
built_models = set(map(os.path.basename, glob.glob(os.path.join(MODEL_DIR, "*.pkl"))))
for filename in loop:
    model_name = f"{os.path.splitext(os.path.basename(filename))[0]}.pkl"
    if model_name in built_models:
        continue
    loop.set_postfix_str(f"Building ngrams from {os.path.basename(filename)}")
    with open(filename) as f:
        # build_4gram_model calls .split() on every sentence, so it has to be
        # given strings: the tokens are joined back with single spaces, which
        # keeps the word_tokenize segmentation while satisfying that contract.
        text = [" ".join(nltk.tokenize.word_tokenize(line)) for line in f]
    model = build_4gram_model(defaultdict(list), text)
    save_model(model, os.path.join(MODEL_DIR, model_name))
    del text, model
    gc.collect()

100%|██████████| 181/181 [31:40<00:00, 10.50s/it, Building ngrams from english-180.txt]


I had to split the per-file models into 6 parts, then combine and save each part separately; otherwise the RAM was running out.


In [None]:
# List the per-file models once and in a fixed (sorted) order so that the
# partition is the same on every run; glob alone does not promise any order.
# The first five parts hold PART_SIZE models each, the last takes the remainder.
PART_SIZE = 30
model_paths = sorted(glob.glob(os.path.join(MODEL_DIR, "*.pkl")))
parts = [model_paths[k * PART_SIZE : (k + 1) * PART_SIZE] for k in range(5)]
parts.append(model_paths[5 * PART_SIZE :])
part1, part2, part3, part4, part5, part6 = parts

In [None]:
# Merge the per-file models of every part into one model and cache it on disk.
# The same file name is used for the existence check and for saving, so a part
# that has already been written is recognised and skipped on a re-run.
for idx, part in enumerate(parts):
    part_name = f"part-{idx}.pkl"
    part_path = f"assignment-2/part-models/{part_name}"
    if os.path.exists(part_path):
        print(f"{part_name} exists")
        continue
    loop = tqdm(part)
    model = defaultdict(list)
    for model_path in loop:
        loop.set_postfix_str(f"Combining model for {os.path.basename(model_path)}")
        partial_model = load_model(model_path)
        for key, values in partial_model.items():
            model[key].extend(values)
        # free the per-file model before the next one is loaded
        del partial_model
    save_model(model, part_path)
    del model
    gc.collect()

100%|██████████| 30/30 [02:10<00:00,  4.35s/it, Combining model for english-29.pkl]
100%|██████████| 30/30 [01:24<00:00,  2.83s/it, Combining model for english-59.pkl]
100%|██████████| 30/30 [01:09<00:00,  2.30s/it, Combining model for english-89.pkl]
100%|██████████| 30/30 [01:21<00:00,  2.71s/it, Combining model for english-119.pkl]
100%|██████████| 30/30 [01:42<00:00,  3.41s/it, Combining model for english-149.pkl]
100%|██████████| 31/31 [01:28<00:00,  2.87s/it, Combining model for english-180.pkl]


In [None]:
# Merge part models into the final model. Only the first N_PARTS_LOADED parts are
# used because the full set does not fit in memory; the paths are sorted so that
# the same parts are chosen on every run (glob does not promise any order).
N_PARTS_LOADED = 2
model = defaultdict(list)
loop = tqdm(sorted(glob.glob("assignment-2/part-models/*.pkl"))[:N_PARTS_LOADED])
for model_path in loop:
    loop.set_postfix_str(f"Combining model for {os.path.basename(model_path)}")
    partial_model = load_model(model_path)
    for key, values in partial_model.items():
        model[key].extend(values)
    del partial_model
    gc.collect()

100%|██████████| 2/2 [02:10<00:00, 65.23s/it, Combining model for part-1.pkl]


I have to use only 60 documents out of the 181 because I am unable to load all of them in memory.


## 3. Predict the next word


To predict the next word for a given trigram, we calculate the probability of each word that follows it, and then choose the word with the highest probability.

In order to generate a sentence, we calculate the probabilities of the words that follow a trigram, then we sample a word from that probability distribution.


In [None]:
def predict_next_word(trigram, model):
    """
    Return the most probable word after `trigram`.

    Parameters
    ----------
    trigram:    sequence of three words
                Joined with single spaces to form the lookup key.
    model:      mapping of prefix -> list of observed next words (with repeats)

    Returns
    -------
    The word with the highest relative frequency (the first one seen wins a
    tie), or the string "Not in the model" when the prefix has no entries.
    """
    followers = model.get(" ".join(trigram), [])
    if not followers:
        return "Not in the model"
    n_followers = len(followers)
    likelihood = {
        word: count / n_followers for word, count in Counter(followers).items()
    }
    return max(likelihood, key=likelihood.get)

In [None]:
# Test next word prediction with 5 trigram sequences
test_sequences = [
    phrase.split()
    for phrase in (
        "this is an",
        "is an example",
        "an example of",
        "example of how",
        "of how to",
    )
]

for sequence in test_sequences:
    print(f"Trigram: {sequence}, Predicted Next Word: {predict_next_word(sequence, model)}")

Trigram: ['this', 'is', 'an'], Predicted Next Word: active
Trigram: ['is', 'an', 'example'], Predicted Next Word: of
Trigram: ['an', 'example', 'of'], Predicted Next Word: increased
Trigram: ['example', 'of', 'how'], Predicted Next Word: gay
Trigram: ['of', 'how', 'to'], Predicted Next Word: get


## 4. Complete the sentence


In [None]:
def predict_next_word_sent(prefix, model):
    """
    Sample a next word for `prefix` according to the observed frequencies.

    Parameters
    ----------
    prefix: str
            Three words joined by single spaces (the model's key format).
    model:  mapping of prefix -> list of observed next words (with repeats)

    Returns
    -------
    A word drawn with np.random.choice, weighted by its relative frequency
    after `prefix`, or the string "Not in the model" when the prefix has no entries.
    """
    followers = model.get(prefix, [])
    if not followers:
        return "Not in the model"
    n_followers = len(followers)
    counts = Counter(followers)
    words = list(counts)
    weights = [counts[word] / n_followers for word in words]
    return np.random.choice(words, p=weights)


def generate_sentence(model, start_word, max_length=12):
    """
    Generate a sentence that starts with `start_word` by sampling from `model`.

    The sentence is seeded with two "<S>" padding tokens followed by
    `start_word`; the last three tokens always form the prefix for the next
    sample. Generation stops when the token list (padding included) reaches
    `max_length`, when the prefix is unknown to the model, or when the end
    token "<E>" is sampled.

    Returns
    -------
    The tokens, padding included, joined by single spaces.
    """
    tokens = ["<S>", "<S>", start_word]
    stop_markers = ("Not in the model", "<E>")
    while len(tokens) < max_length:
        candidate = predict_next_word_sent(" ".join(tokens[-3:]), model)
        if candidate in stop_markers:
            # either no prediction is available or the end token was generated
            break
        tokens.append(candidate)
    return " ".join(tokens)


# Generate one sentence for each of the five starting words
start_words = ["hello", "everyone", "america", "football", "chess"]
for first_word in start_words:
    sentence = generate_sentence(model, first_word)
    print("\n")
    print(sentence)



<S> <S> hello crazy world is the fourth album by american jazz


<S> <S> everyone is either happy in love lost in a ground


<S> <S> america is an old maple grove and beech forest in


<S> <S> football malaysia organizes and keeps the money


<S> <S> chess prodigies are children who are grieving the loss of


# Part 2


## 1. Use nltk to tag all sentences of the corpus


In [None]:
# Download the NLTK models for POS tagging (if not already downloaded).
# Both tagger packages are fetched; the tagging function defined further down
# uses the perceptron tagger.
nltk.download("averaged_perceptron_tagger")
nltk.download("maxent_treebank_pos_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/maxent_treebank_pos_tagger.zip.


True

We will use `PerceptronTagger` here: although it is not as accurate as `MaxENTTagger`, it is fast and still reasonably accurate.


In [None]:
# Load the POS tagger model once
from nltk.tag.perceptron import PerceptronTagger

# `tagger` is the instance that pos_tag_sentence passes to NLTK for tagging.
tagger = PerceptronTagger()
# NOTE(review): `pos_tagger` (the maxent model) is not referenced by any cell
# visible in this notebook section — confirm it is still needed before keeping the load.
pos_tagger = nltk.data.load("taggers/maxent_treebank_pos_tagger/english.pickle")

In [None]:
def pos_tag_sentence(sentence):
    """
    POS-tag the whitespace-separated words of `sentence`.

    Uses the notebook-level perceptron `tagger` through its public `tag`
    method instead of NLTK's private `_pos_tag` helper.

    Parameters
    ----------
    sentence:   str
                The sentence to tag; it is split on whitespace.

    Returns
    -------
    A list of (word, tag) pairs, one per word.
    """
    return tagger.tag(sentence.split())


def recompose_sentence_with_pos(pos_tags):
    """
    Reduce a tagged sentence to its sequence of POS tags.

    Parameters
    ----------
    pos_tags:   iterable of (word, tag) pairs

    Returns
    -------
    The tags joined by single spaces (no trailing newline); "" for no pairs.
    """
    tags_only = []
    for _word, tag in pos_tags:
        tags_only.append(tag)
    return " ".join(tags_only)

In [None]:
# Loop over all the cleaned files, replace every sentence by its POS tag sequence
# and save the result, one sentence per line.
# Every line is terminated with "\n": without it consecutive sentences run
# together and the last tag of one sentence fuses with the first tag of the
# next (e.g. "NN" followed by "DT" becomes "NNDT").
loop = tqdm(glob.glob("assignment-2/cleaned-data/*.txt"), position=0)
for filename in loop:
    basename = os.path.basename(filename)
    loop.set_description_str(f"File: {basename}")
    with open(filename, "r") as f:
        text = f.readlines()
    tagged_lines = [
        recompose_sentence_with_pos(pos_tag_sentence(line)) + "\n"
        # the inner bar gets its own row so it does not overwrite the file-level bar
        for line in tqdm(text, position=1, leave=False, desc=f"Tagging {basename}")
    ]
    # the with-block closes the file on exit; no explicit close() is needed
    with open(os.path.join("assignment-2", "pos-tagged-data", basename), "w") as f:
        f.writelines(tagged_lines)

## 2. Finding the POS structures and their frequencies


In [None]:
# When the recomposed sentences were saved the "\n" was missed, so the tag files
# are split again here according to the word count of each original sentence.
# One tag was produced per whitespace-separated word, so words are counted with
# split() exactly as the tagger saw them.
loop = tqdm(glob.glob("assignment-2/cleaned-data/*.txt"))
for filename in loop:
    basename = os.path.basename(filename)
    tagged_path = os.path.join("assignment-2", "pos-tagged-data", basename)
    with open(filename, "r") as f:
        sentence_lengths = [len(line.split()) for line in f]
    with open(tagged_path, "r") as f:
        tagged_text = f.read().split()
    if len(tagged_text) != sum(sentence_lengths):
        # Tags of neighbouring sentences were fused when the newline was missed,
        # so the sentence boundaries cannot be recovered from this file. Leave
        # it untouched rather than writing misaligned lines.
        print(
            f"Skipping {basename}: {len(tagged_text)} tags for "
            f"{sum(sentence_lengths)} words; the file has to be tagged again"
        )
        continue
    start = 0
    with open(tagged_path, "w") as f:
        for sent_len in sentence_lengths:
            # slice the tags (not the raw corpus) and advance by the sentence length
            f.write(" ".join(tagged_text[start : start + sent_len]) + "\n")
            start += sent_len

  0%|          | 0/181 [00:00<?, ?it/s]

In [None]:
# Count how often every POS structure (one line of a tagged file, including its
# line ending) occurs across all tagged files.
total_counter = Counter()
loop = tqdm(glob.glob("assignment-2/pos-tagged-data/*.txt"), position=0, unit="file")
for filename in loop:
    loop.set_description_str(f"File: {os.path.basename(filename)}")
    with open(filename, "r") as f:
        text = f.readlines()
    total_counter.update(text)

  0%|          | 0/181 [00:00<?, ?file/s]

### Total number of patterns


In [None]:
# number of distinct POS structures
print(len(total_counter))

8775


So there are 8775 unique POS structures


## 3. Most common POS structures


In [None]:
# The five most frequent POS structures as (structure, count) pairs; the
# structure strings keep the line ending they were read with.
total_counter.most_common(5)

[('IN NNDT VBD DT JJ JJ NN IN DT NN NNDT\n', 69696),
 ('NNDT VBD DT JJ JJ NN IN DT NN NNDT VBD DT\n', 64377),
 ('NN NN IN NNDT VBD DT JJ JJ NN\n', 53000),
 ('VBD DT JJ JJ NN IN DT NN NNDT VBD DT JJ JJ\n', 50016),
 ('JJ JJ NN IN DT NN NNDT VBD DT JJ JJ NN IN DT NN\n', 46728)]

In [None]:
# Persist the POS structure frequencies so they do not have to be recounted
with open("assignment-2/pos-counts.pkl", "wb") as counts_file:
    pickle.Pickler(counts_file).dump(total_counter)

# Part 3


## Generate 5 sentences corresponding to each of the most common structures for 5 different starting words


In [None]:
from nltk.corpus import brown, treebank

First we build a map from each POS tag to the words that carry that tag in the treebank corpus.


In [None]:
# Collect, for every POS tag, all the words (with repeats) that carry it in the
# tagged treebank sentences; the repeats encode how frequent a word is for a tag.
pos_word_map = defaultdict(list)
tagged_sentences = treebank.tagged_sents()
loop = tqdm(tagged_sentences, unit=" sentence")
for tagged_sentence in loop:
    for token, label in tagged_sentence:
        pos_word_map[label].append(token)

  0%|          | 0/3914 [00:00<?, ? sentence/s]

Some tags in the counted structures (for example `NNDT` or `INDT`) are not part of the Penn Treebank tagset: they are two tags fused together, which happened where tagged sentences were saved without a separating newline. Hence we need to map these tags to appropriate ones in the treebank tagset.


In [None]:
# Every tag that occurs in a counted structure maps to itself by default ...
unique_tags = []
for structure in total_counter.keys():
    unique_tags.extend(structure.split())
tag_mapper = {tag: tag for tag in set(unique_tags)}
# ... except the tags below, which are redirected to a treebank tag.
tag_mapper.update(
    {
        "INDT": "IN",
        "JJDT": "JJ",
        "NN": "NN",
        "NNDT": "NN",
        "NNNN": "NN",
        "NNS": "NNS",
        "NNSDT": "NNS",
        "VBNDT": "VBN",
    }
)

In [None]:
def generate_word_by_pos(tag, mapper):
    """
    Given a POS tag, sample a word based on the frequency of the words with that tag.

    Parameters
    ----------
    tag:    str
            The POS tag. It is first translated through the notebook-level
            `tag_mapper`; a tag that has no entry there is used unchanged.
    mapper: mapping of POS tag -> list of words (with repeats)

    Returns
    -------
    A word drawn with np.random.choice, weighted by its relative frequency for
    the tag, or None when `mapper` has no words for the tag.
    """
    # fall back to the tag itself instead of raising KeyError for unmapped tags
    lookup_tag = tag_mapper.get(tag, tag)
    possible_words = mapper.get(lookup_tag, [])
    if not possible_words:
        return None
    total_count = len(possible_words)
    counts = Counter(possible_words)
    words = list(counts)
    weights = [counts[word] / total_count for word in words]
    return np.random.choice(words, p=weights)


def generate_sentence_by_pos(tags, mapper):
    """
    Generate a sentence that follows the POS structure `tags`.

    Parameters
    ----------
    tags:   iterable of str
            The POS tags, in sentence order.
    mapper: mapping of POS tag -> list of words (with repeats)

    Returns
    -------
    One sampled word per tag, joined by single spaces. Tags for which no word
    is available are skipped.
    """
    words = (generate_word_by_pos(tag, mapper) for tag in tags)
    # generate_word_by_pos gives None when a tag has no candidate words;
    # str.join would raise a TypeError on None, so those positions are dropped
    return " ".join(word for word in words if word is not None)

In [None]:
# The five most frequent POS structures (counts dropped) and the five starting words
common_pos_structures = [pair[0] for pair in total_counter.most_common(5)]
starting_words = "hello everyone america football chess".split()

### Sentences with just POS tags


In [None]:
# Show each starting word in front of every common POS structure. The structures
# end with their own line ending, hence end="" on the inner print.
for start_word in starting_words:
    print(f"Starting word: {start_word}\n")
    for rank, pos_structure in enumerate(common_pos_structures, start=1):
        print(f"({rank}) {start_word} {pos_structure}", end="")
    print("==========================================================\n")

Starting word: hello

(1) hello IN NNDT VBD DT JJ JJ NN IN DT NN NNDT
(2) hello NNDT VBD DT JJ JJ NN IN DT NN NNDT VBD DT
(3) hello NN NN IN NNDT VBD DT JJ JJ NN
(4) hello VBD DT JJ JJ NN IN DT NN NNDT VBD DT JJ JJ
(5) hello JJ JJ NN IN DT NN NNDT VBD DT JJ JJ NN IN DT NN

Starting word: everyone

(1) everyone IN NNDT VBD DT JJ JJ NN IN DT NN NNDT
(2) everyone NNDT VBD DT JJ JJ NN IN DT NN NNDT VBD DT
(3) everyone NN NN IN NNDT VBD DT JJ JJ NN
(4) everyone VBD DT JJ JJ NN IN DT NN NNDT VBD DT JJ JJ
(5) everyone JJ JJ NN IN DT NN NNDT VBD DT JJ JJ NN IN DT NN

Starting word: america

(1) america IN NNDT VBD DT JJ JJ NN IN DT NN NNDT
(2) america NNDT VBD DT JJ JJ NN IN DT NN NNDT VBD DT
(3) america NN NN IN NNDT VBD DT JJ JJ NN
(4) america VBD DT JJ JJ NN IN DT NN NNDT VBD DT JJ JJ
(5) america JJ JJ NN IN DT NN NNDT VBD DT JJ JJ NN IN DT NN

Starting word: football

(1) football IN NNDT VBD DT JJ JJ NN IN DT NN NNDT
(2) football NNDT VBD DT JJ JJ NN IN DT NN NNDT VBD DT
(3) football NN N

### Sentences with words in place of POS tags


In [None]:
# Realise each of the most common POS structures as words, once per starting word.
for start_word in starting_words:
    print(f"Starting word: {start_word}\n")
    for i, pos_structure in enumerate(common_pos_structures):
        # split() drops the line ending (if there is one) and never yields an
        # empty tag, unlike cutting off the last character and splitting on " "
        sentence = generate_sentence_by_pos(pos_structure.split(), pos_word_map)
        print(f"({i+1}) {start_word} {sentence}")
    print("=" * 120 + "\n")

Starting word: hello

(1) hello in repayment had the satisfactory low-ball director of that application takeover
(2) hello watch remarked the front last year of the rest market said the
(3) hello cardboard language by racket boosted the such line-item assurance
(4) hello said The composite new compensation on a fee group did the great available
(5) hello second-largest compelling number for the session pit were a skeptical drunk contract If the quarter

Starting word: everyone

(1) everyone for referendum said the top economic sort of the research death
(2) everyone markdown fell the inoperative big article In an time survey said the
(3) everyone project plan by merger returned the recent privileged speech
(4) everyone said the % mental trading in the test eye was a 100-share battery-operated
(5) everyone big due office of a name third-quarter said the relative willing lead for the equity

Starting word: america

(1) america in % was the Other intricate band by The story videocassette
