In [8]:
#Tiara Devanathan 223025166
#Prashantan Darshan Naidoo 223009965
#Mahir Syed 223018507

#References:
#https://www.kaggle.com/code/alvations/n-gram-language-model-with-nltk
#https://www.geeksforgeeks.org/n-gram-language-modelling-with-nltk/


import nltk
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import pandas as pd
import numpy as np
from tabulate import tabulate

nltk.download('punkt')
nltk.download('punkt_tab')

def getCorpus(corpusFilePath, csvField = ''):
  import pandas as pd
  df = pd.read_csv(corpusFilePath)
  return list(df[csvField].apply(word_tokenize))

def train_model(corpus,n):
  #Generate training data and vocabulary
  training_data, vocab = padded_everygram_pipeline(n, corpus)

  # Train MLE model
  model = MLE(n)
  model.fit(training_data, vocab)
  return model

def generate_ngram(text,n):
  tokenized_text = word_tokenize(text)
  return list(ngrams(pad_both_ends(tokenized_text,n),n-1))

from tabulate import tabulate
def probability_matrix(model,text,n):
  # Row headers (given word)
  given_words = generate_ngram(text, n)
  # Column headers (next word)
  next_words = word_tokenize(text)

  table_data = []
  # Add probabilities to the table data
  for given in given_words:
      row = [f"{' '.join(given):<15}"]  # Row header (given word)
      for next in next_words:
          prob = model.score(next, given)  # Probability
          row.append(f"{prob:.9f}")  # Add the probability to the row
      table_data.append(row)

  # Add column headers
  headers = [" "] + next_words

  # Print the table
  print(tabulate(table_data, headers=headers, tablefmt="grid"))


def evaluate_perplexity(model, test_sentence, n):
    test_data = list(ngrams(pad_both_ends(word_tokenize(test_sentence), n), n))

    log_prob_sum = 0
    word_count = 0

    for ngram in test_data:
        context, word = tuple(ngram[:-1]), ngram[-1]

        prob = model.score(word, context)

        if prob > 0:  # Avoid log(0) issues
            log_prob_sum += np.log2(prob)
            word_count += 1
        else:
            # Assign a small probability to avoid log(0)
            log_prob_sum += np.log2(1e-10)
            word_count += 1

    perplexity = 2 ** (-log_prob_sum / word_count) if word_count > 0 else float('inf')

    print(f"Perplexity for '{test_sentence}': {perplexity}")

def main():
    text = input("Enter test text: ")
    n = int(input("Enter n-gram size: "))

    corpus = getCorpus('Donald-Tweets!.csv', 'Tweet_Text')

    model = train_model(corpus, n)

    probability_matrix(model, text, n)

    evaluate_perplexity(model, text, n)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Enter test text: Today we express our deepest gratitude to all those who have served in our armed forces.
Enter n-gram size: 5
+-------------------------------+------------+----------+-----------+-------+-----------+-------------+------+-------+---------+-------+--------+----------+----------+-------+---------+----------+-----------+
|                               |      Today |       we |   express |   our |   deepest |   gratitude |   to |   all |   those |   who |   have |   served |       in |   our |   armed |   forces |         . |
| <s> <s> <s> <s>               | 0.00122034 | 0        |      0    |     0 |         0 |           0 |    0 |     0 |     0   |     0 |      0 |        0 | 0        |     0 |       0 |        0 | 0.0447458 |
+-------------------------------+------------+----------+-----------+-------+-----------+-------------+------+-------+---------+-------+--------+----------+----------+-------+---------+----------+-----------+
| <s> <s> <s> Today             | 0  