# N-grams

## Import librairies 

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/akaagi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import math
from collections import Counter, defaultdict
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
pd.set_option('display.max_colwidth', None)
import re

## Load dataset

In [3]:
# df_kaggle_poem_dataset = pd.read_csv("../data_raw/kaggle_poem_dataset.csv")
# df_kaggle_poem_dataset.drop(columns=["Unnamed: 0", "Author", "Title", "Poetry Foundation ID"], inplace=True)
# df_kaggle_poem_dataset.rename(columns={"Content": "poem"}, inplace=True)
# df_kaggle_poem_dataset.tail(3)
df_kaggle_poem_dataset = pd.read_csv("../data_raw/our_dataset.csv")
df_kaggle_poem_dataset.drop(columns=["Unnamed: 0", "topic"], inplace=True)
df_kaggle_poem_dataset.rename(columns={"Content": "poem"}, inplace=True)

# Define the remove_non_english function
def remove_non_english(text):
    # Regular expression to remove non-English alphabet characters
    english_only = re.sub(r'[^a-zA-Z\s.,;?!\'-]', '', text)
    return english_only

# Define the create_corpus function
def create_corpus(text):
    # Tokenize text into sentences
    sentences = text.split('\n')
    # Remove empty strings
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Join sentences into a corpus
    corpus = '\n'.join(sentences)  # Keep newline characters
    return corpus

# Read the CSV file
csv_path = "/home/akaagi/Desktop/SCIA/nlp1/NLP1_Project/data_raw/our_dataset.csv"
df = pd.read_csv(csv_path)

# Apply remove_non_english function to poem column
df['poem'] = df['poem'].apply(remove_non_english)

# Apply create_corpus function to poem column
df['poem'] = df['poem'].apply(create_corpus)

# Remove \xa0
df['poem'] = df['poem'].str.replace('\xa0', '')

# Apply lowercase to the poem column
df['poem'] = df['poem'].str.lower()

df_kaggle_poem_dataset.tail(3)

Unnamed: 0,poem
10061,"I imagined the atmosphere would be clear, \nshot with pristine light, \nnot this sulphurous haze, \nthe air ionized as before a thunderstorm. \n\nMany have pictured a river here, \nbut no one mentioned all the boats, \ntheir benches crowded with naked passengers, \neach bent over a writing tablet. \n\nI knew I would not always be a child \nwith a model train and a model tunnel, \nand I knew I would not live forever, \njumping all day through the hoop of myself. \n\nI had heard about the journey to the other side \nand the clink of the final coin \nin the leather purse of the man holding the oar, \nbut how could anyone have guessed \n\nthat as soon as we arrived \nwe would be asked to describe this place \nand to include as much detail as possible— \nnot just the water, he insists, \n\nrather the oily, fathomless, rat-happy water, \nnot simply the shackles, but the rusty, \niron, ankle-shredding shackles— \nand that our next assignment would be \n\nto jot down, off the tops of our heads, \nour thoughts and feelings about being dead, \nnot really an assignment, \nthe man rotating the oar keeps telling us— \n\nthink of it more as an exercise, he groans, \nthink of writing as a process, \na never-ending, infernal process, \nand now the boats have become jammed together, \n\nbow against stern, stern locked to bow, \nand not a thing is moving, only our diligent pens."
10062,"Miss Nancy Ellicott\nStrode across the hills and broke them,\nRode across the hills and broke them—\nThe barren New England hills—\nRiding to hounds\nOver the cow-pasture.\n\nMiss Nancy Ellicott smoked\nAnd danced all the modern dances;\nAnd her aunts were not quite sure how they felt about it,\nBut they knew that it was modern.\n\nUpon the glazen shelves kept watch\nMatthew and Waldo, guardians of the faith,\nThe army of unalterable law."
10063,"In the dark we disappear, pure being.\nOur mirror images, impure being.\n\nBeing and becoming (Heidegger), being and\nnothingness (Sartre)—which is purer being?\n\nBeing alone is no way to be: thus\nloneliness is the test of pure being.\n\nNights in love I fell too far or not quite\nfar enough—one pure, one impure being.\n\nClouds, snow, mist, the dragon's breath on water,\nsmoke from fire—a metaphor's pure being.\n\nStillness and more stillness and the light locked\ndeep inside—both pure and impure being.\n\nIs is the verb of being, I the noun—\nor pronoun for the purists of being.\n\nI was, I am, I looked within and saw\nnothing very clearly: purest being."


# Create corpus

In [4]:
corpus = " ".join(df_kaggle_poem_dataset['poem'])
corpus[:10]

'Sunday we '

## Tokenization

In [5]:
corpus = corpus.lower()
tokens = word_tokenize(corpus)
print("Number of tokens in the training set:",len(tokens))

vocab = set(tokens)
print("Vocabulary size:",len(vocab))

Number of tokens in the training set: 3391123
Vocabulary size: 128887


## Train n-grams

In [6]:
from collections import Counter, defaultdict
from nltk.tokenize import word_tokenize

def tokenize(text):
    """Tokenize the input text."""
    return word_tokenize(text)

def count_ngrams(tokens, n):
    """Counts n-grams."""
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    return Counter(ngrams)

def calculate_ngram_probabilities(df, column, n, k=0.00001):
    """Calculates n-gram probabilities."""
    train_tokens = " ".join(df[column]).lower()
    train_tokens = tokenize(train_tokens)
    
    vocab = set(train_tokens)
    V = len(vocab)
    ngram_counts = count_ngrams(train_tokens, n)
    n_minus_one_gram_counts = count_ngrams(train_tokens, n-1)
    ngram_probabilities = defaultdict(float)
    
    for ngram in ngram_counts:
        prefix = ngram[:-1]
        ngram_counts[ngram] += k
        n_minus_one_gram_counts[prefix] += k
        ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)

    return ngram_probabilities

## Example usage

In [7]:
n = 5  # n-gram size
k = 0.00001  # Change this to the desired value for smoothing parameter k

ngram_probabilities = calculate_ngram_probabilities(df_kaggle_poem_dataset, 'poem', n, k)

In [8]:
print(f"Number of {n}-grams:",len(ngram_probabilities))

Number of 5-grams: 3253229


In [9]:
ngram_probabilities

defaultdict(float,
            {('sunday', 'we', 'lay', 'hands', 'on'): 0.4369036384607319,
             ('we', 'lay', 'hands', 'on', 'a'): 0.4369036384607319,
             ('lay', 'hands', 'on', 'a', 'girl'): 0.4369036384607319,
             ('hands', 'on', 'a', 'girl', 'of'): 0.4369036384607319,
             ('on', 'a', 'girl', 'of', 'ten'): 0.4369036384607319,
             ('a', 'girl', 'of', 'ten', 'hand'): 0.4369036384607319,
             ('girl', 'of', 'ten', 'hand', 'on'): 0.4369036384607319,
             ('of', 'ten', 'hand', 'on', 'hand'): 0.4369036384607319,
             ('ten', 'hand', 'on', 'hand', 'on'): 0.4369036384607319,
             ('hand', 'on', 'hand', 'on', 'cornsilk'): 0.4369036384607319,
             ('on', 'hand', 'on', 'cornsilk', 'hair'): 0.4369036384607319,
             ('hand', 'on', 'cornsilk', 'hair', '.'): 0.4369036384607319,
             ('on', 'cornsilk', 'hair', '.', 'we'): 0.4369036384607319,
             ('cornsilk', 'hair', '.', 'we', 'sing'): 0.436

## Perplexity

In [10]:
def calculate_perplexity(test_tokens, ngram_probabilities, n):
    """Calculates the perplexity of a test corpus given n-gram probabilities."""
    log_probability_sum = 0
    ngram_count = 0
    
    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        log_probability_sum += math.log2(ngram_probabilities[ngram])
        ngram_count += 1
    
    average_log_probability = -log_probability_sum / ngram_count
    perplexity = math.pow(2, average_log_probability)
    
    return perplexity

In [11]:
calculate_perplexity(tokens, ngram_probabilities, n)

2.3928231961111854

In [12]:
def greedy_sampling(context, vocab, ngram_probabilities, n, max_length = 50):
    
    sentence = []

    if len(context) < (n-1):
        print("len(context) < n")
        return sentence

    context = context[-(n-1):]
    
    for i in range(max_length):

        probs = dict()
        
        for v in vocab:

            ngram = list(context)
            ngram.append(v)
            ngram = tuple(ngram)
            probs[v] = ngram_probabilities[ngram]

        best_token = max(probs, key=probs.get) # greedy 
        #print(best_v)
        #print(probs[best_v])
        
        if probs[best_token] == 0:
            print("prob = 0")
            return sentence
            
        sentence.append(best_token)
        context = list(context)[1:]
        context.append(best_token)
        context = tuple(context)
            
    return sentence  

In [13]:
context = ['sunday','we','lay','hands','on']
n_values = range(1, 7)  # Iterate n from 1 to 6
for n in n_values:
    print(f"{n}-gram :\n")
    k = 0.00001  # Smoothing parameter k
    
    # Calculate n-gram probabilities
    ngram_probabilities = calculate_ngram_probabilities(df_kaggle_poem_dataset, 'poem', n, k)
    
    # Generate sentence using greedy sampling
    sentence = greedy_sampling(context, vocab, ngram_probabilities, n, max_length=200)
    
    # Print the context and generated sentence
    print(" ".join(context) + " " + " ".join(sentence))
    print("\n")


1-gram :

prob = 0
sunday we lay hands on 


2-gram :

sunday we lay hands on the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and the world , and


3-gram :

sunday we lay hands on the floor , and the world , and the world , and the world , and the world , and the wor