# N-grams

## Import librairies 

In [21]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/akaagi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [17]:
import math
from collections import Counter, defaultdict
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
pd.set_option('display.max_colwidth', None)

## Load dataset

In [5]:
df_kaggle_poem_dataset = pd.read_csv("../data_raw/kaggle_poem_dataset.csv")
df_kaggle_poem_dataset.drop(columns=["Unnamed: 0", "Author", "Title", "Poetry Foundation ID"], inplace=True)
df_kaggle_poem_dataset.rename(columns={"Content": "poem"}, inplace=True)
df_kaggle_poem_dataset.tail(3)

Unnamed: 0,poem
15649,"(A fortune cookie)\nOminous inscrutable Chinese news\nto get just before Christmas,\nconsidering my reasonable health,\nmarriage spicy as moo-goo-gai-pan,\ncareer running like a not-too-old Chevrolet.\nNot bad, considering what can go wrong:\nthe bony finger of Uncle Sam\nmight point out my husband,\nmy own national guard,\nand set him in Afghanistan;\nmy boss could take a personal interest;\nthe pain in my left knee could spread to my right.\nStill, as the old year tips into the new,\nI insist on the infant hope, gooing and kicking\nhis legs in the air. I won't give in\nto the dark, the sub-zero weather, the fog,\nor even the neighbors' Nativity.\nTheir four-year-old has arranged\nhis whole legion of dinosaurs\nso they, too, worship the child,\njoining the cow and sheep. Or else,\nultimate mortals, they've come to eat\nox and camel, Mary and Joseph,\nthen savor the newborn babe."
15650,1\nOur last night in the house was not our last.\nWith two cats in the yard. Our movers took\nthe furniture in the morning.A country where\nthey turned back time.
15651,"If your house\nis a dress\nit’ll fit like\nLos Angeles\nred sun\nburning west,\ndeserts, fields,\nfor certain it will\ndrape even\na boy no less\nboy in disrepair\nwandering from shore\nto crest, others\nmistake his\nsearching for\ndespair, no,\nnever, but\nfor thirst,\ncloaked as\nhe is, warm,\nradiant in a\nhouse dress."


# Create corpus

In [15]:
corpus = " ".join(df_kaggle_poem_dataset['poem'])
corpus[:10]

'Dear Write'

## Tokenization

In [22]:
corpus = corpus.lower()
tokens = word_tokenize(corpus)
print("Number of tokens in the training set:",len(tokens))

vocab = set(tokens)
print("Vocabulary size:",len(vocab))

Number of tokens in the training set: 4757049
Vocabulary size: 141541


## Train n-grams

In [52]:
from collections import Counter, defaultdict
from nltk.tokenize import word_tokenize

def tokenize(text):
    """Tokenize the input text."""
    return word_tokenize(text)

def count_ngrams(tokens, n):
    """Counts n-grams."""
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    return Counter(ngrams)

def calculate_ngram_probabilities(df, column, n, k=0.00001):
    """Calculates n-gram probabilities."""
    train_tokens = " ".join(df[column]).lower()
    train_tokens = tokenize(train_tokens)
    
    vocab = set(train_tokens)
    V = len(vocab)
    ngram_counts = count_ngrams(train_tokens, n)
    n_minus_one_gram_counts = count_ngrams(train_tokens, n-1)
    ngram_probabilities = defaultdict(float)
    
    for ngram in ngram_counts:
        prefix = ngram[:-1]
        ngram_counts[ngram] += k
        n_minus_one_gram_counts[prefix] += k
        ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)

    return ngram_probabilities

## Example usage

In [63]:
n = 5  # n-gram size
k = 0.00001  # Change this to the desired value for smoothing parameter k

ngram_probabilities = calculate_ngram_probabilities(df_kaggle_poem_dataset, 'poem', n, k)

In [64]:
print(f"Number of {n}-grams:",len(ngram_probabilities))

Number of 5-grams: 4636227


In [65]:
ngram_probabilities

defaultdict(float,
            {('dear', 'writers', ',', 'i', '’'): 0.41401495392105725,
             ('writers', ',', 'i', '’', 'm'): 0.41401495392105725,
             (',', 'i', '’', 'm', 'compiling'): 0.0036047743849278464,
             ('i', '’', 'm', 'compiling', 'the'): 0.41401495392105725,
             ('’', 'm', 'compiling', 'the', 'first'): 0.41401495392105725,
             ('m', 'compiling', 'the', 'first', 'in'): 0.41401495392105725,
             ('compiling', 'the', 'first', 'in', 'what'): 0.41401495392105725,
             ('the', 'first', 'in', 'what', 'i'): 0.41401495392105725,
             ('first', 'in', 'what', 'i', 'hope'): 0.41401495392105725,
             ('in', 'what', 'i', 'hope', 'is'): 0.41401495392105725,
             ('what', 'i', 'hope', 'is', 'a'): 0.41401495392105725,
             ('i', 'hope', 'is', 'a', 'series'): 0.41401495392105725,
             ('hope', 'is', 'a', 'series', 'of'): 0.41401495392105725,
             ('is', 'a', 'series', 'of', 'publicati

## Perplexity

In [67]:
def calculate_perplexity(test_tokens, ngram_probabilities, n):
    """Calculates the perplexity of a test corpus given n-gram probabilities."""
    log_probability_sum = 0
    ngram_count = 0
    
    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        log_probability_sum += math.log2(ngram_probabilities[ngram])
        ngram_count += 1
    
    average_log_probability = -log_probability_sum / ngram_count
    perplexity = math.pow(2, average_log_probability)
    
    return perplexity

In [68]:
calculate_perplexity(tokens, ngram_probabilities, n)

2.592118637260716

In [71]:
def greedy_sampling(context, vocab, ngram_probabilities, n, max_length = 50):
    
    sentence = []

    if len(context) < (n-1):
        print("len(context) < n")
        return sentence

    context = context[-(n-1):]
    
    for i in range(max_length):

        probs = dict()
        
        for v in vocab:

            ngram = list(context)
            ngram.append(v)
            ngram = tuple(ngram)
            probs[v] = ngram_probabilities[ngram]

        best_token = max(probs, key=probs.get) # greedy 
        #print(best_v)
        #print(probs[best_v])
        
        if probs[best_token] == 0:
            print("prob = 0")
            return sentence
            
        sentence.append(best_token)
        context = list(context)[1:]
        context.append(best_token)
        context = tuple(context)
            
    return sentence  

In [73]:
context = ['the', 'one', 'at', 'the']
sentence = greedy_sampling(context, vocab, ngram_probabilities, n, max_length = 200)
print(" ".join(context) + " " +  " ".join(sentence))

the one at the end of the world , and the world is measured out in certain parts , and heaven by five great zones is taken up entire : one glowing with sundazzle and fierce heat ; and far away on either side the arctics , frozen with ice and rain , cerulean ; and , in between , two zones for sick mankind : through each of these a slanting path is cut where pass in line the zodiacal stars . northward the steep world rises to scythia and south of libya descends , where black styx and the lowest of the dead look on . in the north sky the snake glides like a river winding about the great and little bear— those stars that fear forever the touch of ocean ; southward they say profound night , mother of furies , sits tight-lipped among the crowding shades , or thence aurora draws the daylight back ; and where the east exhales the yellow morning , reddening evening lights her stars at last . iv . as for the winter , when the freezing rains confine the farmer , he may employ himself in preparati