<a href="https://colab.research.google.com/github/ParvG2005/Parv/blob/main/ngram%20from%20scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import math
import re
from collections import Counter
from nltk.util import ngrams
from tqdm.auto import tqdm
from nltk.tokenize import RegexpTokenizer

def load_tokens(filename):
    """Read a text file and return its lower-cased word tokens.

    The file is decoded as UTF-8 and undecodable bytes are silently dropped
    (``errors="ignore"``).  A token is a maximal run of ASCII letters or a
    maximal run of characters in the range U+0900-U+097F; everything else
    (digits, punctuation, whitespace) acts as a separator.

    Args:
        filename: Path of the file to read.

    Returns:
        List of token strings in file order.
    """
    token_pattern = r'[a-zA-Z]+|[\u0900-\u097F]+'
    with open(filename, encoding="utf-8", errors="ignore") as handle:
        lowered = handle.read().lower()
    return RegexpTokenizer(token_pattern).tokenize(lowered)

# Minimum number of training occurrences for a token to stay in the vocabulary.
min_count = 250

# Raw token streams for the three splits.
train_tokens = load_tokens("wiki_movie_plots_deduped.csv")
val_tokens = load_tokens("english_2500.txt")
test_tokens = load_tokens("english_test.txt")

# Vocabulary: training tokens seen at least `min_count` times, in first-seen
# order (Counter keeps insertion order).
unigrams_train = Counter(train_tokens)
words = [token for token, seen in unigrams_train.items() if seen >= min_count]

# Placeholder that stands in for every out-of-vocabulary token.
# NOTE(review): "unknown" is also an ordinary word the tokenizer can emit, so
# genuine occurrences cannot be told apart from the placeholder - confirm
# that this is intended.
unknown = "unknown"
if unknown not in words:
    words.append(unknown)

def replace_unknown(tokens, vocab, unk_token=None):
    """Replace every token that is not in ``vocab`` with a placeholder.

    Args:
        tokens: Iterable of token strings.
        vocab: Collection of in-vocabulary tokens.  Lists and tuples are
            converted to a set once so each membership test is O(1) instead
            of a linear scan per token.
        unk_token: Placeholder to substitute.  When ``None`` (the default)
            the module-level ``unknown`` value is used.

    Returns:
        A new list with the same length and order as ``tokens``.
    """
    if unk_token is None:
        unk_token = unknown
    # Hash-based containers already give O(1) lookups; only copy the others.
    known = vocab if isinstance(vocab, (set, frozenset, dict)) else set(vocab)
    return [w if w in known else unk_token for w in tokens]
# Map out-of-vocabulary tokens in every split to the placeholder.
train_tokens = replace_unknown(train_tokens, words)
val_tokens = replace_unknown(val_tokens, words)
test_tokens = replace_unknown(test_tokens, words)

# N-gram counts come from the (placeholder-substituted) training text only.
clean_text = train_tokens
unigrams = Counter(clean_text)
bigrams = Counter(ngrams(clean_text, 2))
trigrams = Counter(ngrams(clean_text, 3))

# The working vocabulary is rebuilt as the distinct tokens that actually
# remain in the training text.
words = list(set(clean_text))

# Unigram relative frequencies, aligned index-for-index with `words`.
frequency = np.array([unigrams[token] for token in words])
freq = frequency / np.sum(frequency)
def bigram_probs(initial, words):
    """Return a smoothed next-word distribution over ``words`` given ``initial``.

    Counts are read from the module-level ``unigrams`` and ``bigrams``
    counters and smoothed by adding 0.11 to every bigram count.

    Args:
        initial: The preceding token.
        words: Candidate next tokens; the result is aligned with this sequence.

    Returns:
        NumPy array of probabilities, one per entry of ``words``.
    """
    add_k = 0.11
    history_total = unigrams[initial]
    pair_counts = np.array([bigrams[(initial, candidate)] for candidate in words])
    smoothed = (pair_counts + add_k) / (history_total + add_k * len(words))
    # Renormalise so the vector sums to 1 whatever the denominator above was.
    return smoothed / smoothed.sum()

def tri_prob(initial_1, initial_2, words):
    """Return a smoothed next-word distribution given a two-token history.

    Counts are read from the module-level ``bigrams`` and ``trigrams``
    counters and smoothed by adding 0.11 to every trigram count.

    Args:
        initial_1: The token two positions back.
        initial_2: The token immediately before the one being predicted.
        words: Candidate next tokens; the result is aligned with this sequence.

    Returns:
        NumPy array of probabilities, one per entry of ``words``.
    """
    add_k = 0.11
    history_total = bigrams[(initial_1, initial_2)]
    triple_counts = np.array(
        [trigrams[(initial_1, initial_2, candidate)] for candidate in words]
    )
    smoothed = (triple_counts + add_k) / (history_total + add_k * len(words))
    # Renormalise so the vector sums to 1 whatever the denominator above was.
    return smoothed / smoothed.sum()

def sentence(words,clean_text, freq, n=10):
    """Sample a sentence of ``n`` words from the n-gram model.

    The first word is drawn from the unigram distribution ``freq``, the
    second from ``bigram_probs`` and every later word from ``tri_prob``
    conditioned on the two preceding words.

    Args:
        words: Vocabulary sequence to sample from.
        clean_text: Accepted for compatibility with existing callers; it is
            not used by this function.
        freq: Probabilities aligned with ``words`` for the first draw.
        n: Number of words to generate.  Values of 0 or less give an empty
            string and 1 gives a single word (previously two words were
            always produced).

    Returns:
        The generated words joined by single spaces.
    """
    if n <= 0:
        return ""

    string1 = np.random.choice(words, p=freq)
    if n == 1:
        return " ".join([string1])

    string2 = np.random.choice(words, p=bigram_probs(string1, words))
    sol = [string1, string2]

    for _ in tqdm(range(n - 2)):
        tri_dist = tri_prob(string1, string2, words)
        new = np.random.choice(words, p=tri_dist)
        sol.append(new)
        # Slide the two-word history window forward by one token.
        string1, string2 = string2, new
    return " ".join(sol)
def perplexity(sentence_tokens, words, unigrams, bigrams, trigrams, k=1):
    """Compute the perplexity of a token sequence under an add-k trigram model.

    The first token is scored with a unigram estimate, the second with a
    bigram estimate and every later token with a trigram estimate.  Each
    estimate is ``(count + k) / (context_count + k * len(words))``.

    Args:
        sentence_tokens: Sequence of tokens to score.
        words: Vocabulary; only its length is used.
        unigrams: Mapping from token to count.
        bigrams: Mapping from ``(w1, w2)`` to count.
        trigrams: Mapping from ``(w1, w2, w3)`` to count.
        k: Additive smoothing constant (default 1, i.e. add-one smoothing).

    Returns:
        ``exp(-log_prob / N)`` for the sequence, or ``float("inf")`` when
        ``sentence_tokens`` is empty (instead of raising ``IndexError``).
    """
    N = len(sentence_tokens)
    if N == 0:
        return float("inf")

    # Invariants shared by every term below.
    smoothing_mass = k * len(words)
    total_tokens = sum(unigrams.values())

    w1 = sentence_tokens[0]
    log_prob = math.log((unigrams.get(w1, 0) + k) / (total_tokens + smoothing_mass))

    if N > 1:
        w2 = sentence_tokens[1]
        context = unigrams.get(w1, 0)
        log_prob += math.log((bigrams.get((w1, w2), 0) + k) / (context + smoothing_mass))

    for i in range(2, N):
        w_prev2, w_prev1, w_curr = sentence_tokens[i - 2], sentence_tokens[i - 1], sentence_tokens[i]
        context = bigrams.get((w_prev2, w_prev1), 0)
        p = (trigrams.get((w_prev2, w_prev1, w_curr), 0) + k) / (context + smoothing_mass)
        log_prob += math.log(p)

    return math.exp(-log_prob / N)
# Sample a 10-word sentence from the model and show it.
generated = sentence(words, clean_text, freq, n=10)
print("Generated Sentence: ", generated)

# Score the first 300 validation tokens and the first 150 test tokens.
pp_val = perplexity(val_tokens[:300], words, unigrams, bigrams, trigrams)
pp_test = perplexity(test_tokens[:150], words, unigrams, bigrams, trigrams)

# Score the generated sentence itself after splitting it back into words.
sentence_tokens = generated.split()
pp = perplexity(sentence_tokens, words, unigrams, bigrams, trigrams)

print("Validation Perplexity:", pp_val)
print("Test Perplexity:", pp_test)
print("Perplexity:", pp)

KeyboardInterrupt: 

In [None]:
import numpy as np
import math
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

# -----------------------------
# Load datasets
# -----------------------------
def load_tokens(filename):
    """Read ``filename`` and return its lower-cased word tokens.

    The file is decoded as UTF-8 with undecodable bytes dropped.  Tokens are
    maximal runs of ASCII letters or of characters in U+0900-U+097F; all
    other characters separate tokens.
    """
    # English + Hindi
    word_pattern = r'[a-zA-Z]+|[\u0900-\u097F]+'
    with open(filename, encoding="utf-8", errors="ignore") as source:
        raw = source.read()
    return RegexpTokenizer(word_pattern).tokenize(raw.lower())

# NOTE(review): validation reads "english_test.txt" while test reads
# "english_30000.txt"; the other cells use "english_test.txt" as the test
# split - confirm the assignment is intentional.
train_tokens = load_tokens("wiki_movie_plots_deduped.csv")
val_tokens = load_tokens("english_test.txt")
test_tokens = load_tokens("english_30000.txt")

# -----------------------------
# Build Vocabulary from TRAIN only
# -----------------------------
# Keep training tokens that occur at least `min_count` times, in first-seen
# order.
min_count = 600
unigrams_train = Counter(train_tokens)
words = [token for token, seen in unigrams_train.items() if seen >= min_count]

# Add UNK
# The tokenizer only emits letter runs, so "<UNK>" cannot clash with a real
# token.
UNK = "<UNK>"
if UNK not in words:
    words.append(UNK)

def replace_unk(tokens, vocab, unk_token=None):
    """Replace every token that is not in ``vocab`` with the UNK placeholder.

    Args:
        tokens: Iterable of token strings.
        vocab: Collection of in-vocabulary tokens.  Lists and tuples are
            converted to a set once so each membership test is O(1) instead
            of a linear scan per token.
        unk_token: Placeholder to substitute.  When ``None`` (the default)
            the module-level ``UNK`` value is used.

    Returns:
        A new list with the same length and order as ``tokens``.
    """
    if unk_token is None:
        unk_token = UNK
    # Hash-based containers already give O(1) lookups; only copy the others.
    known = vocab if isinstance(vocab, (set, frozenset, dict)) else set(vocab)
    return [w if w in known else unk_token for w in tokens]

# Map out-of-vocabulary tokens in every split to UNK.
train_tokens = replace_unk(train_tokens, words)
val_tokens = replace_unk(val_tokens, words)
test_tokens = replace_unk(test_tokens, words)

# -----------------------------
# Build N-gram Counts from TRAIN
# -----------------------------
# Counted highest order first; the three counters are independent of each
# other.
trigrams = Counter(ngrams(train_tokens, 3))
bigrams = Counter(ngrams(train_tokens, 2))
unigrams = Counter(train_tokens)

# -----------------------------
# Probability Functions
# -----------------------------
def bigram_probs(initial, words):
    """Return add-0.11 smoothed probabilities of each word following ``initial``.

    Reads the module-level ``unigrams`` and ``bigrams`` counters.  The
    returned NumPy array is aligned with ``words`` and renormalised to sum
    to 1.
    """
    j = 0.11
    seen_as_context = unigrams[initial]
    follower_counts = np.array([bigrams[initial, nxt] for nxt in words])
    denominator = seen_as_context + j * len(words)
    dist = (follower_counts + j) / denominator
    return dist / dist.sum()

def tri_prob(initial_1, initial_2, words):
    """Return add-0.11 smoothed probabilities of each word after a two-word history.

    Reads the module-level ``bigrams`` and ``trigrams`` counters.  The
    returned NumPy array is aligned with ``words`` and renormalised to sum
    to 1.
    """
    k = 0.11
    seen_as_context = bigrams[(initial_1, initial_2)]
    follower_counts = np.array([trigrams[initial_1, initial_2, nxt] for nxt in words])
    denominator = seen_as_context + k * len(words)
    dist = (follower_counts + k) / denominator
    return dist / dist.sum()

# -----------------------------
# Perplexity Function
# -----------------------------
def perplexity(sentence_tokens, words, unigrams, bigrams, trigrams):
    """Compute the perplexity of a token sequence under an add-one trigram model.

    The first token uses a unigram estimate, the second a bigram estimate
    and every later token a trigram estimate; each estimate is
    ``(count + 1) / (context_count + len(words))``.

    Args:
        sentence_tokens: Sequence of tokens to score.
        words: Vocabulary; only its length is used.
        unigrams: Mapping from token to count.
        bigrams: Mapping from ``(w1, w2)`` to count.
        trigrams: Mapping from ``(w1, w2, w3)`` to count.

    Returns:
        ``exp(-log_prob / N)``, or ``float("inf")`` for an empty sequence.
    """
    token_count = len(sentence_tokens)
    if token_count == 0:
        return float("inf")

    # Opening token: unigram estimate over the whole training mass.
    first = sentence_tokens[0]
    total_log = 0.0
    total_log += math.log(
        (unigrams.get(first, 0) + 1) / (sum(unigrams.values()) + len(words))
    )

    # Second token: bigram estimate conditioned on the opening token.
    if token_count > 1:
        second = sentence_tokens[1]
        pair_hits = bigrams.get((first, second), 0)
        total_log += math.log((pair_hits + 1) / (unigrams.get(first, 0) + len(words)))

    # Everything else: trigram estimate over a sliding three-token window.
    for pos in range(2, token_count):
        history = (sentence_tokens[pos - 2], sentence_tokens[pos - 1])
        triple_hits = trigrams.get(history + (sentence_tokens[pos],), 0)
        history_hits = bigrams.get(history, 0)
        total_log += math.log((triple_hits + 1) / (history_hits + len(words)))

    return math.exp(-total_log / token_count)

# -----------------------------
# Evaluate
# -----------------------------
# Score the first 200 tokens of each held-out split, then report both.
pp_test = perplexity(test_tokens[:200], words, unigrams, bigrams, trigrams)
pp_val = perplexity(val_tokens[:200], words, unigrams, bigrams, trigrams)

print("Validation Perplexity:", pp_val)
print("Test Perplexity:", pp_test)


Validation Perplexity: 127.96149958426234
Test Perplexity: 90.05196419946563


In [None]:
import numpy as np
import math
import re
from collections import Counter
from nltk.util import ngrams
from tqdm.auto import tqdm
from nltk.tokenize import RegexpTokenizer

def load_tokens(filename):
    """Tokenize the lower-cased contents of ``filename``.

    The file is read as UTF-8, ignoring bytes that fail to decode.  Tokens
    are maximal runs of ASCII letters or of characters in U+0900-U+097F.

    Args:
        filename: Path of the file to read.

    Returns:
        List of token strings in file order.
    """
    splitter = RegexpTokenizer(r'[a-zA-Z]+|[\u0900-\u097F]+')
    with open(filename, encoding="utf-8", errors="ignore") as stream:
        contents = stream.read()
    lowered = contents.lower()
    return splitter.tokenize(lowered)

# Minimum number of training occurrences for a token to stay in the vocabulary.
min_count = 150

# Raw token streams for the three splits.
train_tokens = load_tokens("wiki_movie_plots_deduped.csv")
val_tokens = load_tokens("english_30000.txt")
test_tokens = load_tokens("english_test.txt")

# Vocabulary: sufficiently frequent training tokens, in first-seen order.
unigrams_train = Counter(train_tokens)
words = [token for token, seen in unigrams_train.items() if seen >= min_count]

# Placeholder for out-of-vocabulary tokens.
# NOTE(review): "unknown" is also an ordinary word the tokenizer can emit, so
# genuine occurrences cannot be told apart from the placeholder - confirm
# that this is intended.
unknown = "unknown"
if unknown not in words:
    words.append(unknown)

def replace_unknown(tokens, vocab, unk_token=None):
    """Replace every token that is not in ``vocab`` with a placeholder.

    Args:
        tokens: Iterable of token strings.
        vocab: Collection of in-vocabulary tokens.  Lists and tuples are
            converted to a set once so each membership test is O(1) instead
            of a linear scan per token.
        unk_token: Placeholder to substitute.  When ``None`` (the default)
            the module-level ``unknown`` value is used.

    Returns:
        A new list with the same length and order as ``tokens``.
    """
    if unk_token is None:
        unk_token = unknown
    # Hash-based containers already give O(1) lookups; only copy the others.
    known = vocab if isinstance(vocab, (set, frozenset, dict)) else set(vocab)
    return [w if w in known else unk_token for w in tokens]
# Map out-of-vocabulary tokens in every split to the placeholder.
train_tokens = replace_unknown(train_tokens, words)
val_tokens = replace_unknown(val_tokens, words)
test_tokens = replace_unknown(test_tokens, words)

# N-gram counts come from the (placeholder-substituted) training text only.
clean_text = train_tokens
trigrams = Counter(ngrams(clean_text, 3))
bigrams = Counter(ngrams(clean_text, 2))
unigrams = Counter(clean_text)

# Working vocabulary: the distinct tokens left in the training text.
words = list(set(clean_text))

# Unigram relative frequencies, aligned index-for-index with `words`.
frequency = np.array([unigrams[token] for token in words])
freq = frequency / np.sum(frequency)
def bigram_probs(initial, words):
    """Return the smoothed distribution of the word that follows ``initial``.

    Bigram counts from the module-level ``bigrams`` counter get 0.11 added
    to each entry; the context count comes from the module-level
    ``unigrams`` counter.

    Args:
        initial: The preceding token.
        words: Candidate next tokens.

    Returns:
        NumPy array aligned with ``words`` that sums to 1.
    """
    pseudo_count = 0.11
    context_hits = unigrams[initial]
    raw = [bigrams[pair] for pair in ((initial, candidate) for candidate in words)]
    weights = (np.array(raw) + pseudo_count) / (context_hits + pseudo_count * len(words))
    return weights / weights.sum()

def tri_prob(initial_1, initial_2, words):
    """Return the smoothed distribution of the word after a two-token history.

    Trigram counts from the module-level ``trigrams`` counter get 0.11 added
    to each entry; the context count comes from the module-level ``bigrams``
    counter.

    Args:
        initial_1: The token two positions back.
        initial_2: The token immediately before the one being predicted.
        words: Candidate next tokens.

    Returns:
        NumPy array aligned with ``words`` that sums to 1.
    """
    pseudo_count = 0.11
    history = (initial_1, initial_2)
    context_hits = bigrams[history]
    raw = [trigrams[history + (candidate,)] for candidate in words]
    weights = (np.array(raw) + pseudo_count) / (context_hits + pseudo_count * len(words))
    return weights / weights.sum()

def perplexity(sentence_tokens, words, unigrams, bigrams, trigrams, k=0.11):
    """Compute the perplexity of a token sequence under an add-k trigram model.

    The first token is scored with a unigram estimate, the second with a
    bigram estimate and every later token with a trigram estimate.  Each
    estimate is ``(count + k) / (context_count + k * len(words))``.

    Args:
        sentence_tokens: Sequence of tokens to score.
        words: Vocabulary; only its length is used.
        unigrams: Mapping from token to count.
        bigrams: Mapping from ``(w1, w2)`` to count.
        trigrams: Mapping from ``(w1, w2, w3)`` to count.
        k: Additive smoothing constant (default 0.11, the value that was
            previously hard-coded in each term).

    Returns:
        ``exp(-log_prob / N)`` for the sequence, or ``float("inf")`` when
        ``sentence_tokens`` is empty (instead of raising ``IndexError``).
    """
    N = len(sentence_tokens)
    if N == 0:
        return float("inf")

    # Invariants shared by every term below.
    smoothing_mass = k * len(words)
    total_tokens = sum(unigrams.values())

    w1 = sentence_tokens[0]
    log_prob = math.log((unigrams.get(w1, 0) + k) / (total_tokens + smoothing_mass))

    if N > 1:
        w2 = sentence_tokens[1]
        context = unigrams.get(w1, 0)
        log_prob += math.log((bigrams.get((w1, w2), 0) + k) / (context + smoothing_mass))

    for i in range(2, N):
        w_prev2, w_prev1, w_curr = sentence_tokens[i - 2], sentence_tokens[i - 1], sentence_tokens[i]
        context = bigrams.get((w_prev2, w_prev1), 0)
        p = (trigrams.get((w_prev2, w_prev1, w_curr), 0) + k) / (context + smoothing_mass)
        log_prob += math.log(p)

    return math.exp(-log_prob / N)

# Score the first 300 validation tokens and the first 150 test tokens, then
# report both.
pp_test = perplexity(test_tokens[:150], words, unigrams, bigrams, trigrams)
pp_val = perplexity(val_tokens[:300], words, unigrams, bigrams, trigrams)

print("Validation Perplexity:", pp_val)
print("Test Perplexity:", pp_test)

Validation Perplexity: 242.42640692780338
Test Perplexity: 348.08969642940724
