<a href="https://colab.research.google.com/github/Raka7317/Adventure_project-web-d-/blob/main/lab_2(n_gram).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import re
import math
from collections import Counter

# -----------------------------
# 1. TEXT PREPROCESSING
# -----------------------------
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# -----------------------------
# 2. TOKENIZATION
# -----------------------------
def tokenize(text):
    return text.split()


# -----------------------------
# 3. GENERATE N-GRAMS
# -----------------------------
def generate_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]


# -----------------------------
# 4. PROBABILITY WITHOUT SMOOTHING
# -----------------------------
def ngram_prob_no_smoothing(ngrams, lower_ngrams):
    ngram_count = Counter(ngrams)
    lower_count = Counter(lower_ngrams)

    probs = {}
    for ngram in ngram_count:
        prefix = ngram[:-1]
        probs[ngram] = ngram_count[ngram] / lower_count[prefix]
    return probs


# -----------------------------
# 5. PROBABILITY WITH LAPLACE SMOOTHING
# -----------------------------
def ngram_prob_laplace(ngrams, lower_ngrams, vocab_size):
    ngram_count = Counter(ngrams)
    lower_count = Counter(lower_ngrams)

    probs = {}
    for ngram in ngram_count:
        prefix = ngram[:-1]
        probs[ngram] = (ngram_count[ngram] + 1) / (lower_count[prefix] + vocab_size)
    return probs


# -----------------------------
# 6. PERPLEXITY CALCULATION
# -----------------------------
def calculate_perplexity(ngrams, probabilities):
    N = len(ngrams)
    log_sum = 0

    for ngram in ngrams:
        prob = probabilities.get(ngram, 1e-10)  # avoid log(0)
        log_sum += math.log(prob)

    return math.exp(-log_sum / N)


# -----------------------------
# 7. MAIN EXECUTION
# -----------------------------
text = "Data science is fun and data science is powerful"

# Preprocessing
clean_text = preprocess_text(text)

# Tokenization
tokens = tokenize(clean_text)
vocab_size = len(set(tokens))

# -------- BIGRAM --------
bigrams = generate_ngrams(tokens, 2)
unigrams = [(token,) for token in tokens]

bigram_probs_no = ngram_prob_no_smoothing(bigrams, unigrams)
bigram_pp_no = calculate_perplexity(bigrams, bigram_probs_no)

bigram_probs_la = ngram_prob_laplace(bigrams, unigrams, vocab_size)
bigram_pp_la = calculate_perplexity(bigrams, bigram_probs_la)

# -------- TRIGRAM --------
trigrams = generate_ngrams(tokens, 3)
bigram_prefixes = generate_ngrams(tokens, 2)

trigram_probs_no = ngram_prob_no_smoothing(trigrams, bigram_prefixes)
trigram_pp_no = calculate_perplexity(trigrams, trigram_probs_no)

trigram_probs_la = ngram_prob_laplace(trigrams, bigram_prefixes, vocab_size)
trigram_pp_la = calculate_perplexity(trigrams, trigram_probs_la)


# -----------------------------
# 8. OUTPUT
# -----------------------------
print("Tokens:", tokens)

print("\nBIGRAM PERPLEXITY")
print("Without smoothing:", bigram_pp_no)
print("With Laplace smoothing:", bigram_pp_la)

print("\nTRIGRAM PERPLEXITY")
print("Without smoothing:", trigram_pp_no)
print("With Laplace smoothing:", trigram_pp_la)


Tokens: ['data', 'science', 'is', 'fun', 'and', 'data', 'science', 'is', 'powerful']

BIGRAM PERPLEXITY
Without smoothing: 1.189207115002721
With Laplace smoothing: 3.158758147025058

TRIGRAM PERPLEXITY
Without smoothing: 1.2190136542044754
With Laplace smoothing: 3.364298418765503


In [4]:
from collections import defaultdict

In [7]:
corpus={ "<s> NLP is fun </s>",
        "<s> NLP is powrful </s>"}

unigram=defaultdict(int)

for sentence in corpus:
    words=sentence.split()
    for word in words:
        unigram[word]+=1

print("unigram couts:")
for word, count in unigram.items():
  print(word, ":", count)



print("\n vacabulary size : ",len(unigram))

unigram couts:
<s> : 2
NLP : 2
is : 2
fun : 1
</s> : 2
powrful : 1

 vacabulary size :  6


In [16]:
from collections import defaultdict
corpus={ "<s>i love NLP </s>",
        "<s> i love ML </s>"}

bigram=defaultdict(int)

for sentence in corpus:
    words=sentence.split()
    for i in range(len(words)-1):
        bg=(words[i],words[i+1])
        bigram[bg]+=1

print("birgram counts:")
for bg, count in bigram.items():
  print(bg, ":", count)





print("\n vacabulary size : ",len(bigram))


birgram counts:
('<s>i', 'love') : 1
('love', 'NLP') : 1
('NLP', '</s>') : 1
('<s>', 'i') : 1
('i', 'love') : 1
('love', 'ML') : 1
('ML', '</s>') : 1

 vacabulary size :  7


In [17]:
from collections import defaultdict

corpus={ "<s> data science is fun </s>",
        "<s>  data science is useful</s>"}

trigram=defaultdict(int)

for sentence in corpus:
    words=sentence.split()
    for i in range(len(words)-2):
        tg=(words[i],words[i+1],words[i+2])
        trigram[tg]+=1

print("trigram  counts:")
for tg, count in bigram.items():
  print(tg, ":", count)





print("\n vacabulary size : ",len(bigram))

trigram  counts:
('<s>i', 'love') : 1
('love', 'NLP') : 1
('NLP', '</s>') : 1
('<s>', 'i') : 1
('i', 'love') : 1
('love', 'ML') : 1
('ML', '</s>') : 1

 vacabulary size :  7


In [22]:
from  collection import defaultdict
import math
print(" enter training sentences (use<s> and </s>)")
print(" enter one sentence per line")
print(" type end to finish \n")
corpus=[]
while True:
   line =input()
   if line.strip().upper()==END:
      break
   corpus.append(line.strip().split)

print("\n training corpus: ")
for s in corpus:
  print(s)

ModuleNotFoundError: No module named 'collection'

In [23]:
from collections import defaultdict
import math

# FIX 1: Define the stop variable (sentinel) used in the loop
END_SENTINEL = "END"

print("Enter training sentences (use <s> and </s> markers for start/end).")
print("Enter one sentence per line.")
print(f"Type '{END_SENTINEL}' to finish input. \n")

corpus = []
while True:
    line = input()

    # 4 SPACES: Check the stop condition
    # FIX 2: Use the defined END_SENTINEL instead of END
    if line.strip().upper() == END_SENTINEL:
        break

    # 4 SPACES: This line is correctly inside the while loop now
    # FIX 3: Add parentheses to .split() to make it a function call
    if line.strip(): # Only process non-empty lines
        corpus.append(line.strip().split())

print("\n--- Training Corpus (Tokenized) ---")
for s in corpus:
    print(s)

Enter training sentences (use <s> and </s> markers for start/end).
Enter one sentence per line.
Type 'END' to finish input. 



KeyboardInterrupt: Interrupted by user

In [26]:
from  collection import defaultdict
import math
print(" enter training sentences (use<s> and </s>)")
print(" enter one sentence per line")
print(" type end to finish \n")
corpus=[]
while True:
   line =input()
   if line.strip().upper()==END:
      break
   corpus.append(line.strip().split)

print("\n training corpus: ")
for s in corpus:
  print(s)


test_sentence=input("\n enter test sentence: ")
test_tokens=test_sentence.split()


unigram=defaultdict(int)
bigram=defaultdict(int)
trigram=defaultdict(int)

vocab=set()

for sent in corpus:
   for i in range(len(sent)):
     unigram[sent[i]]+=1
     vocab.add(sent[i])

     if i<len(sent)-1:
       bigram[(sent[i], sent[i+1])]+=1

     if i<len(sent)-1:
       bigram[(sent[i], sent[i+1],sent[i+2])]+=1

V=len(vocab)

ImportError: cannot import name 'defaultdict' from 'collection' (/usr/local/lib/python3.12/dist-packages/collection/__init__.py)

In [25]:
!pip install collection

Collecting collection
  Downloading collection-0.1.6.tar.gz (5.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: collection
  Building wheel for collection (setup.py) ... [?25l[?25hdone
  Created wheel for collection: filename=collection-0.1.6-py3-none-any.whl size=5098 sha256=814bedff156a193cf9ef032804c071a73af73cdb1d06f373c9cb6f85f38e8d8e
  Stored in directory: /root/.cache/pip/wheels/12/bb/2e/e430efbb8e7a845a40292527c78c51d201db424b763ae2ccdb
Successfully built collection
Installing collected packages: collection
Successfully installed collection-0.1.6


In [32]:
from collections import defaultdict
import math

# Define the stop word for dynamic input
END_SENTINEL = "END"

print("Enter training sentences (use <s> and </s> markers).")
print("Enter one sentence per line.")
print(f"Type '{END_SENTINEL}' to finish input. \n")

corpus = []
while True:
    line = input()

    # 1. Check for the stop condition
    if line.strip().upper() == END_SENTINEL:
        break

    # 2. Tokenize and append (FIX: added parentheses to .split())
    if line.strip():
        corpus.append(line.strip().split())

print("\n--- Training Corpus (Tokenized) ---")
for s in corpus:
    print(s)

# Get test sentence input
test_sentence_raw = input("\n Enter test sentence: ")
test_tokens = test_sentence_raw.strip().split()


# --- N-GRAM COUNTING ---
unigram = defaultdict(int)
bigram = defaultdict(int)
trigram = defaultdict(int)
vocab = set()

for sent in corpus:
    # We iterate up to len(sent)
    for i in range(len(sent)):

        # 1. Unigram Count and Vocabulary
        unigram[sent[i]] += 1
        vocab.add(sent[i])

        # 2. Bigram Count (W_i, W_i+1)
        # We need at least one more word, so stop one token early
        if i < len(sent) - 1:
            bigram[(sent[i], sent[i+1])] += 1

        # 3. Trigram Count (W_i, W_i+1, W_i+2)
        # We need at least two more words, so stop two tokens early
        if i < len(sent) - 2:
            # FIX: Used 'trigram' dictionary instead of 'bigram'
            trigram[(sent[i], sent[i+1], sent[i+2])] += 1

V = len(vocab)

# --- Output Results ---
print("\n" + "=" * 40)
print(f"| {'N-GRAM COUNTING RESULTS':^38} |")
print("=" * 40)
print(f"| {'Vocabulary Size (V)':<25} | {V:>10} |")
print("-" * 40)

print("\n### Unigram Counts ###")
print(unigram)

print("\n### Bigram Counts ###")
print(bigram)

print("\n### Trigram Counts ###")

print(trigram)



# laplace smoothing

def bigram_log_prob(sentence):
  logp=0.0
  for i in range(len(sentence)-1):
   w1,w2=sentence[i],sentence[i+1]
   prob=(bigram[(w1,w2)]+1)/(unigram[w1]+V)
  return logp


def trigram_log_prob(sentence):
  logp: 0.0
  for i in range(len(sentence)-2):
    w1,w2,w3=sentence[i],sentence[i+1],sentence[i+2]
    prob=(trigram[(w1,w2,w3)]+1)/(bigram[(w1,w2)]+V)
    logp+=math.log(prob)
  return logp



  def perplexity(logp,sentence):
    N=len(sentence)-1
    return math.exp(-logp/N)







  #computer perplexity


  pp_bigram=perplexity(bigram_log_prob(test_sentence),test_sentence)
  pp_trigram=perplexity(trigram_log_prob(test_sentence),test_sentence)



  #output

  print("\n result\n")
  print("vacabulary size |v| = ",V)
  print("bigram perplexity : ", round(pp_bigram,4))
  print("trigram perplexity : ", round(pp_trigram,4))



  print("\n model comparison \n")

  if pp_trigram<pp_bigram:
      print("trigram model is better")
  else:
    print("bigram model is better.... ")



Enter training sentences (use <s> and </s> markers).
Enter one sentence per line.
Type 'END' to finish input. 

<s> my name is rakesh</s>
<s>rakesh is good </s>
END

--- Training Corpus (Tokenized) ---
['<s>', 'my', 'name', 'is', 'rakesh</s>']
['<s>rakesh', 'is', 'good', '</s>']

 Enter test sentence: rakesh is master

|        N-GRAM COUNTING RESULTS         |
| Vocabulary Size (V)       |          8 |
----------------------------------------

### Unigram Counts ###
defaultdict(<class 'int'>, {'<s>': 1, 'my': 1, 'name': 1, 'is': 2, 'rakesh</s>': 1, '<s>rakesh': 1, 'good': 1, '</s>': 1})

### Bigram Counts ###
defaultdict(<class 'int'>, {('<s>', 'my'): 1, ('my', 'name'): 1, ('name', 'is'): 1, ('is', 'rakesh</s>'): 1, ('<s>rakesh', 'is'): 1, ('is', 'good'): 1, ('good', '</s>'): 1})

### Trigram Counts ###
defaultdict(<class 'int'>, {('<s>', 'my', 'name'): 1, ('my', 'name', 'is'): 1, ('name', 'is', 'rakesh</s>'): 1, ('<s>rakesh', 'is', 'good'): 1, ('is', 'good', '</s>'): 1})


In [33]:
 from collections import defaultdict
import math
from itertools import chain

# Define the stop word for dynamic input
END_SENTINEL = "END"
SMOOTHING_K = 1 # k=1 for Add-One (Laplace) Smoothing

print("Enter training sentences (use <s> and </s> markers).")
print("Enter one sentence per line.")
print(f"Type '{END_SENTINEL}' to finish input. \n")

corpus = []
while True:
    line = input()

    if line.strip().upper() == END_SENTINEL:
        break

    if line.strip():
        corpus.append(line.strip().split())

print("\n--- Training Corpus (Tokenized) ---")
for s in corpus:
    print(s)

# Get test sentence input and tokenize (ensure to add start/end markers later)
test_sentence_raw = input("\n Enter test sentence: ")
test_tokens_raw = test_sentence_raw.strip().split()


# --- N-GRAM COUNTING (Training Data) ---
unigram = defaultdict(int)
bigram = defaultdict(int)
trigram = defaultdict(int)
vocab = set()

for sent in corpus:
    for i in range(len(sent)):

        # 1. Unigram Count and Vocabulary
        unigram[sent[i]] += 1
        vocab.add(sent[i])

        # 2. Bigram Count
        if i < len(sent) - 1:
            bigram[(sent[i], sent[i+1])] += 1

        # 3. Trigram Count
        if i < len(sent) - 2:
            trigram[(sent[i], sent[i+1], sent[i+2])] += 1

V = len(vocab)


# --- PROBABILITY AND PERPLEXITY FUNCTIONS ---

def bigram_log_prob(tokens_list, V, k=SMOOTHING_K):
    """Calculates the log probability of a sentence using smoothed Bigram model."""

    # Need to add <s> marker for the first Bigram context
    sentence = ['<s>'] + tokens_list

    # FIX: Initialize logp correctly
    logp = 0.0

    # We iterate from the first word W_1 (which follows <s>) up to the last word W_N
    for i in range(len(sentence) - 1):
        w1, w2 = sentence[i], sentence[i+1] # w1 is context, w2 is current word

        # Get counts from training data (0 if unseen)
        count_w1 = unigram.get(w1, 0)
        count_bigram = bigram.get((w1, w2), 0)

        # Laplace Smoothing: (Count(W1, W2) + k) / (Count(W1) + V*k)
        prob = (count_bigram + k) / (count_w1 + V * k)

        # FIX: Update logp inside the loop
        # We use math.log (natural log) here, so perplexity must use math.exp
        logp += math.log(prob)

    return logp


def trigram_log_prob(tokens_list, V, k=SMOOTHING_K):
    """Calculates the log probability of a sentence using smoothed Trigram model."""

    # Need to add <s> <s> markers for the first two Trigram contexts
    sentence = ['<s>', '<s>'] + tokens_list

    # FIX: Correctly initialize logp
    logp = 0.0

    # We iterate from the second word W_2 up to the last word W_N
    for i in range(len(sentence) - 2):
        w1, w2, w3 = sentence[i], sentence[i+1], sentence[i+2] # (w1, w2) is context, w3 is current word

        # Get counts from training data (0 if unseen)
        count_w1w2 = bigram.get((w1, w2), 0)
        count_trigram = trigram.get((w1, w2, w3), 0)

        # Laplace Smoothing: (Count(W1, W2, W3) + k) / (Count(W1, W2) + V*k)
        prob = (count_trigram + k) / (count_w1w2 + V * k)

        logp += math.log(prob)

    return logp





# FIX: Corrected indentation for the function definition
def perplexity(logp, N):
    """
    Calculates Perplexity.
    N is the number of predictions made (tokens - N_gram_order + 1, e.g., tokens - 1 for bigram).
    """
    if N == 0:
        return float('inf')

    # Formula: exp(-1/N * logP) because we used math.log (natural log)
    #
    return math.exp(-logp / N)


# --- EXECUTION AND OUTPUT ---

# Determine the number of predictions (N) for the test sentence
# For a bigram model, we make N_tokens predictions (W_1, W_2, ..., W_N)
N_bigram = len(test_tokens_raw)
N_trigram = len(test_tokens_raw)

# Calculate log probabilities
lp_bigram = bigram_log_prob(test_tokens_raw, V)
lp_trigram = trigram_log_prob(test_tokens_raw, V)

# Calculate perplexity
pp_bigram = perplexity(lp_bigram, N_bigram)
pp_trigram = perplexity(lp_trigram, N_trigram)


# --- Output Results (FIX: Corrected indentation for final output) ---

# Output Counts (kept for debugging)
print("\n" + "=" * 40)
print(f"| {'N-GRAM COUNTING RESULTS':^38} |")
print("=" * 40)
print(f"| {'Vocabulary Size (V)':<25} | {V:>10} |")
print("-" * 40)
print(f"Test Sentence: '{test_sentence_raw}'")

print("\n### Unigram Counts ###")
print(unigram)
print("\n### Bigram Counts ###")
print(bigram)
print("\n### Trigram Counts ###")
print(trigram)

print("\n\n--- Final Results ---\n")
print(f"Vocabulary Size |V| = {V}")
print(f"Bigram Perplexity : {pp_bigram:.4f}")
print(f"Trigram Perplexity : {pp_trigram:.4f}")


print("\n--- Model Comparison ---")
if pp_trigram < pp_bigram:
    print("Trigram model is better (lower perplexity).")
else:
    print("Bigram model is better (lower perplexity) or results are close.")

Enter training sentences (use <s> and </s> markers).
Enter one sentence per line.
Type 'END' to finish input. 

<s> i love data science</s>
<s> data science is good </s>
END

--- Training Corpus (Tokenized) ---
['<s>', 'i', 'love', 'data', 'science</s>']
['<s>', 'data', 'science', 'is', 'good', '</s>']

 Enter test sentence: data science is field

|        N-GRAM COUNTING RESULTS         |
| Vocabulary Size (V)       |          9 |
----------------------------------------
Test Sentence: 'data science is field'

### Unigram Counts ###
defaultdict(<class 'int'>, {'<s>': 2, 'i': 1, 'love': 1, 'data': 2, 'science</s>': 1, 'science': 1, 'is': 1, 'good': 1, '</s>': 1})

### Bigram Counts ###
defaultdict(<class 'int'>, {('<s>', 'i'): 1, ('i', 'love'): 1, ('love', 'data'): 1, ('data', 'science</s>'): 1, ('<s>', 'data'): 1, ('data', 'science'): 1, ('science', 'is'): 1, ('is', 'good'): 1, ('good', '</s>'): 1})

### Trigram Counts ###
defaultdict(<class 'int'>, {('<s>', 'i', 'love'): 1, ('i', 'lo

In [None]:
unigram, bigram, trigram, without smoothing , with smoothing , find perplexity,
training corpus, find valcbulary size,
and also word count of all n_grams