In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Dataset Load

In [None]:
import pandas as pd

# Path to the dataset file on the mounted Drive
dataset_path = "/content/drive/MyDrive/Plagiarism-detection/train_snli.txt"

# Load the tab-separated file; header=None means the first line is read as data,
# and the three column names are supplied explicitly.
df = pd.read_csv(dataset_path, delimiter="\t", header=None, names=["sentence1", "sentence2", "label"])

# Preview the first rows
print(df.head())

                                           sentence1  \
0  A person on a horse jumps over a broken down a...   
1  A person on a horse jumps over a broken down a...   
2              Children smiling and waving at camera   
3              Children smiling and waving at camera   
4  A boy is jumping on skateboard in the middle o...   

                                       sentence2  label  
0  A person is at a diner, ordering an omelette.      0  
1              A person is outdoors, on a horse.      1  
2                     There are children present      1  
3                          The kids are frowning      0  
4              The boy skates down the sidewalk.      0  


Dataset cleaning

In [None]:
import re

# Text normalisation helper
def clean_text(text):
    """Normalise a string before n-gram extraction.

    The text is lowercased, every non-word character (regex ``\\W``) is
    replaced by a space, runs of whitespace are collapsed to a single space,
    and leading/trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Replace punctuation/symbols with spaces on the lowercased text.
    spaced = re.sub(r'\W', ' ', text.lower())
    # Collapse the resulting whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', spaced).strip()

# Apply the cleaning to the sentence1 column only (sentence2 is left unchanged)
df["sentence1"] = df["sentence1"].apply(clean_text)

print(df.head())  # Check the cleaned data


                                           sentence1  \
0  a person on a horse jumps over a broken down a...   
1  a person on a horse jumps over a broken down a...   
2              children smiling and waving at camera   
3              children smiling and waving at camera   
4  a boy is jumping on skateboard in the middle o...   

                                       sentence2  label  
0  A person is at a diner, ordering an omelette.      0  
1              A person is outdoors, on a horse.      1  
2                     There are children present      1  
3                          The kids are frowning      0  
4              The boy skates down the sidewalk.      0  


Next we build bigrams, i.e. pairs of consecutive words.

In [None]:
import nltk
from nltk.util import ngrams
from collections import Counter

# Fetch the NLTK 'punkt' data package (presumably what nltk.word_tokenize needs below — confirm for the installed NLTK version)
nltk.download('punkt')

# Build the list of n-grams for a piece of text
def generate_ngrams(text, n=2):
    """Tokenize ``text`` and return its n-grams as a list of tuples.

    The text is lowercased and split into tokens with ``nltk.word_tokenize``;
    the tokens are then passed to ``nltk.util.ngrams`` with size ``n``.

    Args:
        text: Input string.
        n: Number of tokens per n-gram (default 2, i.e. bigrams).

    Returns:
        A list containing the n-grams produced for the token sequence.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    return [gram for gram in ngrams(tokens, n)]

# Build the bigrams of every sentence1 entry (n is forwarded to generate_ngrams)
df["sentence1_ngrams"] = df["sentence1"].apply(generate_ngrams, n=2)

# Flatten the per-row bigram lists into one flat list
all_ngrams = [
    bigram
    for row_bigrams in df["sentence1_ngrams"]
    for bigram in row_bigrams
]

# Count how often each bigram occurs
ngram_freq = Counter(all_ngrams)

print("✅ N-gram Training Fixed!")
print(ngram_freq.most_common(10))  # Show the 10 most frequent bigrams


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ N-gram Training Fixed!
[(('in', 'a'), 82745), (('a', 'man'), 68400), (('on', 'a'), 49988), (('in', 'the'), 38621), (('with', 'a'), 35727), (('a', 'woman'), 34346), (('of', 'a'), 30039), (('man', 'in'), 27484), (('on', 'the'), 25113), (('wearing', 'a'), 21754)]


In [None]:
def debug_probability(sentence, ngram_freq, vocab_size, alpha=0.01):
    """Print the add-alpha smoothed probability of every bigram in ``sentence``.

    For each bigram the printed value is::

        (count + alpha) / (total_bigram_count + vocab_size * alpha)

    Note that this is a smoothed relative frequency of the bigram among all
    counted bigrams, not a conditional probability P(w2 | w1).

    Args:
        sentence: Text to split into bigrams via ``generate_ngrams``.
        ngram_freq: Mapping from bigram tuple to its count.
        vocab_size: Size term multiplied by ``alpha`` in the denominator.
        alpha: Additive smoothing constant (default 0.01).

    Returns:
        None. One line is printed per bigram.
    """
    # The denominator is the same for every bigram, so it is computed once
    # rather than re-summing all counts on each loop iteration.
    denominator = sum(ngram_freq.values()) + (vocab_size * alpha)

    for ngram in generate_ngrams(sentence, 2):
        count = ngram_freq.get(ngram, 0)  # unseen bigrams count as 0
        prob = (count + alpha) / denominator  # Smoothing
        print(f"N-gram: {ngram}, Count: {count}, Probability: {prob:.10f}")  # Debug print

# Inputs for the demo call, defined here so this cell also runs on a fresh
# kernel (Restart & Run All): test_sentence is otherwise first assigned in a
# later cell, and vocab_size is not assigned in any earlier cell.
test_sentence = "A person is riding a horse"
# The recorded "Vocab Size" output (228149) equals the number of unique
# bigrams, so that same quantity is used for the smoothing term.
vocab_size = len(ngram_freq)

debug_probability(test_sentence, ngram_freq, vocab_size)


N-gram: ('a', 'person'), Count: 7346, Probability: 0.0016758285
N-gram: ('person', 'is'), Count: 1647, Probability: 0.0003757286
N-gram: ('is', 'riding'), Count: 2205, Probability: 0.0005030239
N-gram: ('riding', 'a'), Count: 3466, Probability: 0.0007906929
N-gram: ('a', 'horse'), Count: 1339, Probability: 0.0003054653


In [None]:
print(f"Total Unique Bigrams: {len(ngram_freq)}")  # Number of distinct bigrams counted
print(f"Vocab Size: {vocab_size}")  # Sanity check of the vocab size used for smoothing


Total Unique Bigrams: 228149
Vocab Size: 228149


In [None]:
# Example sentence used to exercise the bigram counts
test_sentence = "A person is riding a horse"
test_ngrams = generate_ngrams(test_sentence, 2)

# Show the count stored in ngram_freq for each bigram of the example sentence
for ngram in test_ngrams:
    print(f"N-gram: {ngram}, Count in Training: {ngram_freq[ngram]}")


N-gram: ('a', 'person'), Count in Training: 7346
N-gram: ('person', 'is'), Count in Training: 1647
N-gram: ('is', 'riding'), Count in Training: 2205
N-gram: ('riding', 'a'), Count in Training: 3466
N-gram: ('a', 'horse'), Count in Training: 1339


Now we will compute the perplexity score.

In [None]:
import numpy as np

def calculate_perplexity_safe(sentence, ngram_freq, vocab_size, alpha=0.01):
    """Compute a bigram perplexity score for ``sentence``.

    Each bigram gets the add-alpha smoothed value::

        (count + alpha) / (total_bigram_count + vocab_size * alpha)

    and the result is ``exp(-mean(log(prob)))`` over the sentence's bigrams.

    Args:
        sentence: Text to split into bigrams via ``generate_ngrams``.
        ngram_freq: Mapping from bigram tuple to its count.
        vocab_size: Size term multiplied by ``alpha`` in the denominator.
        alpha: Additive smoothing constant (default 0.01).

    Returns:
        The perplexity as a float. A sentence that yields no bigrams gives
        1.0. If any bigram has a non-positive probability (possible when
        ``alpha`` is 0 and the bigram was never counted), infinity is returned.
    """
    ngrams_list = generate_ngrams(sentence, 2)
    total_bigrams = sum(ngram_freq.values()) + (vocab_size * alpha)

    log_probability = 0
    for ngram in ngrams_list:
        count = ngram_freq.get(ngram, 0)  # unseen bigrams count as 0
        prob = (count + alpha) / total_bigrams
        if prob <= 0:
            # A zero-probability bigram makes the sentence probability zero.
            # Skipping the term while still dividing by the full bigram count
            # would understate the perplexity, so report it as infinite.
            return np.float64(np.inf)
        log_probability += np.log(prob)

    # max(..., 1) avoids division by zero for sentences without any bigram.
    perplexity = np.exp(-log_probability / max(len(ngrams_list), 1))
    return perplexity

# Score the example sentence with the bigram counts
perplexity_safe = calculate_perplexity_safe(test_sentence, ngram_freq, vocab_size)

print(f"📌 Final Safe Perplexity Score: {perplexity_safe:.2f}")


📌 Final Safe Perplexity Score: 1672.12
