<a href="https://colab.research.google.com/github/Pratyushk2003/NLP-Assignment/blob/main/Untitled26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import PyPDF2
import docx
import nltk
from nltk.util import ngrams
from nltk import FreqDist
from math import log2

# Load NLTK data
nltk.download('punkt')

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    pdf_text = ''
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            pdf_text += page.extract_text()
    return pdf_text

# Function to extract text from a DOCX file
def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text
    return text

# Function to preprocess text and calculate n-grams
def preprocess_and_calculate_ngrams(text, n):
    words = nltk.word_tokenize(text)
    n_grams = list(ngrams(words, n))
    return n_grams

# Function to calculate perplexity
def calculate_perplexity(model, test_set):
    total_log_prob = 0
    N = len(test_set)

    for i in range(2, N):
        trigram = (test_set[i - 2], test_set[i - 1], test_set[i])
        prob = model.get(trigram, 1e-10)  # Use a small value for unseen trigrams
        total_log_prob += -log2(prob)

    perplexity = 2 ** (total_log_prob / N)
    return perplexity

# Function to implement add-1 smoothing
def add_one_smoothing(ngrams):
    vocabulary = set(ngrams)
    smoothed_counts = {}

    for ngram in vocabulary:
        n_minus_1 = ngram[:2]
        count_n_minus_1 = ngrams.count(n_minus_1)
        count_ngram = ngrams.count(ngram)
        smoothed_prob = (count_ngram + 1) / (count_n_minus_1 + len(vocabulary))
        smoothed_counts[ngram] = smoothed_prob

    return smoothed_counts

# Function to implement interpolation
def interpolate_trigram_model(trigrams, lambda_1, lambda_2, lambda_3):
    unigrams = [gram[0] for gram in trigrams]
    bigrams = [(gram[0], gram[1]) for gram in trigrams]

    # Calculate unigram, bigram, and trigram probabilities
    unigram_probs = FreqDist(unigrams)
    bigram_probs = FreqDist(bigrams)
    trigram_probs = FreqDist(trigrams)

    # Interpolate probabilities
    interpolated_model = {}
    for trigram in trigrams:
        unigram = trigram[2]
        bigram = trigram[0:2]

        prob = (lambda_1 * trigram_probs[trigram] +
                lambda_2 * bigram_probs[bigram] +
                lambda_3 * unigram_probs[unigram])

        interpolated_model[trigram] = prob

    return interpolated_model

# Load and preprocess the document
document_path = '/content/Standard Service Agreement (1).pdf'  # Change to your document path
if document_path.endswith('.pdf'):
    text = extract_text_from_pdf(document_path)
elif document_path.endswith('.docx'):
    text = extract_text_from_docx(document_path)
else:
    raise ValueError("Unsupported file format")

# Tokenize and calculate trigrams
trigrams = preprocess_and_calculate_ngrams(text, 3)

# List the top 100 trigrams
top_100_trigrams = FreqDist(trigrams).most_common(100)
print("Top 100 Trigrams:")
for trigram, count in top_100_trigrams:
    print(trigram, count)

# Calculate perplexity for uni-gram, bi-gram, and tri-gram models
uni_grams = [gram[2] for gram in trigrams]
bi_grams = [(gram[1], gram[2]) for gram in trigrams]

perplexity_unigram = calculate_perplexity(FreqDist(uni_grams), uni_grams)
perplexity_bigram = calculate_perplexity(FreqDist(bi_grams), bi_grams)
perplexity_trigram = calculate_perplexity(FreqDist(trigrams), trigrams)

print("Perplexity (Unigram):", perplexity_unigram)
print("Perplexity (Bigram):", perplexity_bigram)
print("Perplexity (Trigram):", perplexity_trigram)

# Apply add-1 smoothing and calculate perplexity again
smoothed_trigrams = add_one_smoothing(trigrams)
perplexity_smoothed_trigram = calculate_perplexity(smoothed_trigrams, trigrams)

print("Perplexity (Add-1 Smoothing Trigram):", perplexity_smoothed_trigram)

# Implement interpolation with specified lambda values
lambda_1 = 0.7
lambda_2 = 0.2
lambda_3 = 0.1

interpolated_model = interpolate_trigram_model(trigrams, lambda_1, lambda_2, lambda_3)
print(interpolated_model)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 100 Trigrams:
('the', 'Service', 'Provider') 57
('Service', 'Provider', 'shall') 45
('of', 'this', 'Agreement') 30
('under', 'this', 'Agreement') 30
('.', 'Service', 'Provider') 28
('this', 'Agreement', '.') 24
('this', 'Agreement', ',') 23
(',', 'Service', 'Provider') 22
('by', 'the', 'Service') 19
('The', 'Service', 'Provider') 18
('NetApp', '’', 's') 18
('.', 'The', 'Service') 17
('Service', 'Provider', '’') 14
('Provider', '’', 's') 13
('of', 'Service', 'Provider') 12
('the', 'right', 'to') 11
('Service', 'Provider', 'agrees') 11
('Intellectual', 'Property', 'Rights') 10
('including', 'but', 'not') 10
('but', 'not', 'limited') 10
('not', 'limited', 'to') 10
('this', 'Agreement', 'shall') 10
('in', 'connection', 'with') 10
('’', 's', 'Personal') 10
('Service', 'Provider', 'is') 9
('in', 'this', 'Agreement') 9
('(', 'i', ')') 9
('the', 'Disclosing', 'Party') 9
('(', 'ii', ')') 9
('obligations', 'under', 'this') 9
(',', 'the', 'Service') 9
('s', 'Personal', 'Information') 9
('Serv

In [20]:
# Function to predict the next word using interpolated trigram model
def predict_next_word(word1, word2):
    candidates = [(trigram[2], prob) for trigram, prob in interpolated_model.items() if trigram[0] == word1 and trigram[1] == word2]
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates[:5]

# Example usage
word1 = "content"
word2 = "and"
predictions = predict_next_word(word1, word2)
print(f"Predictions for '{word1} {word2}':")
for word, prob in predictions:
    print(f"{word}: {prob}")


Predictions for 'content and':
existence: 0.9999999999999999
