In [13]:
import random
from collections import Counter

import nltk
from nltk.corpus import stopwords
from PyPDF2 import PdfReader

In [6]:
def extract_text_from_pdf(filename):
    """Extract the text of every page of a PDF file.

    Page texts are joined with a newline so that the last word of one page
    and the first word of the next stay separate tokens. Pages that yield
    no text are skipped.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The concatenated page texts with leading and trailing whitespace
        removed.
    """
    with open(filename, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # Extraction must happen while the file is still open.
        # `or ""` is defensive: a page with no extractable text must not
        # break the join.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(chunk for chunk in page_texts if chunk).strip()

In [7]:
def clean_text(text):
    """Normalise raw text for n-gram modelling.

    Drops every character that is neither alphanumeric nor whitespace,
    lower-cases what remains and removes English stop words.

    The NLTK stop-word corpus is downloaded only when it is not already
    installed, so repeated calls do not hit the network.

    Args:
        text: Raw input string.

    Returns:
        The remaining words joined by single spaces.
    """
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        # Corpus not installed yet: fetch it once, then retry the lookup.
        nltk.download('stopwords', quiet=True)
        stop_words = set(stopwords.words('english'))

    kept_chars = ''.join(char for char in text if char.isalnum() or char.isspace())
    words = kept_chars.lower().split()
    return ' '.join(word for word in words if word not in stop_words)

In [8]:
# Read the raw corpus from disk, then normalise it for n-gram modelling.
corpus_path = "Corpus.pdf"
pdf_text = extract_text_from_pdf(corpus_path)
cleaned_text = clean_text(pdf_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sejbp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def create_ngrams(text, n):
    """Slide a window of ``n`` words over ``text``.

    Args:
        text: Input string; it is split on whitespace into words.
        n: Number of words per n-gram.

    Returns:
        A list of n-grams in order of appearance, each one a string of
        ``n`` words joined by single spaces. The list is empty when the
        text holds fewer than ``n`` words.
    """
    words = text.split()
    window_count = len(words) - n + 1
    return [' '.join(words[start:start + n]) for start in range(window_count)]

In [10]:
# Build the 1-, 2- and 3-word sequences from the same cleaned corpus.
unigrams, bigrams, trigrams = (
    create_ngrams(cleaned_text, size) for size in (1, 2, 3)
)

In [11]:
def smooth_ngrams(ngrams, vocab_size):
    """Compute add-one smoothed probabilities for a sequence of n-grams.

    Each distinct n-gram is assigned
    ``(count + 1) / (vocab_size + total)``, where ``count`` is how often
    it occurs and ``total`` is the number of n-grams supplied.

    Args:
        ngrams: Iterable of n-gram strings (duplicates expected).
        vocab_size: Number of distinct words in the vocabulary.

    Returns:
        Dict mapping each distinct n-gram to its smoothed probability,
        keyed in order of first appearance. Empty input gives ``{}``.
    """
    # One counting pass instead of calling list.count() per element (O(n^2)).
    counts = Counter(ngrams)
    denominator = vocab_size + sum(counts.values())
    return {ngram: (count + 1) / denominator for ngram, count in counts.items()}

In [12]:
# The vocabulary is the set of distinct unigrams; its size feeds the smoothing.
vocab = set(unigrams)
vocab_size = len(vocab)
smoothed_bigrams, smoothed_trigrams = (
    smooth_ngrams(grams, vocab_size) for grams in (bigrams, trigrams)
)

In [14]:
def predict_next_word(prefix, smoothed_bigrams, smoothed_trigrams, vocab):
    """Predict the word most likely to follow ``prefix``.

    The last two words of the prefix are used as context against the
    trigram table. If no trigram continues that context, the last word
    alone is tried against the bigram table. If that fails too, a random
    vocabulary word is returned.

    The prefix is split on whitespace and matched literally, so it should
    be normalised the same way as the text the tables were built from.

    Args:
        prefix: Text whose continuation is wanted.
        smoothed_bigrams: Dict mapping "w1 w2" strings to probabilities.
        smoothed_trigrams: Dict mapping "w1 w2 w3" strings to probabilities.
        vocab: Collection of known words; predictions are drawn from it.

    Returns:
        The predicted next word.

    Raises:
        IndexError: If no n-gram matches and ``vocab`` is empty.
    """
    tokens = prefix.split()

    # Back off from the longest usable context to the shortest.
    for context_len, table in ((2, smoothed_trigrams), (1, smoothed_bigrams)):
        if len(tokens) < context_len:
            continue
        # The trailing space keeps the match on a word boundary.
        context = ' '.join(tokens[-context_len:]) + ' '
        # Only n-grams that exist in the table are considered, so unseen
        # continuations can never raise KeyError.
        candidates = {
            ngram[len(context):]: prob
            for ngram, prob in table.items()
            if ngram.startswith(context) and ngram[len(context):] in vocab
        }
        if candidates:
            return max(candidates, key=candidates.get)

    # No trigram or bigram continues this prefix: pick any known word.
    return random.choice(list(vocab))

In [17]:
# Demo: ask the model for the word following a six-word prefix.
# The prefix is passed as typed; it is not run through clean_text first.
prefix = "There is a beautiful mountain and"
predicted_word = predict_next_word(
    prefix,
    smoothed_bigrams,
    smoothed_trigrams,
    vocab,
)
message = f"Predicted next word for '{prefix}': {predicted_word}"
print(message)

Predicted next word for 'There is a beautiful mountain and': background
