In [11]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import bigrams
from nltk.probability import FreqDist, MLEProbDist
from nltk.lm import Vocabulary
import string

In [5]:
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/souvik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/souvik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/souvik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
with open("Harry_Potter_all_books_preprocessed.txt", "r", encoding="utf-8") as file:
    data = file.read()

In [30]:
# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [31]:
# Preprocess the data
tokens = preprocess_text(data)

In [32]:
# Use only the first 10,000 words
tokens = tokens[:10000]
print(tokens[:50])

['boy', 'lived', 'mr', 'mr', 'dursley', 'number', 'four', 'privet', 'drive', 'proud', 'say', 'perfectly', 'normal', 'thank', 'much', '.they', 'last', 'people', 'youd', 'expect', 'involved', 'anything', 'strange', 'mysterious', 'didnt', 'hold', 'nonsense', '.mr', 'dursley', 'director', 'firm', 'called', 'grunnings', 'made', 'drill', '.he', 'big', 'beefy', 'man', 'hardly', 'neck', 'although', 'large', 'mustache', '.mrs', 'dursley', 'thin', 'blonde', 'nearly', 'twice']


In [33]:
from nltk.lm import MLE
from nltk.lm.models import KneserNeyInterpolated

# Create a vocabulary
vocab = Vocabulary(tokens)
# Prepare data for training
train_data, padded_sents = padded_everygram_pipeline(2, tokens)

# Fit MLE bigram model
mle_bigram_model = MLE(2)
mle_bigram_model.fit(train_data, vocab)

# Fit Kneser-Ney bigram model
kn_model = KneserNeyInterpolated(2)
kn_model.fit(train_data, vocab)



In [34]:
# Function to generate text using a language model
def generate_text(model, start_words, max_length=20):
    generated_text = start_words.split()
    try:
        while len(generated_text) < max_length:
            next_word = model.generate(1, text_seed=generated_text)[-1]
            generated_text.append(next_word)
            if next_word == '<s>':  # If the end of the sentence is reached
                break
    except ValueError:
        return "Text generation failed. Insufficient data for given starting words."
    return ' '.join(generated_text)

# Generate text using MLE bigram model starting with "Harry Potter"
generated_text_mle_hp = generate_text(mle_bigram_model, "Harry Potter")
print("Generated text using MLE bigram model starting with 'Harry Potter':")
print(generated_text_mle_hp)

# Generate text using Kneser-Ney bigram model starting with "Harry Potter"
generated_text_kn_hp = generate_text(kn_model, "Harry Potter")
print("\nGenerated text using Kneser-Ney bigram model starting with 'Harry Potter':")
print(generated_text_kn_hp)

# Generate text using MLE bigram model starting with "Dumbledore"
generated_text_mle_dumbledore = generate_text(mle_bigram_model, "Dumbledore")
print("\nGenerated text using MLE bigram model starting with 'Dumbledore':")
print(generated_text_mle_dumbledore)

# Generate text using Kneser-Ney bigram model starting with "Dumbledore"
generated_text_kn_dumbledore = generate_text(kn_model, "Dumbledore")
print("\nGenerated text using Kneser-Ney bigram model starting with 'Dumbledore':")
print(generated_text_kn_dumbledore)

Generated text using MLE bigram model starting with 'Harry Potter':
Harry Potter u > > > > > > > > > > > > > > > > >

Generated text using Kneser-Ney bigram model starting with 'Harry Potter':
Text generation failed. Insufficient data for given starting words.

Generated text using MLE bigram model starting with 'Dumbledore':
Dumbledore > > > > > > > > > > k > > > > > > > >

Generated text using Kneser-Ney bigram model starting with 'Dumbledore':
Text generation failed. Insufficient data for given starting words.


In [35]:
# Import necessary libraries
import nltk
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.lm import MLE, KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline

# 1. Preprocess and tokenize the dataset using NLTK
# Assuming 'text' is your dataset
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(text.lower())
filtered_text = [w for w in word_tokens if not w in stop_words]

# 2. Fit two bigram language models on the text: MLE and Kneser-Ney discounting
n = 2
train_data, padded_sents = padded_everygram_pipeline(n, filtered_text)

# MLE Model
mle_model = MLE(n)
mle_model.fit(train_data, padded_sents)

# Kneser-Ney Model
kn_model = KneserNeyInterpolated(n)
kn_model.fit(train_data, padded_sents)

# 3. Generate text using both the language models
mle_text = ' '.join(mle_model.generate(20, text_seed=['Harry', 'Potter']))
kn_text = ' '.join(kn_model.generate(20, text_seed=['Dumbledore']))

print("MLE generated text: ", mle_text)
print("Kneser-Ney generated text: ", kn_text)


NameError: name 'text' is not defined