In [1]:
!pip install gensim nltk tensorflow



In [2]:
import nltk
import requests
# NLTK data packages fetched for this notebook (presumably the tokenizer,
# POS-tagger and WordNet data needed by word_tokenize / pos_tag /
# WordNetLemmatizer further down -- confirm against the NLTK version in use).
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)

# nltk.download() reports failure through its return value rather than by
# raising, so collect the packages that did not download and surface them
# here instead of hitting a LookupError several cells later.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
    # Only a warning: the data may already be present locally (e.g. offline run).
    print("WARNING: could not download NLTK resources:", failed_downloads)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter

# URL of the plain text version of Pride and Prejudice on Project Gutenberg
url = "https://www.gutenberg.org/files/1342/1342-0.txt"

# Fail fast on network problems: without the status check an HTTP error page
# would silently become the training corpus, and without a timeout a stalled
# connection would hang the cell forever.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_text = response.text

# Lowercase once and reuse it for every marker search.
raw_lower = raw_text.lower()

# Start of the book body: first "chapter 1", else first "chapter i".
# NOTE(review): the token sample printed after preprocessing starts with what
# looks like front-matter captions, so the first match is probably not the
# opening of chapter one proper -- confirm and tighten the marker if needed.
start_idx = raw_lower.find("chapter 1")
if start_idx == -1:
    start_idx = raw_lower.find("chapter i")
if start_idx == -1:
    # No chapter heading at all: keep the text from the beginning rather than
    # slicing from -1 (which would leave an empty or one-character corpus).
    start_idx = 0

# End of the book body: the Project Gutenberg footer line.
end_idx = raw_lower.find("*** end of the project gutenberg")
if end_idx == -1:
    # Footer missing: keep everything up to the end instead of letting -1
    # silently drop the last character.
    end_idx = len(raw_text)

text = raw_text[start_idx:end_idx]

print("Length of raw text:", len(raw_text))

Length of raw text: 743375


In [4]:
# Normalise the corpus in one pass: lowercase it, blank out every character
# that is not a letter or whitespace (punctuation, digits, apostrophes), then
# squeeze whitespace runs down to single spaces and trim the ends.
text = re.sub(r'\s+', ' ', re.sub(r"[^a-zA-Z\s]", " ", text.lower())).strip()

# Split the cleaned text into word tokens and attach a part-of-speech tag to
# each one; the tags drive the lemmatizer in the next step.
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# POS Mapper
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the start of ``tag`` is inspected: ``J`` maps to adjective, ``V`` to
    verb, ``N`` to noun and ``R`` to adverb. Any other tag (including an empty
    string) falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

# Lemmatization: reduce every tagged token to its WordNet lemma, passing the
# POS (mapped through get_wordnet_pos) so verbs, adjectives and adverbs are
# not all treated as nouns.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = []
for token, tag in pos_tags:
    lemmatized_tokens.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))


In [5]:
# ✅ OPTIONAL: drop every word that occurs only once in the corpus.
# Flip `remove_rare` to False to keep the full vocabulary.
remove_rare = True
if remove_rare:
    word_freq = Counter(lemmatized_tokens)
    # Collect the singletons once, then filter with a set lookup.
    rare_words = {term for term, count in word_freq.items() if count <= 1}
    lemmatized_tokens = [term for term in lemmatized_tokens if term not in rare_words]

# Final check: corpus size and a peek at the first tokens.
print("Total tokens after preprocessing:", len(lemmatized_tokens))
print("Sample:", lemmatized_tokens[:50])

Total tokens after preprocessing: 122501
Sample: ['chapter', 'i', 'he', 'come', 'down', 'to', 'see', 'the', 'place', 'mr', 'and', 'mrs', 'bennet', 'i', 'hope', 'mr', 'bingley', 'will', 'like', 'it', 'i', 'm', 'the', 'tall', 'he', 'ride', 'a', 'black', 'horse', 'when', 'the', 'party', 'enter', 'she', 'be', 'tolerable', 'head', 'to', 'chapter', 'iv', 'head', 'to', 'chapter', 'v', 'without', 'once', 'open', 'his', 'lip', 'to']


In [6]:
!pip install gensim --quiet

In [7]:
from gensim.models import Word2Vec

# Word2Vec expects an iterable of token lists ("sentences"). The corpus has no
# sentence boundaries left after cleaning, so it is cut into overlapping
# windows of `sequence_length` tokens, one window starting at every position.
sequence_length = 20

# `+ 1` so the final full window is included; without it the last token of the
# corpus never appears in any training window. `max(1, ...)` keeps one
# (shorter) window when the corpus is smaller than `sequence_length`.
# NOTE(review): stride-1 windows repeat almost every token `sequence_length`
# times; non-overlapping chunks would train much faster -- left unchanged here
# because it would alter the learned vectors.
window_count = max(1, len(lemmatized_tokens) - sequence_length + 1)
sentences = [
    lemmatized_tokens[i:i + sequence_length]
    for i in range(window_count)
]

# 100-dimensional vectors; min_count=1 keeps every word that survived the
# earlier rare-word filter so each tokenizer entry can get an embedding row.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=50
)

w2v_model.save("word2vec_pride.model")

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# Fit a Keras tokenizer on the whole lemmatized corpus (joined back into one
# string) to obtain the word -> integer id mapping used by the LSTM.
text_for_tokenizer = ' '.join(lemmatized_tokens)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_for_tokenizer])
word_index = tokenizer.word_index
# Ids from word_index are used directly as row numbers below, so allocate one
# row more than there are words.
vocab_size = 1 + len(word_index)

# Copy each word's Word2Vec vector into the row addressed by its tokenizer id;
# rows for words without a vector stay all-zero.
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for vocab_word, row in word_index.items():
    if vocab_word not in w2v_model.wv:
        continue
    embedding_matrix[row] = w2v_model.wv[vocab_word]

embedding_matrix.shape

(3130, 100)

In [9]:
# Encode the whole corpus as one long list of integer ids.
token_list = tokenizer.texts_to_sequences([text_for_tokenizer])[0]

seq_length = 20

# Every row holds seq_length context ids followed by the id that comes next,
# i.e. seq_length + 1 values per row.
sequences = np.array([
    token_list[end - seq_length:end + 1]
    for end in range(seq_length, len(token_list))
])

# Inputs are the context columns; the target is the final column.
X = sequences[:, :-1]
y = sequences[:, -1]

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# `Input` declares the input shape explicitly (see below).
from tensorflow.keras.layers import Input

# Next-word prediction network:
#   ids -> pretrained (fine-tunable) embeddings -> BiLSTM -> dropout -> LSTM
#       -> softmax over the vocabulary.
# The sequence length is given through an explicit Input layer rather than
# Embedding(input_length=...): that argument is deprecated in Keras 3 and no
# longer defines the input shape, which leaves the model unbuilt when
# summary() is called.
model = Sequential([
    Input(shape=(seq_length,)),
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],   # start from the Word2Vec vectors
        trainable=True,               # and keep fine-tuning them
    ),
    Bidirectional(LSTM(150, return_sequences=True)),
    Dropout(0.2),
    LSTM(150),
    Dense(vocab_size, activation='softmax'),
])

# Targets are integer word ids (not one-hot), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()



In [12]:
# Train the next-word model on every (context, next id) pair.
# NOTE(review): the recorded log starts at ~0.51 accuracy in epoch 1, which
# suggests `model` had already been trained when this cell last ran; re-run
# from a fresh kernel to get a clean training curve.
fit_options = {"epochs": 80, "batch_size": 128}
history = model.fit(X, y, **fit_options)

Epoch 1/80
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.5081 - loss: 1.9945
Epoch 2/80
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.5108 - loss: 1.9821
Epoch 3/80
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.5161 - loss: 1.9639
Epoch 4/80
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.5199 - loss: 1.9328
Epoch 5/80
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.5197 - loss: 1.9410
Epoch 6/80
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.5224 - loss: 1.9152
Epoch 7/80
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.5301 - loss: 1.8909
Epoch 8/80
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.5295 - loss: 1.8764
Epoch 9/80
[1m957/957[

In [13]:
# Persist the trained network to disk so it can be reloaded without retraining.
model_path = "lstm_text_gen_pride.keras"
model.save(model_path)
print("✅ Model saved.")

✅ Model saved.


In [16]:
import numpy as np

def sample_top_k_with_temperature(preds, k=10, temperature=1.0):
    """Sample one index from the ``k`` most probable entries of ``preds``.

    Args:
        preds: 1-D sequence of non-negative scores/probabilities, one per
            candidate index.
        k: How many of the highest-scoring candidates may be sampled. Values
            below 1 are treated as 1 and values above ``len(preds)`` as
            ``len(preds)``.
        temperature: Softmax temperature applied to the top-k probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten
            it. ``temperature <= 0`` selects the single most probable index
            deterministically.

    Returns:
        The chosen index into ``preds`` (a NumPy integer).

    Raises:
        ValueError: If ``preds`` is empty.
    """
    preds = np.asarray(preds).astype('float64')
    if preds.size == 0:
        raise ValueError("preds must contain at least one probability")

    # Clamp k: with k == 0 the slice [-0:] would select the whole vocabulary,
    # and a negative k would slice from the wrong end.
    k = int(min(max(k, 1), preds.size))

    # Indices of the k largest probabilities, most probable first.
    top_k_indices = preds.argsort()[-k:][::-1]

    # Zero (or negative) temperature means greedy decoding; dividing by it
    # would otherwise raise or produce NaNs.
    if temperature <= 0:
        return top_k_indices[0]

    # Temperature scaling in log space. Subtracting the maximum before
    # exponentiating leaves the normalised result unchanged but prevents every
    # term underflowing to zero (and the division below yielding NaN) when the
    # temperature is very small.
    scaled_logits = np.log(preds[top_k_indices] + 1e-10) / temperature
    scaled_logits -= scaled_logits.max()
    top_k_probs = np.exp(scaled_logits)
    top_k_probs /= np.sum(top_k_probs)

    # Sample from the adjusted top-k distribution.
    return np.random.choice(top_k_indices, p=top_k_probs)

def generate_text(prompt, tokenizer, model, seq_length=20, num_words=30, top_k=10, temperature=1.0):
    """Extend ``prompt`` word by word using the trained next-word model.

    Args:
        prompt: Seed text. It is lowercased and encoded with ``tokenizer``.
            NOTE(review): the model was trained on lemmatized tokens while the
            prompt is only lowercased, so inflected prompt words may be unknown
            to the tokenizer and are presumably dropped -- confirm.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (and optionally ``index_word``).
        model: Model whose ``predict`` takes a ``(1, seq_length)`` id array and
            returns one probability vector per row.
        seq_length: Number of context ids fed to the model; shorter contexts
            are left-padded with zeros.
        num_words: Number of sampling steps to run.
        top_k: Passed to ``sample_top_k_with_temperature`` as ``k``.
        temperature: Passed to ``sample_top_k_with_temperature``.

    Returns:
        ``prompt`` followed by a space and the generated words joined by
        single spaces. Steps whose sampled id has no word (e.g. the padding
        id 0) contribute nothing, so fewer than ``num_words`` words may be
        returned.
    """
    # id -> word lookup built once, instead of scanning word_index linearly
    # for every generated word.
    index_to_word = getattr(tokenizer, "index_word", None)
    if not index_to_word:
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # Encode the prompt once; generated ids are appended directly rather than
    # re-tokenizing an ever-growing string on each step.
    context_ids = list(tokenizer.texts_to_sequences([prompt.lower()])[0])
    result = []

    for _ in range(num_words):
        # Last seq_length ids, left-padded with zeros to a fixed-size batch of one.
        window = np.asarray(context_ids[-seq_length:], dtype="int64")
        window = np.pad(window, (seq_length - len(window), 0))
        window = window.reshape(1, seq_length)

        predicted_probs = model.predict(window, verbose=0)[0]
        predicted_id = int(
            sample_top_k_with_temperature(predicted_probs, k=top_k, temperature=temperature)
        )

        next_word = index_to_word.get(predicted_id)
        if not next_word:
            # Sampled an id with no word: skip it so the output does not
            # collect empty "words" and doubled spaces.
            continue

        context_ids.append(predicted_id)
        result.append(next_word)

    return prompt + ' ' + ' '.join(result)


In [17]:
# Same generator and top-k, three prompts, progressively higher temperature.
demo_runs = [
    ("She was quite", 0.8),
    ("It is a truth universally", 1.0),
    ("My dear Mr. Bennet", 1.2),
]
for demo_prompt, demo_temperature in demo_runs:
    print(generate_text(demo_prompt, tokenizer, model, top_k=10, temperature=demo_temperature))

She was quite forward a little answer as well a the gentleman excite by her own opinion and though her manner be always exactly what a mistress of entertainment be shortly on the
It is a truth universally look out of the walk room she find almost a visit and after some time there be little to love of that when they next be first by these recollection
My dear Mr. Bennet give a doubt they have build up the room and mr collins meanwhile be nothing to say a little in miss lucas the netherfield party before her daughter exclaim out


In [18]:
# Generate a longer continuation and print it up to the first full stop.
# NOTE: punctuation was stripped from the training text, so generated words
# never contain '.'; only a period inside the prompt itself can cut the text.
output = generate_text("She was quite", tokenizer, model, num_words=60, temperature=0.9, top_k=10)

# Kept in its own name so the global `sentences` (the Word2Vec training
# windows) is not overwritten with a list of strings.
first_sentence = output.partition('.')[0]
print(first_sentence + '.')

She was quite forward a letter with regret to the sense of the mere ceremonious salutation she address the letter and a earnestly recover do directly but however her spirit have already strike either from him of what have be make inquiry on the subject of mr collins s ready acquiescence would hardly write little ago to his letter and depend on her.
