In [1]:
import requests

# Download a text file
url = 'https://www.gutenberg.org/files/1342/1342-0.txt'  # Pride and Prejudice by Jane Austen
response = requests.get(url)
text = response.text


In [4]:
text[:500]

'*** START OF THE PROJECT GUTENBERG EBOOK 1342 ***\r\n\r\n\r\n\r\n\r\n                            [Illustration:\r\n\r\n                             GEORGE ALLEN\r\n                               PUBLISHER\r\n\r\n                        156 CHARING CROSS ROAD\r\n                                LONDON\r\n\r\n                             RUSKIN HOUSE\r\n                                   ]\r\n\r\n                            [Illustration:\r\n\r\n               _Reading Janeâ\x80\x99s Letters._      _Chap 34._\r\n                             '

In [5]:
# Preprocess the text
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

tokens = word_tokenize(text.lower())
text = ' '.join(tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
text[:500]

'* * * start of the project gutenberg ebook 1342 * * * [ illustration : george allen publisher 156 charing cross road london ruskin house ] [ illustration : _reading janeâ\x80\x99s letters._ _chap 34._ ] pride . and prejudice by jane austen , with a preface by george saintsbury and illustrations by hugh thomson [ illustration : 1894 ] ruskin 156. charing house . cross road . london george allen . chiswick press : -- charles whittingham and co. tooks court , chancery lane , london . [ illustration : _to'

In [8]:
# Create a mapping of unique characters to integers
chars = sorted(list(set(text)))
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}

# Encode the text as integers
encoded_text = [char_to_int[c] for c in text]

In [10]:
encoded_text[:500]

[5,
 0,
 5,
 0,
 5,
 0,
 45,
 46,
 27,
 44,
 46,
 0,
 41,
 32,
 0,
 46,
 34,
 31,
 0,
 42,
 44,
 41,
 36,
 31,
 29,
 46,
 0,
 33,
 47,
 46,
 31,
 40,
 28,
 31,
 44,
 33,
 0,
 31,
 28,
 41,
 41,
 37,
 0,
 11,
 13,
 14,
 12,
 0,
 5,
 0,
 5,
 0,
 5,
 0,
 23,
 0,
 35,
 38,
 38,
 47,
 45,
 46,
 44,
 27,
 46,
 35,
 41,
 40,
 0,
 20,
 0,
 33,
 31,
 41,
 44,
 33,
 31,
 0,
 27,
 38,
 38,
 31,
 40,
 0,
 42,
 47,
 28,
 38,
 35,
 45,
 34,
 31,
 44,
 0,
 11,
 15,
 16,
 0,
 29,
 34,
 27,
 44,
 35,
 40,
 33,
 0,
 29,
 44,
 41,
 45,
 45,
 0,
 44,
 41,
 27,
 30,
 0,
 38,
 41,
 40,
 30,
 41,
 40,
 0,
 44,
 47,
 45,
 37,
 35,
 40,
 0,
 34,
 41,
 47,
 45,
 31,
 0,
 24,
 0,
 23,
 0,
 35,
 38,
 38,
 47,
 45,
 46,
 44,
 27,
 46,
 35,
 41,
 40,
 0,
 20,
 0,
 26,
 44,
 31,
 27,
 30,
 35,
 40,
 33,
 0,
 36,
 27,
 40,
 31,
 65,
 55,
 58,
 45,
 0,
 38,
 31,
 46,
 46,
 31,
 44,
 45,
 8,
 26,
 0,
 26,
 29,
 34,
 27,
 42,
 0,
 13,
 14,
 8,
 26,
 0,
 24,
 0,
 42,
 44,
 35,
 30,
 31,
 0,
 8,
 0,
 27,
 40,
 30,
 0,
 42

In [11]:
# Create input-output pairs
seq_length = 100
dataX = []
dataY = []

for i in range(0, len(encoded_text) - seq_length):
    seq_in = encoded_text[i:i + seq_length]
    seq_out = encoded_text[i + seq_length]
    dataX.append(seq_in)
    dataY.append(seq_out)

n_patterns = len(dataX)

In [14]:
dataX[:500]

[[5,
  0,
  5,
  0,
  5,
  0,
  45,
  46,
  27,
  44,
  46,
  0,
  41,
  32,
  0,
  46,
  34,
  31,
  0,
  42,
  44,
  41,
  36,
  31,
  29,
  46,
  0,
  33,
  47,
  46,
  31,
  40,
  28,
  31,
  44,
  33,
  0,
  31,
  28,
  41,
  41,
  37,
  0,
  11,
  13,
  14,
  12,
  0,
  5,
  0,
  5,
  0,
  5,
  0,
  23,
  0,
  35,
  38,
  38,
  47,
  45,
  46,
  44,
  27,
  46,
  35,
  41,
  40,
  0,
  20,
  0,
  33,
  31,
  41,
  44,
  33,
  31,
  0,
  27,
  38,
  38,
  31,
  40,
  0,
  42,
  47,
  28,
  38,
  35,
  45,
  34,
  31,
  44,
  0,
  11,
  15,
  16,
  0,
  29,
  34],
 [0,
  5,
  0,
  5,
  0,
  45,
  46,
  27,
  44,
  46,
  0,
  41,
  32,
  0,
  46,
  34,
  31,
  0,
  42,
  44,
  41,
  36,
  31,
  29,
  46,
  0,
  33,
  47,
  46,
  31,
  40,
  28,
  31,
  44,
  33,
  0,
  31,
  28,
  41,
  41,
  37,
  0,
  11,
  13,
  14,
  12,
  0,
  5,
  0,
  5,
  0,
  5,
  0,
  23,
  0,
  35,
  38,
  38,
  47,
  45,
  46,
  44,
  27,
  46,
  35,
  41,
  40,
  0,
  20,
  0,
  33,
  31,
  41,
  44,
  

In [15]:
dataY[:500]

[27,
 44,
 35,
 40,
 33,
 0,
 29,
 44,
 41,
 45,
 45,
 0,
 44,
 41,
 27,
 30,
 0,
 38,
 41,
 40,
 30,
 41,
 40,
 0,
 44,
 47,
 45,
 37,
 35,
 40,
 0,
 34,
 41,
 47,
 45,
 31,
 0,
 24,
 0,
 23,
 0,
 35,
 38,
 38,
 47,
 45,
 46,
 44,
 27,
 46,
 35,
 41,
 40,
 0,
 20,
 0,
 26,
 44,
 31,
 27,
 30,
 35,
 40,
 33,
 0,
 36,
 27,
 40,
 31,
 65,
 55,
 58,
 45,
 0,
 38,
 31,
 46,
 46,
 31,
 44,
 45,
 8,
 26,
 0,
 26,
 29,
 34,
 27,
 42,
 0,
 13,
 14,
 8,
 26,
 0,
 24,
 0,
 42,
 44,
 35,
 30,
 31,
 0,
 8,
 0,
 27,
 40,
 30,
 0,
 42,
 44,
 31,
 36,
 47,
 30,
 35,
 29,
 31,
 0,
 28,
 51,
 0,
 36,
 27,
 40,
 31,
 0,
 27,
 47,
 45,
 46,
 31,
 40,
 0,
 6,
 0,
 49,
 35,
 46,
 34,
 0,
 27,
 0,
 42,
 44,
 31,
 32,
 27,
 29,
 31,
 0,
 28,
 51,
 0,
 33,
 31,
 41,
 44,
 33,
 31,
 0,
 45,
 27,
 35,
 40,
 46,
 45,
 28,
 47,
 44,
 51,
 0,
 27,
 40,
 30,
 0,
 35,
 38,
 38,
 47,
 45,
 46,
 44,
 27,
 46,
 35,
 41,
 40,
 45,
 0,
 28,
 51,
 0,
 34,
 47,
 33,
 34,
 0,
 46,
 34,
 41,
 39,
 45,
 41,
 40,
 0,
 23,
 0,


In [16]:
# Reshape and normalize the input
import numpy as np

X = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(len(chars))
y = np.zeros((n_patterns, len(chars)))
for i, pattern in enumerate(dataY):
    y[i, pattern] = 1

In [19]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(len(chars), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2b1414e0f10>

In [23]:
import random

# Pick a random seed
start = np.random.randint(0, len(dataX) - 1)
pattern = dataX[start]

# Generate characters
generated_text = ''.join([int_to_char[value] for value in pattern])
for i in range(10):
    x = np.reshape(pattern, (1, len(pattern), 1)) / float(len(chars))
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]
    generated_text += result
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

print(generated_text)

flattering with delicacy . may i ask whether these pleasing attentions proceed from the impulse of the part of


In [24]:
print("Sample Output")
generated_text

Sample Output


'flattering with delicacy . may i ask whether these pleasing attentions proceed from the impulse of the part of'