<a href="https://colab.research.google.com/github/VD0627/CP/blob/main/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
import numpy as np

In [2]:
def load_data(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Note: a later cell in this notebook redefines ``load_data`` with a
    zip-aware version, which replaces this one once that cell has run.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        str: Everything stored in the file, decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [6]:
import zipfile

def load_data(file_path):
    """Load the text of the first regular file stored in a zip archive.

    Directory entries (names ending in ``/``) are skipped, so an archive
    whose first entry is a folder still yields the text of the file inside
    it instead of an empty string.

    Args:
        file_path (str): The path to the zip file.

    Returns:
        str: The UTF-8 decoded content of the first file in the archive.
            An empty string is returned (after printing the error) when the
            archive is not a valid zip, contains no files, or the file is
            not valid UTF-8.
    """
    text = ""
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            # Only real files can hold text; a directory entry reads as b"".
            first_file = next(
                (info for info in zip_ref.infolist() if not info.is_dir()),
                None,
            )
            if first_file is None:
                # Keep the original "no first entry" outcome: report and return "".
                raise IndexError("zip archive contains no files")
            with zip_ref.open(first_file, 'r') as f:
                text = f.read().decode('utf-8')  # zip members are read as bytes
    except (zipfile.BadZipFile, IndexError, UnicodeDecodeError) as e:
        print(f"Error reading file: {e}")
    return text

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [8]:
# Word-level tokenizer; '<OOV>' is the placeholder token for out-of-vocabulary words.
tokenizer = Tokenizer(
    oov_token='<OOV>',
)

In [14]:
file_path = 'hp.zip'
text = load_data(file_path)

# load_data() reports a bad archive by printing and returning "", so stop here
# instead of silently fitting the tokenizer (and later the model) on nothing.
if not text:
    raise ValueError(f"No text could be loaded from {file_path!r}")

# Build the vocabulary from the corpus, then encode the corpus as word ids.
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # size of the id range: one more than the number of indexed words
input_sequences = []  # filled with training windows in a later cell
tokens = tokenizer.texts_to_sequences([text])[0]

In [16]:
# The tokenizer was already fitted on `text` in the previous cell. Fitting it a
# second time on the same text adds nothing to the vocabulary and only mutates
# the tokenizer's internal counts again, so the fit is not repeated here.
total_words = len(tokenizer.word_index) + 1
tokens = tokenizer.texts_to_sequences([text])[0]

input_sequences = []  # start from an empty list before the windows are built
seq_length = 50       # number of context tokens used to predict the next token

In [17]:
# Every window holds seq_length context tokens followed by the token to predict.
# The list is assigned in one step (not appended to), so re-running this cell
# rebuilds it from `tokens` instead of duplicating windows or failing once
# `input_sequences` has been turned into an array further down.
input_sequences = [
    tokens[start:start + seq_length + 1]
    for start in range(len(tokens) - seq_length)
]

In [18]:
# Convert the list of windows into a 2-D array, one row per window, each
# seq_length + 1 columns wide.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=seq_length + 1,
        padding='pre',
    )
)

# Features are every column but the last; the target is the last column.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [19]:
# Derive the one-hot targets from the last column of `input_sequences` rather
# than from `y` itself, so re-running this cell does not one-hot encode an
# array that has already been encoded.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [20]:
# Next-word model: word ids -> embeddings -> recurrent summary -> vocabulary distribution.
# The input shape is declared with an explicit Input instead of Embedding's
# `input_length` argument, which Keras 3 deprecates.
model = Sequential([
    tf.keras.Input(shape=(seq_length,), dtype='int32'),
    Embedding(input_dim=total_words, output_dim=64),
    SimpleRNN(256, return_sequences=False),  # only the final hidden state is passed on
    Dense(256, activation='relu'),
    Dense(total_words, activation='softmax')  # one probability per word id
])



In [21]:
# The targets are one-hot encoded, hence the (non-sparse) categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Train for 30 epochs using mini-batches of 128 windows.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=30,
)

Epoch 1/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 148ms/step - accuracy: 0.0218 - loss: 6.5918
Epoch 2/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 107ms/step - accuracy: 0.0453 - loss: 5.8022
Epoch 3/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 107ms/step - accuracy: 0.0432 - loss: 5.7791
Epoch 4/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 150ms/step - accuracy: 0.0435 - loss: 5.7476
Epoch 5/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 106ms/step - accuracy: 0.0590 - loss: 5.7348
Epoch 6/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 107ms/step - accuracy: 0.0844 - loss: 5.5536
Epoch 7/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - accuracy: 0.1180 - loss: 5.2856
Epoch 8/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 147ms/step - accuracy: 0.1664 - loss: 4.9726
Epoch 9/30
[1m30/30[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x792381088790>

In [23]:
def generate_text(seed_text, next_words=50):
    """Greedily extend ``seed_text`` with ``next_words`` predicted words.

    Uses the notebook-level ``tokenizer``, ``model`` and ``seq_length``.
    The seed is tokenized once; after that the running context is extended
    with the predicted word ids directly, instead of re-tokenizing the whole
    growing string on every step.

    Args:
        seed_text (str): Text the generation starts from.
        next_words (int): Number of words to append. Defaults to 50.

    Returns:
        str: ``seed_text`` followed by the generated words, separated by
            single spaces. A predicted id with no entry in
            ``tokenizer.index_word`` is rendered as ``"<OOV>"``.
    """
    context = list(tokenizer.texts_to_sequences([seed_text])[0])
    # Id used in the context when a prediction has no word of its own.
    oov_index = tokenizer.word_index.get(tokenizer.oov_token)

    generated_words = []
    for _ in range(next_words):
        window = pad_sequences([context], maxlen=seq_length, padding='pre')

        predicted_probs = model.predict(window, verbose=0)
        # Greedy decoding: always take the most probable id.
        predicted_index = int(np.argmax(predicted_probs))

        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None:
            predicted_word = "<OOV>"
            if oov_index is not None:
                context.append(oov_index)
        else:
            context.append(predicted_index)
        generated_words.append(predicted_word)

    return " ".join([seed_text, *generated_words])
# Demo: continue a short prompt using the default of 50 generated words.
print(generate_text(seed_text="harry looked at"))

harry looked at and car  november vernon dursley male director breeder human  muggle born blue brown dumbledore's army hogwarts school of witchcraft and wizardry pyrotechnics  134 aurora grubbly plank female substitute professor of care of magical law gryffindor unknown unknown human pure blood or half blood lord voldemort   death eaters spreading combat july  1981
