<a href="https://colab.research.google.com/github/VD0627/CP/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
def load_data(file_path):
    """Return the full contents of a UTF-8 encoded text file as one string.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        str: Everything in the file, decoded as UTF-8.
    """
    # NOTE(review): a later cell defines another `load_data` (zip-based) that
    # shadows this one once it has been executed.
    with open(file_path, encoding='utf-8', mode='r') as source:
        return source.read()

In [6]:
import zipfile

def load_data(file_path):
    """Load the text of the first regular file stored in a zip archive.

    Directory entries (names ending in ``/``, e.g. ``hp/``) are skipped, so an
    archive that was created from a folder still yields the text of the first
    actual file instead of the empty payload of the directory entry.

    Args:
        file_path (str): The path to the zip file.

    Returns:
        str: The UTF-8 decoded content of the first file member. An empty
            string is returned (and a message printed) when the archive is not
            a valid zip, contains no file members, or the member is not valid
            UTF-8. A missing ``file_path`` is not handled here and still
            raises ``FileNotFoundError``.
    """
    text = ""
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            # Ignore directory entries; only real files carry text.
            file_members = [info for info in zip_ref.infolist() if not info.is_dir()]
            # An archive without any file raises IndexError here, which is
            # reported by the handler below.
            first_file = file_members[0]
            text = zip_ref.read(first_file).decode('utf-8')  # zip members are raw bytes
    except (zipfile.BadZipFile, IndexError, UnicodeDecodeError) as e:
        print(f"Error reading file: {e}")
    return text

In [12]:
# Read the corpus out of the zip archive. load_data prints a message and returns
# "" when the archive cannot be read, so an empty `text` means loading failed.
text = load_data('hp.zip')

In [13]:
# Build the word-level vocabulary from the corpus; '<OOV>' is the token used for
# words that were not seen while fitting.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])
# Vocabulary size used by the Embedding and output layers. The +1 accounts for
# index 0, which the Tokenizer does not assign to any word.
total_words = len(tokenizer.word_index) + 1

In [14]:
# Accumulator for the training windows built in the next cell.
input_sequences = []
# The whole corpus as one flat list of integer word ids.
tokens = tokenizer.texts_to_sequences([text])[0]
# Number of context words fed to the model for each prediction.
seq_length = 50

In [15]:
# Slide a window of seq_length context tokens plus 1 target token over the corpus.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell never accumulates duplicate windows and still works after
# `input_sequences` has been converted to an array by a later cell.
input_sequences = [
    tokens[start:start + seq_length + 1]
    for start in range(len(tokens) - seq_length)
]

In [16]:
# Convert the windows to a 2-D integer array. Every window built by the slicing
# loop already holds seq_length + 1 tokens, so the 'pre' padding is a safeguard.
input_sequences = np.array(pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre'))
# First seq_length columns are the context (inputs); the last column is the word to predict.
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot encode the targets over the vocabulary, as required by the
# 'categorical_crossentropy' loss the model is compiled with.
# NOTE(review): this materialises a dense (num_samples, total_words) matrix;
# it may get large for a bigger corpus.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [17]:
# Next-word model: word ids -> 100-d embeddings -> two stacked LSTMs -> softmax
# over the vocabulary.
model = Sequential([
    # Declare the input shape explicitly: each sample is seq_length word ids.
    # This replaces Embedding's `input_length` argument, which is deprecated
    # (and ignored with a warning) in Keras 3.
    tf.keras.Input(shape=(seq_length,)),
    Embedding(input_dim=total_words, output_dim=100),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(256, return_sequences=True),
    LSTM(256),
    # One probability per vocabulary index.
    Dense(total_words, activation='softmax')
])



In [18]:
# categorical_crossentropy matches the one-hot encoded targets in `y`;
# accuracy here is exact next-word accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 20 epochs on (X, y) in mini-batches of 128 samples.
# NOTE(review): the object returned by fit is not stored and no validation data
# is passed, so the printed log is the only record of training progress.
model.fit(X, y, epochs=20, batch_size=128)

Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 989ms/step - accuracy: 0.0360 - loss: 6.6168
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 999ms/step - accuracy: 0.0476 - loss: 5.8286
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1s/step - accuracy: 0.0538 - loss: 5.7651
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1s/step - accuracy: 0.0492 - loss: 5.7546
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 997ms/step - accuracy: 0.0508 - loss: 5.6511
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1s/step - accuracy: 0.0643 - loss: 5.4978
Epoch 7/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 987ms/step - accuracy: 0.0961 - loss: 5.2291
Epoch 8/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 1s/step - accuracy: 0.1146 - loss: 5.0366
Epoch 9/20
[1m30/30[0m [32m━━━━━━━━━━━━━━

In [20]:
def _sample_next_index(probs, temperature):
    """Pick one vocabulary index from a probability vector, rescaled by temperature."""
    # float64 keeps the renormalised vector within np.random.choice's sum-to-1
    # tolerance; float32 model output can fail that check.
    probs = np.asarray(probs, dtype=np.float64).copy()
    if probs.size > 1:
        # Index 0 is the padding id and maps to no word, so never sample it.
        probs[0] = 0.0

    usable = probs > 0
    if temperature <= 0 or not usable.any():
        # Greedy choice: avoids dividing by zero and handles degenerate output.
        return int(np.argmax(probs))

    # Temperature-scaled log-probabilities, computed only where log() is finite.
    logits = np.full(probs.shape, -np.inf)
    with np.errstate(over='ignore'):
        logits[usable] = np.log(probs[usable]) / temperature
    peak = logits.max()
    if not np.isfinite(peak):
        return int(np.argmax(probs))

    # Subtracting the maximum keeps exp() from underflowing to an all-zero vector.
    weights = np.exp(logits - peak)
    weights /= weights.sum()
    return int(np.random.choice(len(weights), p=weights))


def generate_text(seed_text, next_words=50, temperature=1.0):
    """Extend ``seed_text`` by sampling words from the trained model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``seq_length``.

    Args:
        seed_text (str): Starting text; it is kept as the prefix of the result.
        next_words (int): Number of sampling steps to perform.
        temperature (float): Sampling temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it. A value <= 0 selects the
            most probable word at every step (greedy decoding).

    Returns:
        str: ``seed_text`` followed by the generated words, space separated.
    """
    # Tokenize the seed once and extend the id list directly, instead of
    # re-tokenizing the whole growing string on every step.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])

    for _ in range(next_words):
        # Keep only the most recent seq_length ids (left-padded when shorter).
        window = pad_sequences([token_ids], maxlen=seq_length, padding='pre')
        predicted_probs = model.predict(window, verbose=0)[0]
        predicted_index = _sample_next_index(predicted_probs, temperature)

        output_word = tokenizer.index_word.get(predicted_index, "")
        if not output_word:
            # No word for this index (degenerate prediction): do not append an
            # empty token, which would only add a stray space.
            continue

        token_ids.append(predicted_index)
        seed_text += " " + output_word

    return seed_text

In [22]:
# Sample 50 words after the seed at temperature 0.7. generate_text draws from
# NumPy's global random generator and no seed is set in the cells shown here,
# so the output differs between runs.
print(generate_text("harry looked at", next_words=50, temperature=0.7))

harry looked at magical arts  dragon 1993 magical studies of magic magic dark dragon the office 1992 ravenclaw the arts  dragon 1993 hufflepuff of magic in the department of magical 21 assistant  pre 1970s  arts 60 august 1960 neville durmstrang male student head gryffindor unknown unknown human  pure blood or half blood black grey dumbledore's army
