# LSTM Text Generation

In [None]:
import nltk
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding, TimeDistributed, Activation
import string

In [None]:
# Detect GPUs and enable memory growth on every one of them.
# Memory growth has to be configured before TensorFlow initializes the GPUs,
# so detection uses list_physical_devices(), which only enumerates hardware,
# instead of a call that may bring the devices up first.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    print('GPU found')
    # Apply the same setting to all visible GPUs, not just the first one.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print('No GPU found')

In [None]:
# Read the whole corpus into memory; the context manager closes the file
# handle as soon as the read completes instead of leaving it to the GC.
with open('scifi/internet_archive_scifi_v3.txt', 'r') as corpus_file:
    input_text = corpus_file.read()

In [None]:
# Keep only a slice of the corpus (characters 3187 up to 700000); the full
# text is too large to work with here.
input_text = input_text[3187:700000]

In [None]:
# Preprocess the text data: lower-case, strip ASCII punctuation, then split
# on whitespace into a flat list of word tokens.
text = input_text.lower().translate(str.maketrans('', '', string.punctuation))
words = text.split()

# Build the vocabulary once and sort it so every word gets the same integer
# id on every run. Iterating a raw set() depends on string hashing, which
# varies between interpreter sessions, so ids would otherwise not line up
# with weights or mappings saved from an earlier session.
vocabulary = sorted(set(words))

# Two mappings derived from the same ordered vocabulary, so they are
# guaranteed to be exact inverses of each other.
word_to_index = {word: index for index, word in enumerate(vocabulary)}
index_to_word = {index: word for index, word in enumerate(vocabulary)}

In [None]:
# Slide a window of seq_length words over the corpus; each window is one
# training input and the word immediately after it is the target.
seq_length = 10
step = 1
window_starts = range(0, len(words) - seq_length, step)
sequences = [words[start:start + seq_length] for start in window_starts]
next_words = [words[start + seq_length] for start in window_starts]

# Dense one-hot buffers, filled in later. X has one vocabulary-sized row per
# word position in every window, y one row per target word, so memory grows
# with len(sequences) * seq_length * vocabulary size.
# The builtin bool is used as dtype: the np.bool alias was removed from NumPy
# and raises AttributeError on current releases.
X = np.zeros((len(sequences), seq_length, len(word_to_index)), dtype=bool)
y = np.zeros((len(next_words), len(word_to_index)), dtype=bool)

In [None]:
# One-hot encode the training data: for every window mark the vocabulary id
# of each word at its position in X, and the id of the following word in y.
for row, (window, target) in enumerate(zip(sequences, next_words)):
    for position, token in enumerate(window):
        X[row, position, word_to_index[token]] = True
    y[row, word_to_index[target]] = True

In [None]:
# Build the network: a single 128-unit LSTM reads the one-hot encoded window,
# and a softmax layer produces a distribution over the vocabulary.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(seq_length, len(word_to_index))),
    tf.keras.layers.Dense(len(word_to_index), activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Fit the network on the one-hot windows (X) and their next-word targets (y):
# 100 passes over the data in mini-batches of 64 samples.
model.fit(x=X, y=y, batch_size=64, epochs=100)

In [None]:
def generate_text(input_text, prediction_length):
    """Extend a seed phrase with words predicted by the trained model.

    The seed is lower-cased and stripped of punctuation, the same cleaning
    applied to the training text. At every step the most recent
    ``seq_length`` words are one-hot encoded, ``model`` predicts a
    distribution over the vocabulary, and the most probable word (greedy
    argmax) is appended.

    Seeds of any length are accepted: longer seeds are truncated to their
    last ``seq_length`` words, shorter ones are left-padded with all-zero
    timesteps and the context grows as words are generated. Seed words that
    are not in the vocabulary are encoded as all-zero vectors rather than
    raising ``KeyError``.

    Args:
        input_text: Seed phrase to continue.
        prediction_length: Number of words to generate.

    Returns:
        The cleaned seed followed by the generated words, joined by single
        spaces.
    """
    seed_text = input_text.lower().translate(str.maketrans('', '', string.punctuation))
    vocab_size = len(word_to_index)

    context = seed_text.split()
    generated_words = []
    for _ in range(prediction_length):
        # Only the last seq_length words fit into the model input; slicing
        # also keeps seeds longer than seq_length from overflowing the array.
        window = context[-seq_length:]

        # Right-align the window so the newest word sits on the final
        # timestep, directly before the prediction, as it did in training.
        x = np.zeros((1, seq_length, vocab_size), dtype=bool)
        offset = seq_length - len(window)
        for position, word in enumerate(window):
            word_index = word_to_index.get(word)
            if word_index is not None:  # unknown words stay all-zero
                x[0, offset + position, word_index] = 1

        # Predict the next word and pick the most probable one.
        prediction = model.predict(x, verbose=0)[0]
        next_word = index_to_word[int(np.argmax(prediction))]

        # Append instead of dropping the oldest word, so a short seed's
        # context can grow up to the full seq_length window.
        generated_words.append(next_word)
        context.append(next_word)

    return ' '.join([seed_text] + generated_words)

In [None]:
# Example: continue a short seed phrase with 20 predicted words.
generate_text('In the kitchen he', prediction_length=20)