In [61]:
# !pip install tensorflow
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import os

In [62]:
# ==========================================
# CONFIGURATION
# ==========================================
# Path to the dataset downloaded from Kaggle
MAX_VOCAB_SIZE = 500  # Limit vocabulary to speed up training for lab purposes
MAX_SEQUENCE_LEN = 20  # Length of sequence window (sliding window size)
EPOCHS = 10
EMBEDDING_DIM = 50
LSTM_UNITS = 100

In [63]:
def load_data_improved():
    """
    Enhanced data loading with better preprocessing for LSTM training
    """
    try:
        # Direct path to CSV
        csv_path = 'PoetryFoundationData.csv'
        
        print(f"Loading dataset from {csv_path}...")
        df = pd.read_csv(csv_path, encoding='utf-8', on_bad_lines='skip')
        
        # The Poetry Foundation dataset has a 'Poem' column
        if 'Poem' in df.columns:
            # Filter out very short poems (less than 50 characters)
            df = df[df['Poem'].str.len() > 50]
            
            # Take a larger subset for better training (1000-2000 poems)
            corpus = df['Poem'].astype(str).dropna().tolist()[:1000]
            
            # Optional: Clean the text
            corpus = [poem.lower().strip() for poem in corpus]
            
            print(f"Loaded {len(corpus)} poems")
            print(f"Average poem length: {np.mean([len(p) for p in corpus]):.0f} characters")
            
            return corpus
        else:
            raise ValueError("'Poem' column not found in dataset")
            
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

corpus_raw = load_data_improved()

Loading dataset from PoetryFoundationData.csv...
Loaded 1000 poems
Average poem length: 1760 characters


In [64]:


# ==========================================
# 2. DATA PREPROCESSING
# ==========================================
print("Preprocessing data...")

# Tokenization
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
tokenizer.fit_on_texts(corpus_raw)

total_words = len(tokenizer.word_index) + 1
print(f"Total unique words: {total_words}")


Preprocessing data...
Total unique words: 35176


In [65]:

# Create Input Sequences (Sliding Window)
input_sequences = []
for line in corpus_raw:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad Sequences
# We pad 'pre' (before) so the model learns from the end of the sequence
input_sequences = np.array(pad_sequences(input_sequences, maxlen=MAX_SEQUENCE_LEN, padding='pre'))

# Create Predictors (X) and Label (y)
# X is everything up to the last word, y is the last word
xs, labels = input_sequences[:,:-1], input_sequences[:,-1]

# One-hot encode the labels (Categorical Crossentropy requirement)
# Keep labels as integers (no one-hot encoding needed)
ys = labels  # Just use the integer labels directly

print(f"Input Shape: {xs.shape}")
print(f"Output Shape: {ys.shape}")


Input Shape: (199734, 19)
Output Shape: (199734,)


In [66]:

# ==========================================
# 3. LSTM MODEL DEVELOPMENT
# ==========================================
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input # Add Input here

print("\nBuilding Model...")

model = Sequential()

# Embedding Layer: Input Dim = Vocab Size, Output Dim = 100
# model.add(Embedding(total_words, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LEN-1))
# # NEW WAY (Fixes Warning)
model.add(Input(shape=(MAX_SEQUENCE_LEN-1,))) # Add this line
model.add(Embedding(total_words, EMBEDDING_DIM)) # Remove input_length here

# LSTM Layer: 100 Units
model.add(LSTM(LSTM_UNITS))

# Dropout Layer: 0.2 to prevent overfitting
model.add(Dropout(0.2))

# Dense Output Layer: Softmax activation for word prediction
model.add(Dense(total_words, activation='softmax'))

model.summary()



Building Model...
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 19, 50)            1758800   
                                                                 
 lstm_4 (LSTM)               (None, 100)               60400     
                                                                 
 dropout_4 (Dropout)         (None, 100)               0         
                                                                 
 dense_4 (Dense)             (None, 35176)             3552776   
                                                                 
Total params: 5371976 (20.49 MB)
Trainable params: 5371976 (20.49 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [69]:

# ==========================================
# 4. TRAINING 
# ==========================================
print("\nCompiling and Training...")
# Use sparse_categorical_crossentropy instead of categorical_crossentropy
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(xs, ys, epochs=1, verbose=1)


Compiling and Training...


In [72]:


# ==========================================
# 5. TEXT GENERATION
# ==========================================
def generate_poem(seed_text, next_words, model, max_sequence_len):
    """
    Generates new text based on a seed text.
    """
    generated_text = seed_text

    print(f"\n--- Generating {next_words} words starting with: '{seed_text}' ---")

    for _ in range(next_words):
        # Tokenize the current text
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # Pad the sequence (must match training padding)
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Predict probabilities
        predicted_probs = model.predict(token_list, verbose=0)

        # Get the class with highest probability
        predicted_index = np.argmax(predicted_probs, axis=-1)[0]

        # Convert index back to word
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break

        # Append word to seed_text for next iteration
        seed_text += " " + output_word
        generated_text += " " + output_word

    return generated_text


In [None]:

print("--- Generated Poem 1 ---")
print(generate_poem("The woods", 10, model, MAX_SEQUENCE_LEN))

print("\n--- Generated Poem 2 ---")
print(generate_poem("And sorry I", 8, model, MAX_SEQUENCE_LEN))

print("\n--- Generated Poem 3 (Custom) ---")
print(generate_poem("I shall be", 12, model, MAX_SEQUENCE_LEN))

--- Generated Poem 1 ---

--- Generating 10 words starting with: 'The woods' ---
The woods of the of the of the of the of the

--- Generated Poem 2 ---

--- Generating 8 words starting with: 'And sorry I' ---
And sorry I have to the of the of the of

--- Generated Poem 3 (Custom) ---

--- Generating 12 words starting with: 'I shall be' ---
  shall be 
