# Student Name: Qadeer Hussain

# Student ID: C00270632

# Topic: Recurrent Neural Network (RNN) 

# Last Modified: 21/03/2025

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense




# Load the data

Link to data: https://www.kaggle.com/datasets/johnhallman/complete-poetryfoundationorg-dataset

In [2]:
# Read the poem dataset CSV (path is relative to the working directory)
csv_path = 'kaggle_poem_dataset.csv'
data = pd.read_csv(csv_path)

# Explore the dataset

In [3]:
# Dimensions of the frame as (rows, columns) and the labels of its columns
shape, columns = data.shape, data.columns
print("Dataset:", shape)
print("Column Names:", columns)

# Preview the first five rows
data.head()

Dataset: (15652, 5)
Column Names: Index(['Unnamed: 0', 'Author', 'Title', 'Poetry Foundation ID', 'Content'], dtype='object')


Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [4]:
# Count the missing values in each column
null_counts = data.isna().sum()
print(null_counts)

Unnamed: 0              0
Author                  0
Title                   1
Poetry Foundation ID    0
Content                 0
dtype: int64


In [5]:
# Keep only the 'Content' column and drop any rows where it is missing
data = data.loc[:, ['Content']].dropna()

# Data Preprocessing

In [6]:
# Build one lower-cased corpus string: every row of 'Content' joined by a
# single space.
# Whitespace is not normalised here, so line breaks and repeated spaces
# inside the poems remain part of the corpus.
text = " ".join(data['Content']).lower()

In [7]:
# Show the first 1000 characters of the corpus and its total length
preview = text[:1000]
print("Sample text:\n", preview)
print("\nTotal Characters in Corpus:", len(text))

Sample text:
 dear writers, i’m compiling the first in what i hope is a series of publications i’m calling artists among artists. the theme for issue 1 is “faggot dinosaur.” i hope to hear from you! thank you and best wishes. philosophic
in its complex, ovoid emptiness,
a skillful pundit coined it as a sort
of stopgap doorstop for those
quaint equations

romans never
dreamt of. in form completely clever
and discrete—a mirror come unsilvered,
loose watch face without the works,
a hollowed globe

from tip to toe
unbroken, it evades the grappling
hooks of mass, tilts the thin rim of no thing,
remains embryonic sum,
non-cogito. we'd  like  to  talk  with  you  about  fear they  said  so
many  people  live  in  fear  these  days  they  drove  up
all  four  of  them  in  a  small  car nice   boy  they  said
beautiful  dogs they  said  so  friendly  the  man  ahead
of  the  woman  the other  two  waiting  in  the  drive  i
was  outside digging up the garden no one home i said
what   are  you 

In [8]:
# Vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))

# Lookup tables in both directions between characters and integer indices;
# a character's index is its position in `chars`
index_to_char = dict(enumerate(chars))
char_to_index = {char: idx for idx, char in index_to_char.items()}

In [9]:
# Vocabulary size and the first 100 (character, index) pairs of the mapping
mapping_preview = list(char_to_index.items())[:100]
print(f"Unique characters: {len(chars)}")
print(f"Sample character mappings: {mapping_preview}")

Unique characters: 249
Sample character mappings: [('\n', 0), (' ', 1), ('!', 2), ('"', 3), ('#', 4), ('$', 5), ('%', 6), ('&', 7), ("'", 8), ('(', 9), (')', 10), ('*', 11), ('+', 12), (',', 13), ('-', 14), ('.', 15), ('/', 16), ('0', 17), ('1', 18), ('2', 19), ('3', 20), ('4', 21), ('5', 22), ('6', 23), ('7', 24), ('8', 25), ('9', 26), (':', 27), (';', 28), ('=', 29), ('?', 30), ('@', 31), ('[', 32), ('\\', 33), (']', 34), ('^', 35), ('_', 36), ('`', 37), ('a', 38), ('b', 39), ('c', 40), ('d', 41), ('e', 42), ('f', 43), ('g', 44), ('h', 45), ('i', 46), ('j', 47), ('k', 48), ('l', 49), ('m', 50), ('n', 51), ('o', 52), ('p', 53), ('q', 54), ('r', 55), ('s', 56), ('t', 57), ('u', 58), ('v', 59), ('w', 60), ('x', 61), ('y', 62), ('z', 63), ('{', 64), ('|', 65), ('}', 66), ('~', 67), ('\xa0', 68), ('¡', 69), ('¢', 70), ('£', 71), ('¤', 72), ('§', 73), ('«', 74), ('®', 75), ('°', 76), ('´', 77), ('·', 78), ('»', 79), ('¼', 80), ('½', 81), ('¿', 82), ('×', 83), ('ß', 84), ('à', 85), ('á', 86

In [10]:
# SEQ_LENGTH:  number of characters in each input window
# SUBSET_SIZE: upper bound on how many sequences are encoded for training
SEQ_LENGTH, SUBSET_SIZE = 100, 100_000

In [11]:
# Slide a SEQ_LENGTH-character window over the corpus: each window is an
# input and the character immediately after it is the target.
#
# Only the first SUBSET_SIZE windows are consumed by the encoding step, so
# only those are materialised. Building every window would create tens of
# millions of 100-character strings that are never read.
# NOTE(review): if a later cell needs windows beyond SUBSET_SIZE, raise
# SUBSET_SIZE rather than rebuilding the full list.
total_sequences = max(len(text) - SEQ_LENGTH, 0)
n_sequences = min(SUBSET_SIZE, total_sequences)

inputs = [text[i : i + SEQ_LENGTH] for i in range(n_sequences)]
outputs = [text[i + SEQ_LENGTH] for i in range(n_sequences)]

# Number of windows available in the whole corpus (not just those built)
print(f"Total sequences: {total_sequences}")

Total sequences: 22286692


In [12]:
# Never ask for more sequences than were actually generated
subset_size = min(len(inputs), SUBSET_SIZE)

In [13]:
# Encode the first SUBSET_SIZE windows and their target characters as
# integer indices using the character -> index lookup table
X = np.array(
    [list(map(char_to_index.__getitem__, window)) for window in inputs[:SUBSET_SIZE]]
)
y = np.array(list(map(char_to_index.__getitem__, outputs[:SUBSET_SIZE])))

In [14]:
# Report the number of sequences that were actually encoded. This can be
# smaller than SUBSET_SIZE when the corpus yields fewer windows, so it is
# read from the array itself rather than from the configured constant.
print(f"Using {len(X)} sequences for training")
print(f"X shape: {X.shape}, y shape: {y.shape}")

Using 100000 sequences for training
X shape: (100000, 100), y shape: (100000,)


In [15]:
# One-hot encode the integer indices over the character vocabulary.
# The cell rebinds X and y, so the rank checks make it safe to re-run:
# arrays that are already one-hot encoded (X with 3 dims, y with 2 dims)
# are left untouched instead of being encoded a second time.
num_classes = len(chars)
if X.ndim < 3:
    X = to_categorical(X, num_classes=num_classes)
if y.ndim < 2:
    y = to_categorical(y, num_classes=num_classes)

In [16]:
# Shapes of the arrays after one-hot encoding
x_shape, y_shape = X.shape, y.shape
print(f"New x shape (one-hot encoded): {x_shape}")
print(f"New y shape (one-hot encoded): {y_shape}")

New x shape (one-hot encoded): (100000, 100, 249)
New y shape (one-hot encoded): (100000, 249)


# Build the model

In [17]:
# Character-level model: two stacked 256-unit LSTM layers followed by a
# softmax over the vocabulary. The first LSTM returns its full output
# sequence so the second LSTM can consume it.
vocab_size = len(chars)

model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=(SEQ_LENGTH, vocab_size)))
model.add(LSTM(256))
model.add(Dense(vocab_size, activation="softmax"))

# Targets are one-hot vectors, hence categorical cross-entropy
model.compile(optimizer="adam", loss="categorical_crossentropy")

# Show the layer-by-layer summary
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 100, 256)          518144    
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 249)               63993     
                                                                 
Total params: 1107449 (4.22 MB)
Trainable params: 1107449 (4.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Train the model

In [18]:
# Training settings passed to model.fit: number of epochs and batch size
EPOCHS, BATCH_SIZE = 5, 128

In [19]:
# Train on the encoded sequences; the value returned by fit() is stored
# in `history`
history = model.fit(x=X, y=y, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/5

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
