In [2]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


In [3]:
# Load the dataset
data = pd.read_csv('Dataset.csv')

# Display the first few rows
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13854 entries, 0 to 13853
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13854 non-null  int64 
 1   Title       13854 non-null  object
 2   Poem        13854 non-null  object
 3   Poet        13854 non-null  object
 4   Tags        12899 non-null  object
dtypes: int64(1), object(4)
memory usage: 541.3+ KB


In [4]:
data = data.dropna(subset=['Poem'])
data = data[['Poem']]

# Limit to 5000 poems
data = data.head(1000)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Poem    1000 non-null   object
dtypes: object(1)
memory usage: 7.9+ KB


In [5]:
data["Poem"] = data["Poem"].fillna("").str.replace(r'\r\r\n', ' ', regex=True).str.strip()

# Concatenate all poems into a single text corpus
corpus = "\n".join(data['Poem'].tolist())
print(f"Total characters in reduced corpus: {len(corpus)}")

Total characters in reduced corpus: 1679220


In [6]:
# Clean and preprocess text
corpus = corpus.lower()
corpus = re.sub(r"[^a-zA-Z0-9\s]", "", corpus)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
total_words = len(tokenizer.word_index) + 1

# Convert text into sequences of words
input_sequences = []
words = corpus.split()
n = 5  # Sequence length
for i in range(n, len(words)):
    sequence = words[i - n:i + 1]
    tokenized_sequence = tokenizer.texts_to_sequences([" ".join(sequence)])[0]
    input_sequences.append(tokenized_sequence)

# Pad sequences to ensure equal lengths
padded_sequences = pad_sequences(input_sequences, padding='pre')

# Split into features (X) and target (y)
X, y = padded_sequences[:, :-1], padded_sequences[:, -1]

# One-hot encode the target variable
y = pd.get_dummies(y).values

print(f"Vocabulary size: {total_words}")
print(f"Input shape: {X.shape}, Target shape: {y.shape}")


Vocabulary size: 30785
Input shape: (290222, 5), Target shape: (290222, 30545)


In [7]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=100, input_length=X.shape[1]))
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()




In [8]:
# Train the model
history = model.fit(X, y, epochs=10, batch_size=64, verbose=1)


Epoch 1/10


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 30545), output.shape=(None, 30785)

In [None]:
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Generate poetry
seed_text = "the moon rises"
next_words = 20
print(generate_text(seed_text, next_words, model, tokenizer, X.shape[1]))
