In [1]:
import pinecone
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

In [2]:
import kagglehub

path = kagglehub.dataset_download("michaelarman/poemsdataset")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\semih\.cache\kagglehub\datasets\michaelarman\poemsdataset\versions\1


In [3]:
import os

# Path to your dataset directory
dataset_path = path
combined_file = "combined_poetry.txt"
if combined_file is None:  
    # Combine all files
    with open(combined_file, 'w', encoding='utf-8') as outfile:
        for form in os.listdir(dataset_path):
            form_path = os.path.join(dataset_path, form)
            if os.path.isdir(form_path):
                for topic in os.listdir(form_path):
                    topic_path = os.path.join(form_path, topic)
                    if os.path.isdir(topic_path):
                         for filename in os.listdir(topic_path):
                            file_path = os.path.join(topic_path, filename)
                            if os.path.isfile(file_path):
                                with open(file_path, 'r', encoding='utf-8') as infile:
                                    outfile.write(infile.read() + "\n")
print(f"All files combined into {combined_file}")
print(f"The path to the combined file is: {os.path.abspath(combined_file)}")


All files combined into combined_poetry.txt
The path to the combined file is: C:\Users\semih\PycharmProjects\Poetry Generater\combined_poetry.txt


In [4]:
def is_utf8(word):
    try:
        word.encode('utf-8')
        return True
    except UnicodeEncodeError:
        return False
def is_not_number(word):
    return not word.isdigit()

In [10]:
tokenizer = Tokenizer()

with open(combined_file,'r', encoding='utf-8') as file:
    data = file.read()

corpus = data.lower().split("\n")

# Remove the words that are not in utf-8 format
filtered_corpus = [word for word in corpus if is_utf8(word) and is_not_number(word)]

# Take a quarter of the words from the filtered_corpus
quarter_length = len(filtered_corpus) // 50
filtered_corpus = filtered_corpus[:quarter_length]

tokenizer.fit_on_texts(filtered_corpus)
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)

{'the': 1, 'and': 2, 'of': 3, 'to': 4, 'a': 5, 'in': 6, 'is': 7, 'i': 8, 'you': 9, 'for': 10, 'with': 11, 'that': 12, 'my': 13, 'it': 14, 'as': 15, 'all': 16, 'on': 17, 'are': 18, 'be': 19, 'from': 20, 'your': 21, 'love': 22, 'or': 23, 'me': 24, 'not': 25, 'no': 26, 'by': 27, 'but': 28, 'we': 29, 'one': 30, 'his': 31, 'like': 32, 'this': 33, 'he': 34, 'so': 35, 'was': 36, 'an': 37, 'time': 38, 'will': 39, 'they': 40, 'their': 41, 'there': 42, 'at': 43, 'have': 44, 'her': 45, 'when': 46, 'our': 47, 'life': 48, 'what': 49, 'if': 50, 'can': 51, 'she': 52, 'through': 53, 'each': 54, 'who': 55, 'which': 56, 'out': 57, 'do': 58, 'up': 59, 'only': 60, 'its': 61, 'more': 62, 'see': 63, 'some': 64, 'day': 65, 'then': 66, 'world': 67, 'were': 68, 'never': 69, 'copyright': 70, 'us': 71, 'just': 72, 'now': 73, 'heart': 74, 'may': 75, 'mind': 76, 'every': 77, '©': 78, 'reza': 79, 'muzahidul': 80, 'where': 81, 'them': 82, 'into': 83, 'has': 84, 'too': 85, 'here': 86, 'him': 87, 'how': 88, 'know': 89

In [17]:
from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone
pc = Pinecone(api_key="pcsk_2AvKN5_hxPeJgDE1s93CGAXda1AFHm8JZTZDqGGz2PE9PS3THkYSykG1UzYcLfe32LXf8")

# List existing indexes
existing_indexes = pc.list_indexes().names()
print("Existing indexes:", existing_indexes)

# Connect to the existing 'poetry' index
index_name = "poetry"
index = pc.Index(index_name)

Existing indexes: ['medical-chatbot', 'poetries', 'poetry']


In [12]:
input_sequences = []

for line in filtered_corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Determine the maximum sequence length
max_sequence_len = max([len(x) for x in input_sequences])

# Process and upsert the data in smaller batches
batch_size = 1000
for start in range(0, len(input_sequences), batch_size):
    end = start + batch_size
    batch_sequences = input_sequences[start:end]
    
    # Pad sequences
    batch_sequences = pad_sequences(batch_sequences, maxlen=max_sequence_len, padding='pre')
    
    # Create predictors and labels
    xs = batch_sequences[:, :-1]
    labels = batch_sequences[:, -1]
    
    # Convert labels to categorical
    ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
    
# Train the model to create embeddings
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(xs, ys, epochs=20, verbose=1)
    
# Save the trained model
model.save("poetry_generator.h5")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:
# Extract embeddings
embedding_layer = model.layers[0]
embeddings = embedding_layer.get_weights()[0]
    
# Convert xs to list of lists and ensure values are floats
xs_list = xs.astype(float).tolist()

# Upsert the word vectors
vectors = [(str(i), vec) for i, vec in enumerate(xs_list)]
index.upsert(vectors=vectors)

print("Words have been saved to the 'poetry' index in Pinecone.")


Words have been saved to the 'poetry' index in Pinecone.


In [19]:
# Predict the next word
def predict_next_word(seed_text, next_words=1, line_length=10):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_class = np.argmax(predicted, axis=-1)[0]

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_class:
                output_word = word
                break
        seed_text += " " + output_word
        
        # Insert a line break after every 'line_length' words
        if len(seed_text.split()) % line_length == 0:
            seed_text += "\n"
            
    return seed_text

In [24]:
# Generate the next words
seed_text1 = "I've got a bad feeling about this"
seed_text2 = "You are my sunshine, my only sunshine" 
next_words = 100
line_length = 10

generated_text = predict_next_word(seed_text1, next_words, line_length)
print(generated_text)
generated_text = predict_next_word(seed_text2, next_words, line_length)
print(generated_text)

I've got a bad feeling about this serenade survive beat
 the light would show if it could harden into sight
 a fake measure the light would show if it could
 harden your brain down again measure by although this song
 light sway occasional voice carressing stack stack practice stack fling
 lonely wish a few will song a startling voice of
 dull trays were enough are although undone again lose rings
 dyes lies sung light lies will pluck evokes images so
 carressing fake lonely world unknown were turnips – were evokes
 images tinge dawn stalactite light would show her carressing fake
 lonely lonely lonely world sway mind bleary
You are my sunshine, my only sunshine profound on the
 light we borrowed today the light would show if it
 could harden into sight a fake measure the light would
 show if it could harden your brain down again at
 song a different voice before we could system tinge wish
 to reprint a faint color from the heart beat towards
 sunrise evokes lost sunrise ragged hour ra