In [1]:
#import pandas to work with the dataset
import pandas as pd
#import defaultdict to build the dictionary of all the words in the poems
from collections import defaultdict
import random

In [2]:
# The poems dataset comes from https://www.kaggle.com/datasets/tgdivy/poetry-foundation-poems
# Read the CSV (looked up relative to the current working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="PoetryFoundationData.csv")

In [3]:
# Preview the first rows to check that the columns (Title, Poem, Poet, Tags) loaded as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,
1,1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,
2,2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,
3,3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,
4,4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,


In [4]:
# Number of poems in the dataset (one row of the Poem column per poem)
len(df["Poem"])

13854

In [5]:
# Pull the Poem column out of the DataFrame as a plain Python list
poems = df["Poem"].tolist()

In [6]:
# Sanity check: confirm that the Poem column was converted to a plain Python list.
print(type(poems))

<class 'list'>


In [7]:
#concatenate all poems into one string, skipping entries that consist solely of digits
#(digits inside a poem are kept). Non-string entries (e.g. NaN for a missing poem) are
#skipped as well; calling .isdigit() on them would raise AttributeError.
poems_str = ''.join(e for e in poems if isinstance(e, str) and not e.isdigit())

In [8]:
# Sanity check: confirm that the poems were joined into a single string.
print(type(poems_str))

<class 'str'>


In [9]:
#build a Markov chain function
def markov_chain(text):
    '''Build a word-level transition table from a string of text.

    The text is split on single space characters only, so punctuation and
    line-break characters stay attached to their tokens, and consecutive
    spaces produce empty-string tokens.

    Returns a plain dict mapping each token to the list of tokens that
    immediately follow it in the text (duplicates kept, in order of
    appearance). A token that only occurs at the very end of the text has
    no entry of its own.
    '''
    tokens = text.split(' ')

    #walk over every adjacent pair and record the successor under its predecessor
    followers = {}
    for idx in range(len(tokens) - 1):
        followers.setdefault(tokens[idx], []).append(tokens[idx + 1])

    return followers

In [10]:
#build the word -> next-words lookup from the combined poem text
poems_dict = markov_chain(text=poems_str)

In [11]:
#build a generator function with parameters (the dictionary we created and the number of words we want to generate)
def generate_poem(chain, count=50):
    '''Generate a pseudo-random "poem" by walking a word transition table.

    Parameters:
        chain: dict mapping a word to the list of words that may follow it.
        count: number of words to generate (default 50). Values below 1
               still yield the single starting word.

    Returns:
        The generated words joined by single spaces, with the first word
        capitalized and a period appended at the end.

    Raises:
        IndexError: if chain is empty (random.choice has nothing to pick).
    '''
    #list the keys once; reused for the start word and for restarts
    start_words = list(chain.keys())

    #choose the first word randomly and capitalize it
    word1 = random.choice(start_words)
    words = [word1.capitalize()]

    #pick each following word randomly from the successors of the current one
    for _ in range(count - 1):
        followers = chain.get(word1)
        if followers:
            word1 = random.choice(followers)
        else:
            #dead end: a word with no recorded successor is not a key in the
            #chain, so indexing it would raise KeyError. Restart from a
            #random key instead.
            word1 = random.choice(start_words)
        words.append(word1)

    #end with a period
    return ' '.join(words) + '.'

In [12]:
#create a function to format the generated poem to get rid of unnecessary line breaks and spaces
def format(text):
    '''Tidy generated text by removing carriage returns and repeated spaces.

    Every carriage return ("\\r") is turned into a space, then every run of
    spaces is collapsed to a single space. Newline characters ("\\n") are
    left in place so the poem keeps its line breaks.

    Parameters:
        text: the string to clean up.

    Returns:
        The cleaned-up string.

    Note: this function shadows the built-in format() for the rest of the
    notebook; the name is kept because the later cell calls it as format(poem).
    '''
    text = text.replace("\r", " ")
    #a single replace pass halves a run of spaces at best, so repeat until no
    #double space is left
    while "  " in text:
        text = text.replace("  ", " ")
    return text

In [13]:
#generate a poem of the default length from the poems dictionary
poem = generate_poem(chain=poems_dict)

In [14]:
#clean up carriage returns and extra spaces, then display the result
poem = format(text=poem)
poem

'Bird-shaped block, pure a lovely loves, Each day \nyour thumbs smoothed my old times, I’d be still. \nThe room away, or sink, And I would be known: But mark that purged of some \nprivate Soviet \nUnion you at its knowledge you sleep \nDepartment of ice, to always think of gloom: What I grow \nyoung, my sister.'

In [15]:
#save the poem in a .txt file
#the with-statement closes the file on exit, so no explicit close() is needed
with open("sample_poem.txt", "w", encoding='utf8') as f:
    f.write(poem)