In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

# PoetryFoundationData

In [2]:
# PoetryFoundationData.csv
df_PoetryFoundationData = pd.read_csv("../data_raw/PoetryFoundationData.csv")

df_PoetryFoundationData.drop(columns=["Title", "Poet", "Unnamed: 0"], inplace=True)

df_PoetryFoundationData.rename(columns={"Poem": "poem", "Tags": "labels"}, inplace=True)

df_PoetryFoundationData = df_PoetryFoundationData[df_PoetryFoundationData["poem"].str.len() > 30]
df_PoetryFoundationData = df_PoetryFoundationData[~df_PoetryFoundationData["poem"].str.contains("Dear Writers")]
df_PoetryFoundationData.reset_index(inplace=True, drop=True)

df_PoetryFoundationData_classification = df_PoetryFoundationData.dropna(ignore_index=True)

In [3]:
df_PoetryFoundationData_classification

Unnamed: 0,poem,labels
0,\r\r\nInvisible fish swim this ghost ocean now...,"Living,Time & Brevity,Relationships,Family & A..."
1,\r\r\nDon’t bother the earth spirit who lives ...,"Religion,The Spiritual,Mythology & Folklore,Fa..."
2,"\r\r\nHour in which I consider hydrangea, a sa...","Living,Parenthood,The Body,The Mind,Nature,Tre..."
3,\r\r\nmy father’s body is a map\r\r\na record ...,"The Body,Family & Ancestors"
4,\r\r\nit has long been forgotten this practice...,"Infancy,Parenthood,The Body"
...,...,...
12841,\r\r\nI \r\r\nThe spider expects the cold of w...,"Nature,Animals,Seas, Rivers, & Streams,Summer,..."
12842,\r\r\n Philosophic\r\r\nin its comple...,"Arts & Sciences,Philosophy"
12843,\r\r\nThe Wise Men will unlearn your name.\r\r...,"Living,Death,Growing Old,Time & Brevity,Nature..."
12844,\r\r\nWe'd like to talk with you about ...,"Living,Social Commentaries,Popular Culture"


In [14]:
def split_and_duplicate(row):
    labels = row['labels'].replace(' & ', ',').split(',')
    labels = [label.lower() for label in labels]
    texts = [row['poem']] * len(labels)
    return pd.DataFrame({'poem': texts, 'labels': labels})
    
print(len(df_PoetryFoundationData_classification))
# Apply the function to the DataFrame
new_df = pd.concat([split_and_duplicate(row) for _, row in df_PoetryFoundationData_classification.iterrows()], ignore_index=True)
new_df = new_df[new_df['labels'] != '']
print(len(new_df))

12846
91806


In [16]:
np.unique(new_df["labels"])

array([' heavens', ' lesbian', ' planets', ' queer', ' rivers',
       'activities', 'ancestors', 'animals', 'anniversary', 'apologies',
       'architecture', 'arts', 'birth', 'birthdays', 'books', 'break-ups',
       'brevity', 'buddhism', 'celebrations', 'christianity', 'christmas',
       'cinco de mayo', 'cities', 'class', 'classic love',
       'coming of age', 'companionship', 'complicated', 'conflict',
       'country life', 'crime', 'crushes', 'dance', 'death', 'design',
       'desire', 'disappointment', 'divorce', 'doubt', 'drinking',
       'easter', 'eating', 'economics', 'enemies', 'engagement',
       'ethnicity', 'failure', 'fairy-tales', 'faith', 'fall', 'family',
       'farewells', "father's day", 'film', 'first love', 'flowers',
       'folklore', 'friends', 'funerals', 'gardening', 'gay', 'gender',
       'get well', 'ghosts', 'god', 'good luck', 'graduation',
       'gratitude', 'greek', 'grieving', 'growing old', 'halloween',
       'hanukkah', 'health', 'heartac

In [21]:
label_dict = {
    "sexuality": [" lesbian", " queer", "gay", "gender"],
    "religion": ["christianity"],
    "romantic": ["valentine's day","vexed love", "weddings","marriage", "first love", "crushes"],
    "nature": ["seas", " rivers"],
    "heaven": [" heavens"],
    "planets": [" planets"],
    "poetry": ["poets"],
    "calendar events": ["christmas", "cinco de mayo", "easter", "halloween", "hanukkah", "kwanzaa", "labor day", "memorial day", "new year", "ramadan", "september 11th", "st. patrick's day", "thanksgiving", "yom kippur", "mother's day", "father's day"],
    
}

In [22]:
def replace_with_key(value):
    for key, values in label_dict.items():
        if value in values:
            return key
    return value

# Apply the function to the DataFrame
new_df['labels'] = new_df['labels'].apply(replace_with_key)
df_PoetryFoundationData = new_df.copy()
np.unique(df_PoetryFoundationData["labels"], return_counts=True)

(array(['activities', 'ancestors', 'animals', 'anniversary', 'apologies',
        'architecture', 'arts', 'birth', 'birthdays', 'books', 'break-ups',
        'brevity', 'buddhism', 'calendar events', 'celebrations', 'cities',
        'class', 'classic love', 'coming of age', 'companionship',
        'complicated', 'conflict', 'country life', 'crime', 'dance',
        'death', 'design', 'desire', 'disappointment', 'divorce', 'doubt',
        'drinking', 'eating', 'economics', 'enemies', 'engagement',
        'ethnicity', 'failure', 'fairy-tales', 'faith', 'fall', 'family',
        'farewells', 'film', 'flowers', 'folklore', 'friends', 'funerals',
        'gardening', 'get well', 'ghosts', 'god', 'good luck',
        'graduation', 'gratitude', 'greek', 'grieving', 'growing old',
        'health', 'heartache', 'heaven', 'heroes', 'history', 'home life',
        'horror', 'humor', 'illness', 'independence day',
        'indoor activities', 'infancy', 'infatuation', 'islam', 'jobs',
       

# TOPICS

In [24]:
topics = os.listdir("../data_raw/topics")
df_topics_list = []
for topic in topics:
    files = os.listdir(f"../data_raw/topics/{topic}")
    df_topic = pd.DataFrame(columns=["poem", "labels"])
    i = 0
    for filename in files:
        with open(f"../data_raw/topics/{topic}/{filename}", encoding="utf8") as f:
            df_topic.loc[i] = {"poem": f.read(), "labels": topic}
        i += 1
    df_topics_list.append(df_topic)
df_topics = pd.concat(df_topics_list, ignore_index=True)

In [25]:
np.unique(df_topics["labels"], return_counts=True)

(array(['alone', 'america', 'angel', 'anger', 'animal', 'baby', 'beach',
        'beautiful', 'beauty', 'believe', 'birth', 'brother', 'butterfly',
        'car', 'carpe diem', 'change', 'chicago', 'childhood', 'children',
        'christmas', 'cinderella', 'city', 'courage', 'crazy', 'culture',
        'dance', 'dark', 'daughter', 'death', 'depression', 'despair',
        'destiny', 'dream', 'evil', 'faith', 'family', 'father', 'fear',
        'fire', 'food', 'football', 'freedom', 'friend', 'frog', 'funeral',
        'funny', 'future', 'girl', 'god', 'graduation', 'greed', 'green',
        'hair', 'happiness', 'happy', 'hate', 'heaven', 'hero', 'home',
        'hope', 'house', 'hunting', 'husband', 'identity', 'innocence',
        'january', 'joy', 'june', 'justice', 'kiss', 'laughter', 'life',
        'lonely', 'loss', 'lost', 'love', 'lust', 'marriage', 'memory',
        'mirror', 'money', 'moon', 'mother', 'murder', 'music', 'nature',
        'night', 'ocean', 'paris', 'passion', 

In [26]:
df_topics[df_topics["labels"] == 'house']["poem"].iloc[1]

"One, from his high bright window in a tower,\nLeans out, as evening falls,\nAnd sees the advancing curtain of the shower\nSplashing its silver on roofs and walls:\nSees how, swift as a shadow, it crosses the city,\nAnd murmurs beyond far walls to the sea,\nLeaving a glimmer of water in the dark canyons,\nAnd silver falling from eave and tree.\nOne, from his high bright window, looking down,\nPeers like a dreamer over the rain-bright town,\nAnd thinks its towers are like a dream.\nThe western windows flame in the sun's last flare,\nPale roofs begin to gleam.\nLooking down from a window high in a wall\nHe sees us all;\nLifting our pallid faces towards the rain,\nSearching the sky, and going our ways again,\nStanding in doorways, waiting under the trees . . .\nThere, in the high bright window he dreams, and sees\nWhat we are blind to,—we who mass and crowd\nFrom wall to wall in the darkening of a cloud.\nThe gulls drift slowly above the city of towers,\nOver the roofs to the darkening se

In [27]:
label_dict = {
    "alone": ["lonely"],
    "war": ["soldier"],
    "animal": ["butterfly","frog", "snake"],
    "nature": ["beach", "ocean", "river", "weather", "rain"],
    "music": ["song"],
    "romantic": ["romance", "marriage", "husband", "wedding", "kiss"],
    "poetry": ["poem"],
    "future": ["destiny", "carpe diem"],
    "home": ["house"],
    "sport": ["football", "swimming", "running" ]
    
}

In [28]:
def replace_with_key(value):
    for key, values in label_dict.items():
        if value in values:
            return key
    return value

# Apply the function to the DataFrame
df_topics['labels'] = df_topics['labels'].apply(replace_with_key)
np.unique(df_topics["labels"], return_counts=True)

(array(['alone', 'america', 'angel', 'anger', 'animal', 'baby',
        'beautiful', 'beauty', 'believe', 'birth', 'brother', 'car',
        'change', 'chicago', 'childhood', 'children', 'christmas',
        'cinderella', 'city', 'courage', 'crazy', 'culture', 'dance',
        'dark', 'daughter', 'death', 'depression', 'despair', 'dream',
        'evil', 'faith', 'family', 'father', 'fear', 'fire', 'food',
        'freedom', 'friend', 'funeral', 'funny', 'future', 'girl', 'god',
        'graduation', 'greed', 'green', 'hair', 'happiness', 'happy',
        'hate', 'heaven', 'hero', 'home', 'hope', 'hunting', 'identity',
        'innocence', 'january', 'joy', 'june', 'justice', 'laughter',
        'life', 'loss', 'lost', 'love', 'lust', 'memory', 'mirror',
        'money', 'moon', 'mother', 'murder', 'music', 'nature', 'night',
        'paris', 'passion', 'peace', 'pink', 'poetry', 'poverty', 'power',
        'racism', 'rainbow', 'red', 'remember', 'respect', 'romantic',
        'rose', 

# MERGE

In [29]:
print(len(df_topics))
print(len(df_PoetryFoundationData))

14335
91806


In [30]:
df_merged = pd.concat([df_topics, df_PoetryFoundationData])
df_merged

Unnamed: 0,poem,labels
0,Though I have watched so many mourners weep\nO...,hope
1,Hope was but a timid friend;\nShe sat without ...,hope
2,All hope lies in tomorrow\nBetrayed by yesterd...,hope
3,Don't give up hope\nDon't give up hope\nThere ...,hope
4,Just remember that hope goes a long way.\nAs l...,hope
...,...,...
92368,\r\r\nWe'd like to talk with you about ...,social commentaries
92369,\r\r\nWe'd like to talk with you about ...,popular culture
92370,\r\r\n Philosophic\r\r\nin its comple...,arts
92371,\r\r\n Philosophic\r\r\nin its comple...,sciences
