In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import numpy as np
import pandas as pd
import nltk
import sklearn
import string

In [6]:
df_path = '/content/drive/MyDrive/poems_dataset.csv'

In [7]:
df = pd.read_csv(df_path, nrows = None)
df.dataframeName = 'poems_dataset.csv'
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')


There are 144123 rows and 6 columns


In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,source,hash
0,0,fishing boats,colors of,the rainbow,tempslibres,FISHINGBOATSCOLORSOFTHERAINBOW
1,1,ash wednesday--,trying to remember,my dream,tempslibres,ASHWEDNESDAYTRYINGTOREMEMBERMYDREAM
2,2,snowy morn--,pouring another cup,of black coffee,tempslibres,SNOWYMORNPOURINGANOTHERCUPOFBLACKCOFFEE
3,3,shortest day,flames dance,in the oven,tempslibres,SHORTESTDAYFLAMESDANCEINTHEOVEN
4,4,haze,half the horse hidden,behind the house,tempslibres,HAZEHALFTHEHORSEHIDDENBEHINDTHEHOUSE


In [9]:

# pre-process, drop the first column
df = df.drop(columns=df.columns[0]) # remove unnecessary column
df.drop(index=df.index[60000:],
        axis=0,
        inplace=True)
df.head(), df.shape

(                 0                      1                 2       source  \
 0    fishing boats              colors of       the rainbow  tempslibres   
 1  ash wednesday--    trying to remember           my dream  tempslibres   
 2     snowy morn--    pouring another cup   of black coffee  tempslibres   
 3     shortest day           flames dance       in the oven  tempslibres   
 4             haze  half the horse hidden  behind the house  tempslibres   
 
                                       hash  
 0           FISHINGBOATSCOLORSOFTHERAINBOW  
 1      ASHWEDNESDAYTRYINGTOREMEMBERMYDREAM  
 2  SNOWYMORNPOURINGANOTHERCUPOFBLACKCOFFEE  
 3          SHORTESTDAYFLAMESDANCEINTHEOVEN  
 4     HAZEHALFTHEHORSEHIDDENBEHINDTHEHOUSE  ,
 (60000, 5))

In [10]:
df.describe

<bound method NDFrame.describe of                           0                             1  \
0             fishing boats                     colors of   
1           ash wednesday--           trying to remember    
2              snowy morn--           pouring another cup   
3              shortest day                  flames dance   
4                      haze         half the horse hidden   
...                     ...                           ...   
59995   I hate shopping for     presents with a passion I   
59996      I really enjoyed   myself this weekend with my   
59997      Have a Happy and    Safe Halloween from all of   
59998  Merry Christmas Rach    Sending all my love to you   
59999    And I'm going back   to school only for the hoes   

                         2       source  \
0              the rainbow  tempslibres   
1                 my dream  tempslibres   
2          of black coffee  tempslibres   
3              in the oven  tempslibres   
4         behind t

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       60000 non-null  object
 1   1       60000 non-null  object
 2   2       59999 non-null  object
 3   source  60000 non-null  object
 4   hash    59999 non-null  object
dtypes: object(5)
memory usage: 2.3+ MB


In [12]:
df.describe

<bound method NDFrame.describe of                           0                             1  \
0             fishing boats                     colors of   
1           ash wednesday--           trying to remember    
2              snowy morn--           pouring another cup   
3              shortest day                  flames dance   
4                      haze         half the horse hidden   
...                     ...                           ...   
59995   I hate shopping for     presents with a passion I   
59996      I really enjoyed   myself this weekend with my   
59997      Have a Happy and    Safe Halloween from all of   
59998  Merry Christmas Rach    Sending all my love to you   
59999    And I'm going back   to school only for the hoes   

                         2       source  \
0              the rainbow  tempslibres   
1                 my dream  tempslibres   
2          of black coffee  tempslibres   
3              in the oven  tempslibres   
4         behind t

In [38]:
from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [39]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('cmudict')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [40]:
cmu_dictionary =cmudict.dict()
words_not_found_in_cmu_dict = []
len(cmu_dictionary)

123455

In [41]:
# Make haikus lowercase
for columns in df[['0','1','2']]:
    df[columns] = df[columns].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns] = df[columns].str.lower()


In [42]:
df.shape

(26327, 5)

In [43]:
def count_syllables_in_word(word):
    # Attempt to find the number of syllables in a word
    # Source: https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word
    try:
        # Check the number of syllables in the word using a dictionary
        return [len(list(s for s in phonemes if s[-1].isdigit())) for phonemes in cmu_dictionary[word.lower()]][0]
    except:
        # If the word isn't found in the dictionary, mark it for future reference and return -1
        words_not_found_in_cmu_dict.append(word)
        return -1

def check_syllable_count(poem):
    # Check if a given poem has the correct syllable count per line
    for line_index, sentence in enumerate(poem):
        total_syllables = 0
        try:
            # Count syllables in each word of the sentence
            for word in sentence.split():
                syllables = count_syllables_in_word(word)
                if syllables != -1:
                    total_syllables += count_syllables_in_word(word)
                else:
                    return False  # If a word isn't found in the dictionary, return False
            # Check syllable count based on the line index
            if line_index == 0 or line_index == 2:
                if total_syllables != 5:
                    return False  # For the first and third lines, should have 5 syllables
            elif line_index == 1:
                if total_syllables != 7:
                    return False  # For the second line, should have 7 syllables
                    break
        except:
            return False  # Return False if any error occurs during processing
    return True  # Return True if the poem has the correct syllable count per line


In [44]:
all_poems = df[['0','1','2']].to_numpy()
all_poems.shape

(26327, 3)

In [45]:
# Remove poems that do not follow 5-7-5 syllable structure
df = df[[check_syllable_count(poem) for poem in all_poems]]
df.head(), df.shape

(                           0                             1  \
 24       visiting the graves     stronger the october wind   
 141  profound blue of night    the resin and salt of pines   
 142  scattered in the ditch   like tiny scraps of blue sky   
 343   the smell of her hands     on the neck of the bottle   
 435       christmas services    a cellular phone rings out   
 
                        2       source  \
 24   at my grandparents'  tempslibres   
 141  so far from the sea  tempslibres   
 142  bits of plastic bag  tempslibres   
 343    drinking greedily  tempslibres   
 435     handel's messiah  tempslibres   
 
                                                   hash  
 24   VISITINGTHEGRAVESSTRONGERTHEOCTOBERWINDATMYGRA...  
 141  PROFOUNDBLUEOFNIGHTTHERESINANDSALTOFPINESSOFAR...  
 142  SCATTEREDINTHEDITCHLIKETINYSCRAPSOFBLUESKYBITS...  
 343  THESMELLOFHERHANDSONTHENECKOFTHEBOTTLEDRINKING...  
 435  CHRISTMASSERVICESACELLULARPHONERINGSOUTHANDELS...  ,
 (26327, 5))

In [46]:
# Preprocessing the text data
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
processed_text = []

for i in range(len(df)):
    processed_line = []
    for col in df.columns[:-1]:  # Exclude the 'Source' column
        # Tokenize, remove stopwords, and apply stemming to each word
        words = word_tokenize(df[col][i].lower())
        words = [ps.stem(word) for word in words if word.isalpha() and word not in stop_words]
        processed_line.extend(words)
    processed_text.append(processed_line)

# Visualizing word frequency
all_words = [word for line in processed_text for word in line]
freq_dist = FreqDist(all_words)
freq_dist.plot(20, cumulative=False)
plt.show()

KeyError: ignored