In [288]:
import pandas as pd
import nltk
from collections import Counter
import re

# Dataset

### Importing it

In [289]:
data = pd.read_csv("G:\OneDrive - University of Edinburgh\Poem Generation\WebScrapping-PoetryFoundation\PoetryFoundationData.csv")
data = data.drop(columns=["Unnamed: 0"])
data.head(5)

Unnamed: 0,Title,Poem,Poet,Tags
0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,
1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,
2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,
3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,
4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,


### Cleaning the dataset

In [290]:
def clean_text(string):
    string  = str(string).upper()
    string  = string.replace("-", " ")
    string  = string.replace("—", " ")
    string  = string.replace(".", " . ")
    return str(string).replace("\r\r\n"," NEWLINE\n ")

data = data.applymap(clean_text)

poems = data.Poem.map(lambda x: nltk.tokenize.word_tokenize(x,preserve_line =True))
#For Faster Testing
poem_sample = poems[:2000]

### Adding Syllables Count

Importing the Syllables dictionary

In [294]:
syllables = pd.read_csv("Syllable dict.txt","  ", engine="python")
syllables = syllables.rename(columns = {"A":"words","AH0":"syllable_count"})
syllables.head()

Unnamed: 0,words,syllable_count
0,A(1),EY1
1,A'S,EY1 Z
2,A.,EY1
3,A.'S,EY1 Z
4,A.S,EY1 Z


Building the Syllables dictionary

In [295]:
def number_of_syllables(string):
    count = 0
    for ch in string:
        if(ch in "012"):
            count=count+1
    return(count)

syllables.syllable_count = (syllables.syllable_count.map(number_of_syllables))
syllable_dict = dict([(word, syllable)for word,syllable in zip(Syllables.words,Syllables.syllable)])
#Some other basic additions to it manually
syllable_dict["A"] = 1
syllable_dict["NEWLINE"] = "NEWLINE"
syllable_dict[","] = "punc"
syllable_dict["."] = "punc"

### Tagging the poems with the syllables dictionary

Tags the words present in the dictionary and returns the words not present in it

In [296]:
def tag_syllables(poem, untagged):
    tagged_poem = []
    for token in poem:
        if(token in syllable_dict):
            tag = syllable_dict[token]
            tagged_poem = tagged_poem + [(token,tag)] 
        else:
            untagged += [token]
    return(tagged_poem, untagged)

### The missing words problem

In [300]:
b = []
for i in Poems:
    a,b = tag_syllables(i,b)
#set(b)
len(set(b))

63016

### Using CMU Building to build more dictionaries

In [None]:
File_object = open("missing_words.txt","w")
c = []
for word in (list(set(b))[40000:]):
    try:
        File_object.writelines(word+"\n")
    except Exception as e:
        c = c+[word]
File_object.close()

### Using the dictionaries to find the counts

In [322]:
dict1 = pd.read_csv("dict1.txt","\t", engine="python")
dict1 = dict1.rename(columns={"SCYTHING":"words","S IH DH AH NG":"syllable_count"})
dict2 = pd.read_csv("dict2.txt","\t", engine="python")
dict2 = dict2.rename(columns={"TREATERS/":"words","T R IY T ER Z":"syllable_count"})
dict3 = pd.read_csv("dict3.txt","\t", engine="python")
dict3 = dict3.rename(columns={"NOMMO":"words","N AA M OW":"syllable_count"})
dictionary = (dict1.append(dict2)).append(dict3)
dictionary.head(5)

Unnamed: 0,words,syllable_count
0,YNOUGH,Y N UW
1,AGATHAS,AE G AH TH AH Z
2,ARETO,AE R AH T OW
3,UNSOUNDED,AH N S AW N D IH D
4,NIGHTIESMALL,N AY T AY S M AO L


Importing the phones to identify syllables

In [326]:
phones = pd.read_csv("phones.txt","\t", engine="python")
phones = phones.rename(columns={"AA":"symbol","vowel":"meaning"})
phones = phones[phones["meaning"]=="vowel"]
vowels = list(phones.symbol)
vowels = vowels + ["AA"]
print(vowels)

['AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER', 'EY', 'IH', 'IY', 'OW', 'OY', 'UH', 'UW', 'AA']


In [338]:
def syllables_count(string):
    l = str(string).split(" ")
    count = 0
    for a in l:
        if(a in vowels):
            count+=1
    return count
dictionary.syllable_count = dictionary.syllable_count.map(syllables_count)

In [345]:
dict_database = syllables.append(dictionary)

In [348]:
syllable_dict = dict([(word, syllable)for word,syllable in zip(dict_database.words,dict_database.syllable_count)])
#Some other basic additions to it manually
syllable_dict["A"] = 1
syllable_dict["NEWLINE"] = "NEWLINE"
syllable_dict[","] = "punc"
syllable_dict["."] = "punc"

### Tagging poems with the updated dictionary!

In [367]:
def syllables_poem(poem, untagged):
    syllable_poem = []
    for token in poem:
        if(token in syllable_dict):
            tag = syllable_dict[token]
            syllable_poem = syllable_poem + [tag] 
        else:
            untagged += [token]
    return(syllable_poem, untagged)

untagged = []
syllable_poems = []
for i in Poems:
    temp,untagged = syllables_poem(i,untagged)
    syllable_poems = syllable_poems+[temp]
#set(b)
len(set(untagged))

12584

In [414]:
from Models import nGrams as n
import Models
a = Models.nGrams()

In [411]:
dicBi = {}
for i in syllable_poems:
    dicBi = n.BiGram(dicBi, i)

dicTri = {}
for i in syllable_poems:
    dicTri = n.TriGram(dicTri, i)

dic4 = {}
for i in syllable_poems:
    dic4 = a.NGram(dic4, i, 4)

In [427]:
prev_word1 = 2
prev_word2 = 2
prev_word3 = 2
prev_word4 = 3
prev_word5 = 2
prev_word6 = 1
prev_word7 = 3

generate_str = str(prev_word1)+" "+str(prev_word2) + " " +str(prev_word3) + " "+ str(prev_word4) +" "+str(prev_word5) + " " +str(prev_word6) + " " +str(prev_word7)

for i in range(50):
    next_word  = n.next_word([prev_word1, prev_word2,prev_word3,prev_word4, prev_word5,prev_word6,prev_word7], dic8, 8)
    prev_word1 = prev_word2
    prev_word2 = prev_word3
    prev_word3 = prev_word4
    prev_word4 = prev_word5
    prev_word5 = prev_word6
    prev_word6 = prev_word7
    prev_word7 = next_word
    generate_str = generate_str+" "+ str(next_word)

print(generate_str)

2 2 2 3 2 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


In [418]:
dic8 = {}
for i in syllable_poems:
    dic8 = a.NGram(dic8, i, 8)