In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, LSTM, Embedding
import matplotlib.pyplot as plt 
import nltk
from nltk.tokenize import word_tokenize
import re

In [2]:
!pip install pronouncing
!pip install cmudict
import cmudict
import pronouncing

[0m

In [3]:
nltk.download('punkt')

# check if token is a punctuation marker
def is_punct(token):
  for c in token:
    if c.isalpha(): return False
  return True

# remove punctuation and set to lowercase for list of tokens
def clean(tokens):
  return [token.lower() for token in tokens if not is_punct(token)]

# tokenize poem in file
def file_to_tokens(filepath):
  # read in poem from file
  with open(filepath, 'r') as fp:
    poem_lines = fp.readlines()

  if len(poem_lines) < 4: return 0

  # remove byte order marker
  if ord(poem_lines[0][0]) == 65279:
    poem_lines[0] = poem_lines[0][1:]

  # tokenize each line
  return [clean(nltk.tokenize.word_tokenize(line)) for line in poem_lines]

# tokenize list of lines
def lines_to_tokens(lines):
  if ord(lines[0][0]) == 65279:
    lines[0] = lines[0][1:]
  return [clean(nltk.tokenize.word_tokenize(line)) for line in lines]

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
!pip install wordfreq
import wordfreq
from math import log

# given a word, return its log frequency
def get_freq(word):
  freq = wordfreq.word_frequency(word, 'en')
  if freq <= 0: return 0
  return log(freq)

# get avg log frequency
def get_log_freq(toklines):
  # sum up log frequencies of all words in the poem
  total_freq = 0
  word_count = 0
  for line in toklines:
    for word in line:
      total_freq += get_freq(word.lower())
      word_count += 1

  # compute average log frequency and word count
  return (total_freq / word_count, word_count)

Collecting wordfreq
  Downloading wordfreq-3.0.3-py3-none-any.whl (56.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting ftfy>=6.1
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy, wordfreq
Successfully installed ftfy-6.1.1 wordfreq-3.0.3
[0m

In [5]:
# for every poem, obtain a set of unique words; merge all the sets
import functools

# get type-token ratio
def get_tt_ratio(toklines, word_count):
  types = functools.reduce(lambda x, y : set(x).union(set(y)), toklines)

  # compute type-token ratio
  return len(types) / word_count

In [6]:
## preprocessing stuff
def preprocess():
  # get words as phonemes
  phoneme_dict = cmudict.dict()

  # get vowels
  # 1: vowel, 0: consonant
  vowel_dict = dict()
  for phone, group in cmudict.phones():
    vowel_dict[phone] = int(group[0] == 'vowel')
  return phoneme_dict, vowel_dict

In [7]:
## helper functions

# given a word, return the most common pronunciation
def get_phonemes(word, phoneme_dict):
  pronunciations = phoneme_dict[word]
  if len(pronunciations) == 0: return []
  return pronunciations[0]

# given a sequence of phonemes, return the index of the stressed vowel
# if no stressed syllable (e.g. 'a'), return 0
def get_stress_idx(pron):
  for idx, val in enumerate(pron):
    if val[-1] == '1': return idx
  return 0

# given a sequence of phonemes, return the onset if it exists
# otherwise, return the stressed vowel
def get_onset(pron):
  stress_idx = get_stress_idx(pron)
  if stress_idx == 0: return pron[0]
  return pron[stress_idx - 1]

# given a sequence of phonemes, return the rime (part of a syllable; not rhyme)
def get_rime(pron):
  stress_idx = get_stress_idx(pron)
  return pron[stress_idx:]

# given two lines, determine whether they form a perfect rhyme
def is_perfect(a, b, phoneme_dict):
  # get phoneme representations of last two words
  pron_a = get_phonemes(a[-1], phoneme_dict)
  pron_b = get_phonemes(b[-1], phoneme_dict)

  # known word check
  if len(pron_a) == 0 or len(pron_b) == 0: return False

  # onset check
  onset_a = get_onset(pron_a)
  onset_b = get_onset(pron_b)
  if onset_a == onset_b: return False

  # rime check
  rime_a = get_rime(pron_a)
  rime_b = get_rime(pron_b)
  if len(rime_a) != len(rime_b): return False
  for p_a, p_b in zip(rime_a, rime_b):
    if p_a != p_b: return False
  
  # all checks passed
  return True

# given two lines, count the number of slant rhymes
def is_slant(a, b, phoneme_dict):
  # get phoneme representations of last two words
  pron_a = get_phonemes(a[-1], phoneme_dict)
  pron_b = get_phonemes(b[-1], phoneme_dict)

  # known word check
  if len(pron_a) == 0 or len(pron_b) == 0: return False

  # stressed vowel check
  rime_a = get_rime(pron_a)
  rime_b = get_rime(pron_b)
  if rime_a[0] != rime_b[0]: return False

  # rime check
  if len(rime_a) != len(rime_b): return True
  for p_a, p_b in zip(rime_a, rime_b):
    if p_a != p_b: return True
  
  # rime check failed
  return False

# given a window of up to four lines, count the number of perfect and slant rhymes
from itertools import combinations
def count_rhymes_window(lines, phoneme_dict):
  perf, slant = 0, 0
  pairs = list(combinations(lines, 2))
  for a, b in pairs:
    if len(a) == 0 or len(b) == 0: continue
    if is_perfect(a, b, phoneme_dict):
      perf += 1
    if is_slant(a, b, phoneme_dict):
      slant += 1
  return perf, slant

# given all lines of a poem, count the number of perfect and slant rhymes
def count_rhymes(lines, phoneme_dict):
  perf, slant = 0, 0

  # for short poems, use a single window
  if len(lines) < 5: return count_rhymes_window(lines, phoneme_dict)

  # check all 4-line windows
  num_windows, last_window_len = divmod(len(lines), 4)
  for idx in range(num_windows):
    start = 4 * idx
    end = 4 * idx + 4
    new_perf, new_slant = count_rhymes_window(lines[start:end], phoneme_dict)
    perf += new_perf
    slant += new_slant
  
  # last window check
  if last_window_len > 0:
    new_perf, new_slant = count_rhymes_window(lines[4 * num_windows:], phoneme_dict)
    perf += new_perf
    slant += new_slant
  
  # output normalized counts
  return perf, slant

In [8]:
def get_rhyme_counts(toklines, word_count, phoneme_dict):
  perf, slant = count_rhymes(toklines, phoneme_dict)
  perf_freq, slant_freq = perf / word_count, slant / word_count
  return perf_freq, slant_freq

In [9]:
# get initial consonant of a word
def get_initial(word, phoneme_dict):
  phonemes = phoneme_dict[word]
  if len(phonemes) == 0: return ''
  return phonemes[0][0]

# count instances of alliteration
def count_alliteration(lines, word_count, phoneme_dict):
  count = 0
  for line in lines:
    for idx in range(len(line) - 1):
      if get_initial(line[idx], phoneme_dict) == get_initial(line[idx + 1], phoneme_dict): count += 1
  
  # output normalized count
  return count / word_count

In [10]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# compute sentiment
def get_pos_neg(toklines, word_count):
  sia = SentimentIntensityAnalyzer()
  pos, neg = 0, 0

  for line in toklines:
    for word in line:
      #print(word, sia.polarity_scores(word))
      polarity = sia.polarity_scores(word)
      if polarity['pos'] > 0: pos += 1
      if polarity['neg'] > 0: neg += 1

  # normalize counts
  pos = pos / word_count
  neg = neg / word_count
  return pos, neg

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
# compute concreteness, abstractness, generalization scores
def compute_cag(toklines, word_count):
  concrete_words = {'tree', 'room', 'thing', 'grass', 'wall',
            'flower', 'glass', 'floor', 'dirt', 'car'}
  abstract_words = {'day', 'night', 'year', 'time', 'death',
              'new', 'morning', 'childhood', 'hour', 'afternoon'}
  general_words = {'all', 'nothing', 'never', 'always', 'every',
            'any', 'anything', 'nobody', 'everything', 'forever'}
  concrete, abstract, general = 0, 0, 0
  for line in toklines:
    for word in line:
      if word in concrete_words: concrete += 1
      if word in abstract_words: abstract += 1
      if word in general_words: general += 1

  # normalize counts
  concrete = concrete / word_count
  abstract = abstract / word_count
  general = general / word_count
  return concrete, abstract, general

In [12]:
def evaluate(toklines):
  # compute desired features
  avg_freq, word_count = get_log_freq(toklines)
  tt_ratio = get_tt_ratio(toklines, word_count)
  phoneme_dict, vowel_dict = preprocess()
  perf_freq, slant_freq = get_rhyme_counts(toklines, word_count, phoneme_dict)
  allit = count_alliteration(toklines, word_count, phoneme_dict)
  pos, neg = get_pos_neg(toklines, word_count)
  concrete, abstract, general = compute_cag(toklines, word_count)

  # coalesce weights and features
  eval_weights = [-0.5039, 0.6646, 0.4602, -2.1, -0.6326,
                  -1.0701, -0.7861, 1.3124, -1.2633, -0.836]
  eval_features = [avg_freq, tt_ratio, perf_freq, slant_freq, allit,
                  pos, neg, concrete, abstract, general]

  # dot product of scores and weights
  eval_score = 0
  for weight, feature in zip(eval_weights, eval_features):
    eval_score += weight * feature
  return eval_score

def evaluate_file(filepath):
  toklines = file_to_tokens(filepath)
  return evaluate(toklines)

def evaluate_poem(lines):
  return evaluate(lines)

In [13]:
model = keras.models.load_model('../input/largemodelv3/large_set_model_v3.h5')

2022-12-05 03:48:29.147636: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-05 03:48:29.250519: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-05 03:48:29.251336: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-05 03:48:29.253428: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [14]:
vocabulary = ['\n', ' ', '!', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'a',
 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
 't', 'u', 'v', 'w', 'x', 'y', 'z']

In [15]:
tokenizer = {'\n': 0, ' ': 1, '!': 2, ',': 3, '-': 4, '.': 5, '0': 6, '1': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '8': 14, '9': 15, '?': 16, 'a': 17, 'b': 18, 'c': 19, 'd': 20, 'e': 21, 'f': 22, 'g': 23, 'h': 24, 'i': 25, 'j': 26, 'k': 27, 'l': 28, 'm': 29, 'n': 30, 'o': 31, 'p': 32, 'q': 33, 'r': 34, 's': 35, 't': 36, 'u': 37, 'v': 38, 'w': 39, 'x': 40, 'y': 41, 'z': 42}

In [34]:
sample = "I "
 # vectorize the string
sample = sample.lower()
#count existing syllables
sample_vector = [tokenizer[char] for char in sample]
predicted = sample_vector
 # convert into tensor of required dimensions
sample_tensor = tf.expand_dims(sample_vector, 0) 
 # broadcast to first dimension to 64 
sample_tensor = tf.repeat(sample_tensor, 64, axis=0)

 # temperature is a sensitive variable to adjust prediction
temperature = .95
poem = sample
char_counter = 0
poem_counter = 0
max_poems = 10
best_poem = ""
best_eval = -100
tot_eval = 0

while poem_counter < max_poems:
     pred = model(sample_tensor)
     pred = pred[0].numpy()/temperature
     pred = tf.random.categorical(pred, num_samples=1)[-1,0].numpy()
     if pred == 0 and char_counter > 500:
        curr_eval = evaluate(poem)
        print(curr_eval)
        tot_eval += curr_eval
        poem = sample
        poem_counter += 1
        char_counter = 0
        sample_vector = [tokenizer[char] for char in sample]
        sample_tensor = tf.expand_dims(sample_vector, 0) 
        sample_tensor = tf.repeat(sample_tensor, 64, axis=0)
     else:
       guess = vocabulary[pred]
       poem += guess
       char_counter += 1
       predicted.append(pred)
       sample_tensor = predicted[-199:]
       sample_tensor = tf.expand_dims([pred],0) 
       sample_tensor = tf.repeat(sample_tensor, 64, axis=0)

print(tot_eval/max_poems)

2.949472836000669
2.860654322042549
2.8722246116917503
2.956060900491604
2.993438865516859
3.0610442743630086
2.92537967716342
2.897661185867009
2.9307046499560077
2.9231281990210896
2.9369769522113964


In [20]:
import pandas as pd
f = pd.read_csv("../input/poetry-foundation-poems/PoetryFoundationData.csv")
#f2 = pd.read_csv("StylusPoems2017-2022.csv")
print(len(f))

13854


In [30]:
nltk.download('punkt')
txt_batches = f['Poem'].values

for i,p in enumerate(txt_batches):
  s = txt_batches[i]
  s = s.replace("\n", " \n ")
  s = re.sub('—', '-', s)
  s = re.sub('[^A-Za-z0-9\n,. !?-]+', '', s)
  s = re.sub('\b\b+', ' ', s)
  s = s.lower()
  s = s.split("\n")
  temp = [""]
  temp = np.array(temp)
  for sent in s:
    temp = np.append(temp, np.array(word_tokenize(sent)))
    temp = np.append(temp, "\n")
  s = temp
  txt_batches[i] = s[1:]

print(txt_batches[1])

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['\n' 'the' 'old' 'cupola' 'glinted' 'above' 'the' 'clouds' ',' 'shone'
 '\n' 'among' 'fir' 'trees' ',' 'but' 'it' 'took' 'him' 'an' 'hour' '\n'
 'for' 'the' 'half' 'mile' 'all' 'the' 'way' 'up' 'the' 'hill' '.' 'as'
 'he' 'trailed' ',' '\n' 'the' 'village' 'passed' 'him' 'by' ',' 'greeted'
 'him' ',' '\n' 'asked' 'about' 'his' 'health' ',' 'but' 'everybody'
 'hurried' '\n' 'to' 'catch' 'the' 'mass' ',' 'left' 'him' 'leaning'
 'against' 'fences' ',' '\n' 'measuring' 'the' 'road' 'with' 'the'
 'walking' 'stick' 'he' 'sculpted' '.' '\n' 'he' 'yearned' 'for' 'the'
 'day' 'when' 'the' 'new' 'church' '\n' 'would' 'be' 'built-right'
 'across' 'the' 'road' '.' 'now' '\n' 'it' 'rises' 'above' 'the' 'moon'
 'saints' 'in' 'frescoes' '\n' 'meet' 'the' 'eye' ',' 'and' 'only' 'the'
 'rain' 'has' 'started' 'to' 'cut' '\n' 'through' 'the' 'shingles' 'on'
 'the' 'roof' 'of' 'his' 'empty' '\n' 'house' '.' 'the' 'apple' 'trees'
 'have' 'taken' 'over' 'the' 'sky' ',' '\n' 'sequestered' 'the' 'gate' ','
 

In [32]:
text = []
print(len(txt_batches))


for b in txt_batches:
    text.append(" ".join(b))
    
print(text[1])

13854

 the old cupola glinted above the clouds , shone 
 among fir trees , but it took him an hour 
 for the half mile all the way up the hill . as he trailed , 
 the village passed him by , greeted him , 
 asked about his health , but everybody hurried 
 to catch the mass , left him leaning against fences , 
 measuring the road with the walking stick he sculpted . 
 he yearned for the day when the new church 
 would be built-right across the road . now 
 it rises above the moon saints in frescoes 
 meet the eye , and only the rain has started to cut 
 through the shingles on the roof of his empty 
 house . the apple trees have taken over the sky , 
 sequestered the gate , sidled over the porch . 
 



In [36]:
tot_eval = 0
for p in text[20:40]:
    curr_eval = curr_eval = evaluate(p[:200])
    print(curr_eval)
    tot_eval+=curr_eval

print(tot_eval/20)

2.766384367089158
3.110730611622347
2.800026116498493
3.0635738883632855
2.6983208369961535
3.1416161908440485
2.7743114602590335
2.734356581539228
2.88971885798861
3.126142669205732
2.81526025115887
2.990900378211726
3.0126146736098494
2.9607866773995113
2.980943770629999
3.067947725596454
3.2547011568345385
3.25323564916435
2.96721057597221
2.851758723672021
2.9630270581327807
