Convert CSV to TXT

In [None]:
import csv
csv_file = 'four-line-poetry.csv'
txt_file = 'data.txt'

# Flatten the CSV into a plain-text corpus: one "<column 2>:<column 1>" record
# per CSV row, each followed by an empty line.
# NOTE(review): every row is written, including a header row if the CSV has
# one -- confirm whether the first row should be skipped.
#
# The input is opened first so that a missing CSV does not leave behind a
# freshly truncated output file.  newline="" hands line-break handling to the
# csv module, which is required for quoted fields that contain line breaks.
with open(csv_file, "r", newline="") as my_input_file, \
        open(txt_file, "w") as my_output_file:
    for row in csv.reader(my_input_file):
        if not row:
            # csv.reader yields an empty list for a blank line; nothing to write
            continue
        my_output_file.write(row[2] + ":" + row[1] + '\n\n')
# no explicit close(): the with-statement closes both files

Mount and install packages

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip3 install gpt-2-simple pronouncing g2p-en

%cd drive/MyDrive/poetry/gpt2

Convert samples from TXT to CSV

In [None]:
import csv

scheme = "abba"
prefix = scheme + ":"
file_name = "samples/" + scheme + ".txt"
csv_file_name = "samples/" + scheme + ".csv"

# Turn the raw sample text into a CSV with the columns ("", "poem", "label").
# A poem is a line starting with the scheme name followed by the next three
# lines; any line shorter than 15 characters (blank lines included) discards
# the poem that is currently being collected.
#
# newline="" is what the csv module expects for files handed to csv.writer;
# plain "w" is enough because the file is never read back here.
with open(file_name) as f, open(csv_file_name, "w", newline="") as csv_file:
    writer = csv.writer(
        csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
    )
    writer.writerow(["", "poem", "label"])

    poem_lines = []  # lines collected so far for the current poem
    for line in f:  # stream the file instead of loading it with readlines()
        if len(line) < 15:
            # too short to be a verse line: drop the partial poem
            poem_lines = []
            continue

        if line.startswith(scheme):
            # first line of a new poem; remove the "<scheme>:" marker
            poem_lines = [line.replace(prefix, "")]
            continue

        if not poem_lines:
            # not inside a poem, ignore the line
            continue

        poem_lines.append(line)
        if len(poem_lines) == 4:
            # the first three lines keep their line break, the last one loses it
            poem = "".join(poem_lines[:3]) + poem_lines[3].replace("\n", "")
            writer.writerow(["", poem, scheme])
            poem_lines = []


Train GPT2

In [None]:
import gpt_2_simple as gpt2
import os
import requests

model_name = "124M"
file_name = "data.txt"

# Fetch the pretrained weights only when they are not on disk yet;
# the model is saved into the current directory under /models/124M/
model_dir = os.path.join("models", model_name)
if not os.path.isdir(model_dir):
    print(f"Downloading {model_name} model...")
    gpt2.download_gpt2(model_name=model_name)

sess = gpt2.start_tf_sess()

# Fine-tuning settings, collected in one place
finetune_options = dict(
    model_name=model_name,
    checkpoint_dir="checkpoint",
    batch_size=2,
    accumulate_gradients=32,
    learning_rate=0.001,
    sample_every=50,
    sample_length=200,
    save_every=100,
    steps=400,  # steps is max number of training steps
)
gpt2.finetune(sess, file_name, **finetune_options)

# Print one sample from the fine-tuned model
gpt2.generate(sess)

Generate samples

In [None]:
import gpt_2_simple as gpt2

# Fresh session with the fine-tuned checkpoint loaded into it
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess)

# Sampling settings: every sample starts with the scheme name and the results
# are written to a text file instead of being printed
generation_options = dict(
    prefix="abba",
    length=50,
    sample_delim="\n",
    nsamples=3000,
    destination_path="samples/abba.txt",
)
gpt2.generate(sess, **generation_options)

Loading checkpoint checkpoint/run1/model-1900
INFO:tensorflow:Restoring parameters from checkpoint/run1/model-1900


Evaluate minimum edit distance

In [None]:
import pandas as pd
import numpy as np
import pronouncing
import editdistance
import nltk as nltk
from nltk import word_tokenize
from g2p_en import G2p
from itertools import product
from tqdm import tqdm

# grapheme to phoneme converter; evaluate() falls back to it for words that
# the pronouncing package has no entry for
g2p = G2p()
# NOTE(review): evaluate() tokenizes with nltk's word_tokenize, which
# presumably needs the 'punkt' data -- uncomment on a fresh environment (confirm)
# nltk.download('punkt')

def evaluate(poetry, scheme):
    """
    return the average minimum edit distance of the phonemes of the last words of each line of a four line poem

    Only the first four lines of the poem are considered; a poem with fewer lines is padded with missing lines.
    A line without any alphabetical word counts as a missing last word, and a pair with a missing word gets the
    maximum distance instead of being compared as an empty phoneme sequence.

    :param poetry: string / poem, lines separated by a line break
    :param scheme: string / rhyming scheme (aabb, abab, abba)
    :return: float / average minimum edit distance of the phonemes of the last words
    :raises ValueError: if scheme is not one of the supported rhyming schemes
    """
    # positions of the two line pairs that are supposed to rhyme, per scheme
    rhyme_pairs = {
        'abab': ((0, 2), (1, 3)),
        'aabb': ((0, 1), (2, 3)),
        'abba': ((0, 3), (1, 2)),
    }
    # validate first so that an invalid scheme fails before any tokenization work is done
    if scheme not in rhyme_pairs:
        raise ValueError(f'{scheme} is an invalid rhyming scheme. This code only works for the literals "aabb", '
                         '"abab" or "abba".')

    def get_last_words(x):
        """
        gets last alphabetical words from a list of sentences
        :param x: list of strings / list of sentences
        :return: list of strings / list of words, with an empty string for a sentence without alphabetical word
        """
        result = []
        for l in x:
            words = [w for w in word_tokenize(l) if w.isalpha()]
            # if no last word can be found, use an empty string for that line
            result.append(words[-1] if words else '')
        return result

    def min_edit_distance(a, b, n=4):
        """
        calculates minimum edit distance between word a and b based on their possible pronunciations
        :param a: string / word, empty string if the word is missing
        :param b: string / word, empty string if the word is missing
        :param n: int / number of last phonemes to check, default 4
        :return: int / minimum edit distance based on phonemes, n if one of the words is missing
        """
        # a missing word cannot rhyme: without this guard two missing words would be compared as two empty
        # phoneme sequences and get distance 0, i.e. a perfect rhyme
        if not a or not b:
            return n

        # get pronunciations, falling back to the grapheme to phoneme converter for unknown words
        a_phonemes = pronouncing.phones_for_word(a)
        if not a_phonemes:
            a_phonemes = [' '.join(g2p(a))]
        b_phonemes = pronouncing.phones_for_word(b)
        if not b_phonemes:
            b_phonemes = [' '.join(g2p(b))]

        # compare the last n phonemes of every combination of pronunciations and keep the best match
        return min((editdistance.eval(c.split()[-n:], d.split()[-n:]) for c, d in product(a_phonemes, b_phonemes)),
                   default=n)

    # exactly four last words: cut off additional lines, pad missing ones
    last_words = get_last_words(poetry.split('\n'))[:4]
    last_words += [''] * (4 - len(last_words))

    return sum(min_edit_distance(last_words[i], last_words[j]) for i, j in rhyme_pairs[scheme]) / 2


# Example: score every poem of a sample file against its labelled rhyming scheme
df = pd.read_csv('samples/abba.csv', index_col=0)
metric_vals = [
    evaluate(poem, label)
    for poem, label in tqdm(zip(df["poem"], df["label"]), total=len(df))
]
print("\n")
print("Average minimum edit distance per poem: ", np.average(metric_vals))