In [1]:
import os
import sys
import numpy as np
print(os.getcwd())
print(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.getcwd()))
import pandas as pd
import epitran
from datasets import load_dataset, DatasetDict, load_from_disk
from functools import lru_cache
from sklearn.model_selection import train_test_split

/home/toure215/BERT_phonetic/test
/home/toure215/BERT_phonetic


In [2]:
dataset = load_dataset("csv" , data_files="/home/toure215/BERT_phonetic/DATASETS/verses.csv")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'Verse', 'Meter', 'char_count'],
        num_rows: 199002
    })
})

In [4]:
pd_dataset = pd.read_csv("/home/toure215/BERT_phonetic/DATASETS/verses.csv")
pd_dataset.rename(columns={"Verse":"text"}, inplace=True)
pd_dataset.head()

Unnamed: 0,id,text,Meter,char_count
0,0,ah why this boding start this sudden pain,iambic,6
1,1,that wings my pulse and shoots from vein to vein,iambic,6
2,2,what mean regardless of yon midnight bell,iambic,6
3,3,these earthborn visions saddening o'er my cell,iambic,6
4,4,what strange disorder prompts these thoughts t...,iambic,6


In [5]:
pd_dataset.iloc[1]["text"]

'that wings my pulse and shoots from vein to vein'

In [6]:
verse_dataset = pd.DataFrame(columns=["id", "sentence1", "sentence2", "label"])
verse_dataset

Unnamed: 0,id,sentence1,sentence2,label


In [7]:
epi = epitran.Epitran("eng-Latn")
@lru_cache(maxsize=None)
def cached_xsampa_list(word):
    return epi.xsampa_list(word)

def is_rhyming(word1, word2):
    sound1 = cached_xsampa_list(word1)
    sound2 = cached_xsampa_list(word2)
    if len(sound1) < 2 or len(sound2) < 2:
        return False
    return sound1[-2:] == sound2[-2:]

# Pre-compute phonetic endings for all verses
def get_last_phonetic(word):
    phonemes = cached_xsampa_list(word)
    return phonemes[-2:] if len(phonemes) >= 2 else phonemes

In [8]:
pd_dataset["last_word"] = pd_dataset["text"].apply(lambda x: x.split()[-1])
pd_dataset["phonetic_ending"] = pd_dataset["last_word"].apply(get_last_phonetic)
pd_dataset.head()

Unnamed: 0,id,text,Meter,char_count,last_word,phonetic_ending
0,0,ah why this boding start this sudden pain,iambic,6,pain,"[j, n]"
1,1,that wings my pulse and shoots from vein to vein,iambic,6,vein,"[j, n]"
2,2,what mean regardless of yon midnight bell,iambic,6,bell,"[E, l]"
3,3,these earthborn visions saddening o'er my cell,iambic,6,cell,"[E, l]"
4,4,what strange disorder prompts these thoughts t...,iambic,6,glow,"[o, w]"


In [9]:
# Convert phonetic_ending lists to tuples for hashing
pd_dataset["phonetic_ending"] = pd_dataset["phonetic_ending"].apply(tuple)

# Group verses by their phonetic endings for quick access to rhyming pairs
rhyme_groups = pd_dataset.groupby("phonetic_ending").apply(lambda x: x.index.tolist()).to_dict()
rhyme_groups

  rhyme_groups = pd_dataset.groupby("phonetic_ending").apply(lambda x: x.index.tolist()).to_dict()


{(): [8862,
  8996,
  15446,
  29059,
  29083,
  29191,
  29207,
  29886,
  47921,
  73438,
  77063,
  77113,
  79288,
  79294,
  79300,
  85532,
  85570,
  112891,
  114416,
  114508,
  114567,
  145263,
  158903,
  176984],
 ('@',): [102072],
 ('@', 'N'): [23706, 126128, 144654, 193234, 195689],
 ('@', 'T'): [15700, 194226, 197641],
 ('@', 'b'): [79275, 127325, 150168, 166496, 197785],
 ('@', 'd'): [392,
  1216,
  1859,
  1881,
  1885,
  1887,
  2197,
  2200,
  4803,
  8476,
  9058,
  9059,
  9060,
  9310,
  9311,
  14686,
  17483,
  18849,
  20147,
  20718,
  22760,
  24253,
  25019,
  25507,
  26602,
  27190,
  28418,
  28637,
  28752,
  28784,
  29036,
  30948,
  30949,
  30954,
  30955,
  32090,
  32897,
  33641,
  33746,
  33747,
  33751,
  34984,
  38296,
  38297,
  39482,
  40883,
  41118,
  42348,
  43839,
  43840,
  44114,
  44151,
  44244,
  44516,
  44517,
  44803,
  45484,
  45492,
  45502,
  45504,
  45538,
  45540,
  45588,
  45641,
  45678,
  45755,
  47884,
  48092,
 

In [10]:
# Create a list of all unique phonetic endings for non-rhyming selection
phonetic_endings = list(rhyme_groups.keys())
print(phonetic_endings)

[(), ('@',), ('@', 'N'), ('@', 'T'), ('@', 'b'), ('@', 'd'), ('@', 'dZ'), ('@', 'f'), ('@', 'g'), ('@', 'k'), ('@', 'l'), ('@', 'm'), ('@', 'n'), ('@', 'p'), ('@', 'r\\='), ('@', 's'), ('@', 't'), ('@', 'z'), ('A',), ('A', '@'), ('A', 'N'), ('A', 'S'), ('A', 'T'), ('A', 'b'), ('A', 'd'), ('A', 'dZ'), ('A', 'f'), ('A', 'g'), ('A', 'i'), ('A', 'k'), ('A', 'l'), ('A', 'm'), ('A', 'n'), ('A', 'p'), ('A', 'r\\'), ('A', 'r\\='), ('A', 's'), ('A', 't'), ('A', 'tS'), ('A', 'v'), ('A', 'z'), ('D', 'd'), ('D', 'i'), ('D', 'r\\='), ('D', 'z'), ('E',), ('E', 'D'), ('E', 'S'), ('E', 'T'), ('E', 'b'), ('E', 'd'), ('E', 'dZ'), ('E', 'f'), ('E', 'g'), ('E', 'i'), ('E', 'k'), ('E', 'l'), ('E', 'm'), ('E', 'n'), ('E', 'p'), ('E', 'r\\'), ('E', 'r\\='), ('E', 's'), ('E', 't'), ('E', 'tS'), ('E', 'z'), ('I', 'D'), ('I', 'N'), ('I', 'S'), ('I', 'T'), ('I', 'b'), ('I', 'd'), ('I', 'dZ'), ('I', 'f'), ('I', 'g'), ('I', 'k'), ('I', 'l'), ('I', 'm'), ('I', 'n'), ('I', 'p'), ('I', 'r\\'), ('I', 's'), ('I', 't'),

In [11]:
# Build the dataset
import random

for i in range(0, len(pd_dataset), 2):
    last = len(verse_dataset)
    word1 = pd_dataset.iloc[i]["last_word"]
    phonetic1 = pd_dataset.iloc[i]["phonetic_ending"]
    
    # Find a rhyming pair
    rhyming_indices = rhyme_groups.get(phonetic1, [])
    rhyming_idx = i  # Default to self if no other rhyme is found
    for idx in rhyming_indices:
        if idx != i:
            rhyming_idx = idx
            break

    verse_dataset.loc[last] = [
        last,
        pd_dataset.iloc[i]["text"],
        pd_dataset.iloc[rhyming_idx]["text"],
        1,  # Label for rhyming
    ]

    # Find a non-rhyming pair by selecting from different phonetic endings
    non_rhyme_phonetic = phonetic1
    while non_rhyme_phonetic == phonetic1:
        non_rhyme_phonetic = random.choice(phonetic_endings)
    non_rhyme_idx = np.random.choice(rhyme_groups[non_rhyme_phonetic])

    verse_dataset.loc[last + 1] = [
        last + 1,
        pd_dataset.iloc[i]["text"],
        pd_dataset.iloc[non_rhyme_idx]["text"],
        0,  # Label for non-rhyming
    ]

print("Final row count in verse_dataset:", len(verse_dataset))


Final row count in verse_dataset: 199002


In [12]:
print(len(verse_dataset))
print(len(pd_dataset))
verse_dataset.head(20)

199002
199002


Unnamed: 0,id,sentence1,sentence2,label
0,0,ah why this boding start this sudden pain,that wings my pulse and shoots from vein to vein,1
1,1,ah why this boding start this sudden pain,those parts of thee that the worlds eye doth view,0
2,2,what mean regardless of yon midnight bell,these earthborn visions saddening o'er my cell,1
3,3,what mean regardless of yon midnight bell,to save their matrons from the brutal rape,0
4,4,what strange disorder prompts these thoughts t...,these sighs to murmur and these tears to flow,1
5,5,what strange disorder prompts these thoughts t...,mahng the loon the wildgoose wawa,0
6,6,'tis she 'tis eloisa's form restor'd,strike the soft sweet harmonic chord,1
7,7,'tis she 'tis eloisa's form restor'd,when they talk'd of their raphaels corregios a...,0
8,8,she comes in all her killing charms confest,glares thro' the gloom and pours upon my breast,1
9,9,she comes in all her killing charms confest,sure ruin so her disembowel'd web,0


In [13]:
print("Number of unique rows:", verse_dataset[["sentence1", "sentence2", "label"]].drop_duplicates().shape[0])

Number of unique rows: 199002


In [18]:
verse_dataset.to_csv("/home/toure215/BERT_phonetic/DATASETS/verse_dataset.csv", index=False)

In [19]:
train, test = train_test_split(verse_dataset, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

In [20]:
train.to_csv("/home/toure215/BERT_phonetic/DATASETS/verses/train.csv", index=False)
val.to_csv("/home/toure215/BERT_phonetic/DATASETS/verses/val.csv", index=False)
test.to_csv("/home/toure215/BERT_phonetic/DATASETS/verses/test.csv", index=False)

In [21]:
train = load_dataset("csv", data_files="/home/toure215/BERT_phonetic/DATASETS/verses/train.csv")
val = load_dataset("csv", data_files="/home/toure215/BERT_phonetic/DATASETS/verses/val.csv")
test = load_dataset("csv", data_files="/home/toure215/BERT_phonetic/DATASETS/verses/test.csv")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [22]:
print(train)
print(val)
print(test)

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 143280
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 15921
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 39801
    })
})


In [23]:
hf_dataset = DatasetDict({
    "train": train["train"],
    "validation": val["train"],
    "test": test["train"],
})
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 143280
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 15921
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 39801
    })
})

In [26]:
hf_dataset.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/verses/hf_rhymes")

Saving the dataset (0/1 shards):   0%|          | 0/143280 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15921 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/39801 [00:00<?, ? examples/s]

In [27]:
hf_dataset = load_from_disk("/home/toure215/BERT_phonetic/DATASETS/verses/hf_rhymes")
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 143280
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 15921
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 39801
    })
})