In [6]:
import random
import sys
import epitran
import os

In [2]:
"""
Python wrapper for the website: https://www.homophone.com/
Gets the homophones of a word.
"""

from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
from typing import Dict, List
import re

class Pyphones:
    """Scrape https://www.homophone.com/ for the homophones of one word.

    Attributes:
        word: the word to look up.
        url: search URL template with two ``{}`` slots (page number, query).
        homophones: ``{word: [group, ...]}`` where each group is a list of the
            link texts found in one result box.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the caller forever.
    """

    def __init__(self, word, timeout=10):
        self.word = word
        self.url = "https://www.homophone.com/search?page={}&type=&q={}"
        self.homophones = {self.word: []}
        self.timeout = timeout

    def get_the_page(self, page_no=1):
        """
        Download and parse one result page.

        Args
            page_no (int): 1-based page number to request.

        Returns
            BeautifulSoup: the parsed page.
        """
        from urllib.parse import quote_plus

        # Percent-encode the query so characters such as '&' or '#' cannot
        # alter the URL structure; plain alphabetic words are unchanged.
        url = self.url.format(page_no, quote_plus(str(self.word)))
        r = requests.get(url, timeout=self.timeout)
        return BeautifulSoup(r.content, "html.parser")

    def get_the_page_nos(self, soup=None):
        """
        Get the total number of pages

        Args
            soup: an already parsed first page; when omitted, page 1 is
                downloaded.

        Returns
            int: the total number of the pages (0 when the page has no
            ``div.col-sm-9`` container).

        Raises
            AttributeError / ValueError: when the container has no ``<h5>``
            or the text after its last '/' is not an integer.
        """
        if soup is None:
            soup = self.get_the_page()
        pages = soup.find_all('div', attrs={'class': 'col-sm-9'})
        if not pages:
            return 0
        # The page count is the text after the last '/' in the first <h5>.
        total_pages = pages[0].find('h5').text.split('/')[-1].strip()
        return int(total_pages)

    def get_the_homophones(self):
        """
        Get the homophones of the word.

        Every call rebuilds the list from scratch, so calling it twice does
        not duplicate groups. Page 1 is downloaded once and reused both for
        the page count and for its results.

        Returns
            dict: {word: [list_of_homophones]} against each word.
        """
        first_page = self.get_the_page(1)
        total_pages = self.get_the_page_nos(first_page)

        groups = []
        for page_no in range(1, total_pages + 1):
            soup = first_page if page_no == 1 else self.get_the_page(page_no)
            for box in soup.find_all('div', attrs={'class': 'well well-lg'}):
                links = box.find_all('a', attrs={'class': 'btn word-btn'})
                group = [link.text for link in links]
                # Result boxes without any word link contribute nothing.
                if group:
                    groups.append(group)

        self.homophones[self.word] = groups
        return self.homophones

In [19]:
!wc -l "/home/toure215/BERT_phonetic/DATASETS/words2.txt"

69905 /home/toure215/BERT_phonetic/DATASETS/words2.txt


In [27]:
# Read the raw word list (one entry per line) and trim surrounding whitespace.
with open("/home/toure215/BERT_phonetic/DATASETS/words2.txt") as f:
    words = [line.strip() for line in f]
print(len(words))
def f(word):
    """Return True when ``word`` contains none of ``(``, ``)``, ``-``, space, ``'`` or ``.``."""
    return not any(forbidden in word for forbidden in "()- '.")
# Keep only the words accepted by the filter function `f`.
words = [word for word in words if f(word)]
print(len(words))

# The output handle gets its own name: binding it to `f` would replace the
# filter function `f` with a closed file object once this cell has run.
with open("/home/toure215/BERT_phonetic/DATASETS/words.txt", "w") as out_file:
    out_file.writelines(word + "\n" for word in words)

69905
67585


In [28]:
from concurrent.futures import ThreadPoolExecutor

def fetch_homophones(word):
    """Look up ``word`` through ``Pyphones`` and return what ``get_the_homophones()`` yields."""
    return Pyphones(word).get_the_homophones()

# Distinct homophone groups (stored as tuples) and the words already handled.
homophones = set()
processed_words = set()

with open("/home/toure215/BERT_phonetic/DATASETS/words.txt") as f:
    words = [line.strip() for line in f]

with ThreadPoolExecutor(max_workers=15) as executor:
    # Submit one lookup per word and remember which word each future belongs to.
    future_to_word = {}
    for word in words:
        future_to_word[executor.submit(fetch_homophones, word)] = word

    # Results are consumed in submission order; .result() blocks until ready.
    for future, word in future_to_word.items():
        if word in processed_words:
            continue
        try:
            h = future.result()
            homophones.update(map(tuple, h[word]))
            processed_words.add(word)
        except Exception as exc:
            print(f'{word} generated an exception: {exc}')

In [46]:
# Number of distinct homophone groups collected (each group is stored as a tuple of words).
print(len(homophones))

3195


In [51]:
# Save to file: one homophone group per line, words separated by commas.
with open("/home/toure215/BERT_phonetic/DATASETS/homophones_data/homophones.csv", "w") as f:
    for h in homophones:
        f.write(",".join(h) + "\n")

In [37]:
import pandas as pd
import epitran
from functools import lru_cache
from difflib import SequenceMatcher
from collections import defaultdict

# Epitran instance for English ("eng-Latn"), shared by every transcription call.
epi = epitran.Epitran("eng-Latn")

@lru_cache(maxsize=None)
def cached_xsampa_list(word):
    """Join the items returned by ``epi.xsampa_list(word)`` into one string.

    Results are memoised without a size limit, so each distinct word is
    transcribed only once per process.
    """
    segments = epi.xsampa_list(word)
    return ''.join(segments)

words = []
with open("/home/toure215/BERT_phonetic/DATASETS/words.txt") as f:
    words = [line.strip() for line in f]

# Transcribe every word once up front so later lookups are plain dict reads.
phonetic_dict = {}
for word in words:
    phonetic_dict[word] = cached_xsampa_list(word)

def get_phonetic(word):
    """Return the precomputed transcription of ``word`` from ``phonetic_dict``.

    Raises KeyError for a word that is not a key of ``phonetic_dict``.
    """
    return phonetic_dict[word]

# Group words by length to reduce comparisons:
# only words with the same number of characters are compared with each other.
length_buckets = defaultdict(list)
for word in words:
    length_buckets[len(word)].append(word)

# Similar spelling threshold: a pair is kept only when its
# SequenceMatcher ratio is strictly greater than this value.
SIMILARITY_THRESHOLD = 0.8

# Create pairs with optimized checks: within each length bucket, keep every
# unordered pair whose transcriptions differ and whose spelling similarity
# exceeds SIMILARITY_THRESHOLD.
non_homophone_pairs = []
for length, word_list in length_buckets.items():
    for i, word1 in enumerate(word_list):
        phonetic1 = get_phonetic(word1)
        for j in range(i + 1, len(word_list)):
            word2 = word_list[j]
            # Identical transcriptions are treated as homophones and skipped.
            if phonetic1 == get_phonetic(word2):
                continue
            matcher = SequenceMatcher(None, word1, word2)
            # quick_ratio() is a cheap upper bound on ratio(), so a pair that
            # fails it can never pass the real test; the selected pairs are
            # exactly the same as with ratio() alone.
            if (matcher.quick_ratio() > SIMILARITY_THRESHOLD
                    and matcher.ratio() > SIMILARITY_THRESHOLD):
                non_homophone_pairs.append((word1, word2))

# Convert to DataFrame for easier handling
df_non_homophones = pd.DataFrame(non_homophone_pairs, columns=["word1", "word2"])
print(df_non_homophones.head())
# Show only a handful of transcriptions as a sanity check; printing the whole
# dictionary would write one entry per word into the notebook output.
print(dict(list(phonetic_dict.items())[:5]))

      word1     word2
0  abbatial  sabbatia
1  abdicant  abdicate
2  abditory  auditory
3  abducent  adducent
4  abductor  adductor


In [50]:
# Report how many non-homophone pairs were found, then persist them (without the index column).
print(len(df_non_homophones))
df_non_homophones.to_csv("/home/toure215/BERT_phonetic/DATASETS/homophones_data/non_homophones.csv", index=False)

22094


In [52]:
# Expand every homophone group (one comma-separated line of the CSV) into all
# of its unordered word pairs, each labelled 1.
homophone_rows = []
with open("/home/toure215/BERT_phonetic/DATASETS/homophones_data/homophones.csv") as f:
    for line in f:
        group_words = line.strip().split(",")
        for i in range(len(group_words)):
            for j in range(i + 1, len(group_words)):
                homophone_rows.append((group_words[i], group_words[j], 1))

# Build the frame once from the collected rows: appending through
# `df.loc[len(df)] = ...` re-allocates the frame on every insertion.
df_homophones = pd.DataFrame(homophone_rows, columns=["word1", "word2", "label"])

print(df_homophones.head())
print(len(df_homophones))

  word1 word2  label
0  lase  lays      1
1  lase  laze      1
2  lase  leas      1
3  lase  leis      1
4  lase  leys      1
6039


In [56]:
# Every non-homophone pair is used: sampling n == len(frame) rows simply
# returns the whole frame in a seeded random order.
sample_size = len(df_non_homophones)
# Label for the non-homophone class (homophone pairs carry label 1).
df_non_homophones["label"] = 0
df_non_homophones_sampled = df_non_homophones.sample(n=sample_size, random_state=42)
# Stack both classes; the row index is not reset here.
df_all = pd.concat([df_homophones, df_non_homophones_sampled])
print(len(df_all))

28133


In [57]:
# Shuffle all rows with a fixed seed so that all.csv, and every split later
# derived from it, is the same on each fresh run of the notebook.
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)
df_all.to_csv("/home/toure215/BERT_phonetic/DATASETS/homophones_data/all.csv", index=False)
print(df_all.head(20))

          word1        word2  label
0        modish       oldish      0
1   isochronism  synchronism      0
2   illflavored  wellfavored      0
3    counsellor    councilor      1
4       confirm      conform      0
5        pocket       rocket      0
6       clatter      clutter      0
7           hay          hey      1
8        penman       penmen      0
9          sack       sacque      1
10       chiton       citron      0
11          air         aire      1
12       caltha       cathay      0
13     hereford     therefor      0
14        dhole         dole      1
15       acetal       acheta      0
16      lacking      locking      0
17      gushing      pushing      0
18       dourly       hourly      0
19     mindless     windless      0


In [58]:
from datasets import load_dataset


In [108]:
# Load the combined pair file as a single Hugging Face dataset (the "train" split of the CSV loader).
hf_dataset = load_dataset("csv", data_files="/home/toure215/BERT_phonetic/DATASETS/homophones_data/all.csv", split="train")
print(hf_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['word1', 'word2', 'label'],
    num_rows: 28133
})


In [109]:
# Hold out 10% of the rows as the test split (seeded).
# NOTE: this rebinds `hf_dataset` from a Dataset to the split result, so the cell is not re-runnable as is.
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
print(hf_dataset)

DatasetDict({
    train: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 25319
    })
    test: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 2814
    })
})


In [110]:
# Carve the validation set out of the current training split (10% of it, seeded).
# NOTE: re-running this cell splits the already reduced training set again.
train_val_split = hf_dataset["train"].train_test_split(test_size=0.1, seed=42)
hf_dataset["train"] = train_val_split["train"]
hf_dataset["validation"] = train_val_split["test"]
print(hf_dataset)

DatasetDict({
    train: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 22787
    })
    test: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 2814
    })
    validation: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 2532
    })
})


In [111]:
# Persist the train/test/validation splits so later experiments reload exactly the same data.
hf_dataset.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/homophones_data/hf_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/22787 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2814 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2532 [00:00<?, ? examples/s]

In [78]:
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [122]:
# Baseline: pretrained `bert-base-uncased` tokenizer and a sequence-classification
# model configured with two output labels.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [123]:
def tokenize_function(examples):
    """Tokenize ``word1``/``word2`` as a text pair with the global ``tokenizer``.

    No padding is applied here and inputs are truncated to at most 128 tokens.
    The tokenizer is read from the global name at call time, so rebinding
    ``tokenizer`` changes what this function uses.
    """
    return tokenizer(examples["word1"], examples["word2"], padding=False, truncation=True, max_length=128)

# Tokenize every split in batches; padding is left to the collator.
tokenized_datasets = hf_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/22787 [00:00<?, ? examples/s]

Map:   0%|          | 0/2814 [00:00<?, ? examples/s]

Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

In [124]:
# Training setup: 3 epochs, batch size 256 per device for training and evaluation,
# evaluation after every epoch, no logging, no checkpoint saving, no reporting, fp16 enabled.
training_args = TrainingArguments(
    output_dir="/tmp/homophones",
    num_train_epochs=3,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    report_to="none",
    fp16=True,
)

# Train on the "train" split and evaluate on the "validation" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [125]:
# Fine-tune the model held by `trainer` (bert-base-uncased at this point of the notebook).
trainer.train()

  0%|          | 0/270 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.18578596413135529, 'eval_runtime': 0.1831, 'eval_samples_per_second': 13831.626, 'eval_steps_per_second': 54.627, 'epoch': 1.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.11996571719646454, 'eval_runtime': 0.1774, 'eval_samples_per_second': 14268.966, 'eval_steps_per_second': 56.355, 'epoch': 2.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.11124055087566376, 'eval_runtime': 0.1792, 'eval_samples_per_second': 14128.041, 'eval_steps_per_second': 55.798, 'epoch': 3.0}
{'train_runtime': 13.7516, 'train_samples_per_second': 4971.14, 'train_steps_per_second': 19.634, 'train_loss': 0.15613762184425636, 'epoch': 3.0}


TrainOutput(global_step=270, training_loss=0.15613762184425636, metrics={'train_runtime': 13.7516, 'train_samples_per_second': 4971.14, 'train_steps_per_second': 19.634, 'total_flos': 464825762618280.0, 'train_loss': 0.15613762184425636, 'epoch': 3.0})

In [126]:
import evaluate
from scipy.special import softmax   

# Run the fine-tuned model on the held-out test split.
predictions = trainer.predict(tokenized_datasets["test"])
# `preds` holds the raw model outputs, `labels` the reference labels.
preds, labels = predictions.predictions, predictions.label_ids
print(preds.shape)

  0%|          | 0/11 [00:00<?, ?it/s]

(2814, 2)


In [127]:
# Softmax across the two output columns; column 1 is used as the score for label 1.
pred_scores = softmax(preds, axis=1)[:, 1]
# `roc` is reused by a later cell of this notebook.
roc = evaluate.load("roc_auc")
roc_score = roc.compute(references=labels, prediction_scores=pred_scores)
print(roc_score)

{'roc_auc': np.float64(0.9861180119603702)}


In [128]:
# Hard predictions: index of the largest output per row.
# They get their own name so `preds` keeps the raw 2-column outputs; overwriting
# `preds` made this cell fail when run a second time (argmax on a 1-D array).
pred_labels = preds.argmax(axis=1)
accuracy = (pred_labels == labels).mean()
print(accuracy)

0.9637526652452025


In [100]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_from_disk 

In [99]:
# Second experiment: load the phonetic BERT checkpoint.
# This rebinds the global `model` and `tokenizer`; `tokenize_function` reads the
# global `tokenizer`, so tokenization after this cell uses the phonetic tokenizer.
model = AutoModelForSequenceClassification.from_pretrained("psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1")
tokenizer = AutoTokenizer.from_pretrained("psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [112]:
# Reload the saved splits so this experiment uses the same train/test/validation rows.
hf_dataset = load_from_disk("/home/toure215/BERT_phonetic/DATASETS/homophones_data/hf_dataset")
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 22787
    })
    test: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 2814
    })
    validation: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 2532
    })
})

In [114]:
def translate_to_phonetic(examples):
    """Replace ``word1`` and ``word2`` by their ``cached_xsampa_list`` output.

    ``label`` is copied through unchanged; the returned dict has the keys
    ``word1``, ``word2`` and ``label``.
    """
    converted = {
        column: cached_xsampa_list(examples[column])
        for column in ("word1", "word2")
    }
    converted["label"] = examples["label"]
    return converted

# Transcribe every row of every split, one example at a time, using 15 worker processes.
phonetic_dataset = hf_dataset.map(translate_to_phonetic, num_proc=15)
phonetic_dataset

DatasetDict({
    train: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 22787
    })
    test: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 2814
    })
    validation: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 2532
    })
})

In [115]:
# Tokenize the phonetic word pairs; `tokenize_function` uses whatever the global
# `tokenizer` is bound to when this cell runs. This rebinds `tokenized_datasets`.
tokenized_datasets = phonetic_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/22787 [00:00<?, ? examples/s]

Map:   0%|          | 0/2814 [00:00<?, ? examples/s]

Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

In [116]:
# Same training setup as the earlier baseline run in this notebook: 3 epochs,
# batch size 256 per device, evaluation after every epoch, no logging, no
# checkpoint saving, no reporting, fp16 enabled.
training_args = TrainingArguments(
    output_dir="/tmp/homophones",
    num_train_epochs=3,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    report_to="none",
    fp16=True,
)

# Train on the "train" split and evaluate on the "validation" split of the
# current `tokenized_datasets`, with the current global `model`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [117]:
# Fine-tune the model currently held by `trainer`.
trainer.train()

  0%|          | 0/270 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.08429022878408432, 'eval_runtime': 0.2252, 'eval_samples_per_second': 11240.879, 'eval_steps_per_second': 44.395, 'epoch': 1.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.06720872223377228, 'eval_runtime': 0.2213, 'eval_samples_per_second': 11439.368, 'eval_steps_per_second': 45.179, 'epoch': 2.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.07151732593774796, 'eval_runtime': 0.2248, 'eval_samples_per_second': 11264.212, 'eval_steps_per_second': 44.487, 'epoch': 3.0}
{'train_runtime': 16.3682, 'train_samples_per_second': 4176.464, 'train_steps_per_second': 16.495, 'train_loss': 0.020849764788592302, 'epoch': 3.0}


TrainOutput(global_step=270, training_loss=0.020849764788592302, metrics={'train_runtime': 16.3682, 'train_samples_per_second': 4176.464, 'train_steps_per_second': 16.495, 'total_flos': 743616592433640.0, 'train_loss': 0.020849764788592302, 'epoch': 3.0})

In [118]:
# Run the fine-tuned model on the test split of the current `tokenized_datasets`.
predictions = trainer.predict(tokenized_datasets["test"])
# `preds` holds the raw model outputs, `labels` the reference labels.
preds, labels = predictions.predictions, predictions.label_ids

  0%|          | 0/11 [00:00<?, ?it/s]

In [121]:
from scipy.special import softmax

# Softmax across the output columns; column 1 is used as the score for label 1.
pred_scores = softmax(preds, axis=1)[:, 1]
# NOTE: `roc` is not defined in this cell; it must already exist from an
# earlier `evaluate.load("roc_auc")` call in the notebook.
roc_score = roc.compute(references=labels, prediction_scores=pred_scores)
print(roc_score)

{'roc_auc': np.float64(0.9934794412543512)}
