In [1]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

In [2]:
import kagglehub

# Download the latest version of the "duketemon/wordnet-synonyms" dataset via
# kagglehub. `path` is the local directory that kagglehub returns for the
# downloaded files; later cells list and read files from it.
path = kagglehub.dataset_download("duketemon/wordnet-synonyms")

print("Path to dataset files:", path)

Path to dataset files: /home/toure215/.cache/kagglehub/datasets/duketemon/wordnet-synonyms/versions/2


In [3]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# access such as files[0] is deterministic across machines and runs.
files = sorted(os.listdir(path))
print("Files in dataset:", files)

Files in dataset: ['synonyms.csv', 'synonyms.json']


In [4]:
# Pick the CSV export explicitly instead of trusting the position of an entry
# in the directory listing (the folder also contains a JSON copy of the data).
csv_files = sorted(name for name in files if name.lower().endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(
        f"No CSV file found in {path!r}; available files: {files}"
    )
dataset_path = os.path.join(path, csv_files[0])

# Only treat empty fields as missing. With pandas' default NA markers, ordinary
# words such as "nan" or "null" would be read as NaN instead of as strings.
# Empty fields still become NaN, so downstream handling of them is unchanged.
pd_dataset = pd.read_csv(dataset_path, keep_default_na=False, na_values=[""])
print("Dataset shape:", pd_dataset.shape)
print(pd_dataset.head())
print("lenght of dataset:", len(pd_dataset))
print("Dataset columns:", pd_dataset.columns)

Dataset shape: (127001, 3)
         lemma part_of_speech                             synonyms
0  .22-caliber      adjective  .22 caliber;.22 calibre;.22-calibre
1  .22-calibre      adjective  .22 caliber;.22-caliber;.22 calibre
2  .22 caliber      adjective  .22-caliber;.22 calibre;.22-calibre
3  .22 calibre      adjective  .22 caliber;.22-caliber;.22-calibre
4  .38-caliber      adjective  .38 caliber;.38 calibre;.38-calibre
lenght of dataset: 127001
Dataset columns: Index(['lemma', 'part_of_speech', 'synonyms'], dtype='object')


In [5]:
# Group the raw rows into synonym sets.
#
# For every row, the lemma plus all entries of its `synonyms` field (separated
# by ";" and, within an entry, by "|") form one set. A row is skipped when its
# lemma already appeared in an earlier set or starts with an uppercase letter.
# Rows whose lemma is missing are still processed using their synonyms only.
#
# `seen`     -- every word that has been placed in some set so far.
# `synonyms` -- list of sets of words, one set per processed row.
seen = set()
synonyms = []
# Iterate over the two needed columns directly; this avoids building a Series
# (and calling fillna) for each of the ~127k rows.
for word, raw_synonyms in zip(pd_dataset["lemma"], pd_dataset["synonyms"]):
    # Missing lemmas are read as NaN (a float); treat empty strings the same
    # way so that an empty lemma can neither crash the case check nor end up
    # as a "word" in a set.
    has_lemma = isinstance(word, str) and word != ""
    if has_lemma and (word in seen or word[:1].isupper()):
        continue
    syns = {word} if has_lemma else set()
    # A missing synonyms field is NaN; it contributes no words.
    if isinstance(raw_synonyms, str):
        for entry in raw_synonyms.split(";"):
            for w in entry.strip().split("|"):
                # Skip empty tokens (blank field, trailing or doubled
                # separators). Otherwise "" would join the set and later be
                # paired with real words as a positive synonym example.
                if not w:
                    continue
                syns.add(w)
                seen.add(w)
    if has_lemma:
        seen.add(word)
    synonyms.append(syns)

In [6]:
# Sanity check: report how many synonym sets were built and show ten of them,
# drawn at random (indices are sampled independently, so repeats are possible).
num_synonym_sets = len(synonyms)
print("len(synonyms):", num_synonym_sets)
random_idx = np.random.randint(0, num_synonym_sets, size=10)
random_synonyms = [synonyms[position] for position in random_idx]
print("synonyms:", random_synonyms)

len(synonyms): 70313
synonyms: [{'Karl Menninger', 'karl menninger', 'Menninger', 'Karl Augustus Menninger'}, {'snaffle', 'snaffle bit'}, {'resurrection plant', 'Anastatica hierochuntica', 'anastatica hierochuntica', 'rose of Jericho'}, {'flower arrangement', 'floral arrangement'}, {'Pan American Day', 'April 14', 'april 14'}, {'order Scleroparei', 'Scleroparei', 'order scleroparei'}, {"binder's board", 'binder board'}, {'civil law', 'jus civile', 'roman law', 'Roman law', 'Justinian code'}, {'malacca cane', 'malacca'}, {'quandang', 'quandong', 'quantong', 'native peach'}]


In [7]:
# Positive examples: every unordered pair of distinct words inside each
# synonym set, emitted once as (earlier_member, later_member) following the
# order in which the set is converted to a list.
synonyms_pairs = [
    (members[first], members[second])
    for members in map(list, synonyms)
    for first in range(len(members))
    for second in range(first + 1, len(members))
]

print("len(synonyms_pairs):", len(synonyms_pairs))

len(synonyms_pairs): 482803


In [8]:
# Negative examples: as many random word pairs as there are positive pairs.
#
# A candidate takes the first word of one random positive pair and the second
# word of another. Candidates are rejected when both words are identical or
# when the two words are already listed as synonyms (in either order);
# otherwise true synonyms would be labelled as non-synonyms. Only pairs
# present in `synonyms_pairs` are known here, so this removes the label noise
# that can be detected, not every possible synonym.
known_synonyms = {frozenset(pair) for pair in synonyms_pairs}
target_count = len(synonyms_pairs)

# Guard against an endless loop on degenerate inputs (for example a single
# synonym set, where every candidate is a synonym pair).
max_attempts = 100 * target_count
attempts = 0

non_synonyms_pairs = []
while len(non_synonyms_pairs) < target_count:
    if attempts >= max_attempts:
        raise RuntimeError(
            "Could not sample enough non-synonym pairs: "
            f"{len(non_synonyms_pairs)} of {target_count} after {attempts} attempts"
        )
    attempts += 1
    i = np.random.randint(target_count)
    j = np.random.randint(target_count)
    word1 = synonyms_pairs[i][0]
    word2 = synonyms_pairs[j][1]
    # The i == j case is covered too: that candidate is a known synonym pair.
    if word1 == word2 or frozenset((word1, word2)) in known_synonyms:
        continue
    non_synonyms_pairs.append((word1, word2))

print("len(non_synonyms_pairs):", len(non_synonyms_pairs))

len(non_synonyms_pairs): 482803


In [9]:
# Turn both pair lists into labelled frames: label 1 for synonym pairs,
# label 0 for the randomly sampled pairs.
pair_columns = ["word1", "word2"]
pd_synonyms = pd.DataFrame(synonyms_pairs, columns=pair_columns).assign(label=1)
pd_non_synonyms = pd.DataFrame(non_synonyms_pairs, columns=pair_columns).assign(label=0)

# Stack the two frames, shuffle all rows, and renumber the index.
# Note: `pd_dataset` is rebound here; it no longer refers to the raw CSV frame.
pd_dataset = (
    pd.concat([pd_synonyms, pd_non_synonyms], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)
print("pd_dataset shape:", pd_dataset.shape)
print(pd_dataset.head(20))

pd_dataset shape: (965606, 3)
                    word1                 word2  label
0                     ace      genus ozothamnus      0
1             agriculture              pin down      0
2                 gauguin               drunken      0
3                  ruffle         genus Cacicus      0
4        Ciudad de Mexico      volcan de colima      0
5                 Semitic               semitic      1
6              James Mill     milling machinery      1
7                 Maltese       Maltese terrier      1
8        chinese primrose      Primula sinensis      1
9                  nobble              overturn      1
10                 genius  Shigella dysentariae      0
11                work on                  work      1
12    family Haloragaceae          Black Prince      0
13           shall-flower     tall meadow grass      0
14  family Plumbaginaceae        Plumbaginaceae      1
15                   bind        fragrant sumac      0
16             gloomy Gus          

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, then 10% of the remainder for
# validation. The splits are random and unseeded.
train_val_frame, test_frame = train_test_split(pd_dataset, test_size=0.2)
train_frame, val_frame = train_test_split(train_val_frame, test_size=0.1)

# Convert each pandas frame to a Hugging Face Dataset (dropping the pandas
# index) and collect them under the split names used by later cells.
split_frames = {"train": train_frame, "validation": val_frame, "test": test_frame}
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame, preserve_index=False)
        for split_name, frame in split_frames.items()
    }
)
train, val, test = dataset["train"], dataset["validation"], dataset["test"]
dataset

DatasetDict({
    train: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 695235
    })
    validation: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 77249
    })
    test: Dataset({
        features: ['word1', 'word2', 'label'],
        num_rows: 193122
    })
})

In [11]:
# Persist all splits to disk. NOTE: this is a machine-specific absolute path;
# adjust it when running elsewhere.
synonyms_dataset_dir = "/home/toure215/BERT_phonetic/DATASETS/synonyms/synonyms_dataset"
dataset.save_to_disk(synonyms_dataset_dir)

Saving the dataset (0/1 shards):   0%|          | 0/695235 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/77249 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/193122 [00:00<?, ? examples/s]

In [12]:
# Load the tokenizer and a two-label sequence-classification model from the
# same pretrained checkpoint.
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_function(examples):
    """Tokenize a batch of word pairs with the notebook-level `tokenizer`.

    Args:
        examples: Mapping with "word1" and "word2" entries; each is passed to
            the tokenizer as the first and second text input respectively.

    Returns:
        Whatever the tokenizer returns for the pair inputs. Inputs are
        truncated to at most 128 tokens and no padding is applied here.
    """
    first_words = examples["word1"]
    second_words = examples["word2"]
    tokenizer_options = {
        "padding": False,
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(first_words, second_words, **tokenizer_options)


# Tokenize every split in batches using 15 worker processes. The raw text
# columns are dropped afterwards; the "label" column is kept.
dataset_tokenized = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=15,
    remove_columns=["word1", "word2"],
)

Map (num_proc=15):   0%|          | 0/695235 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/77249 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/193122 [00:00<?, ? examples/s]

In [14]:
# Same per-device batch size for training and evaluation.
batch_size = 256

# Three epochs with an evaluation pass after each one. Logging and checkpoint
# saving are switched off, and fp16 is enabled. The seed is drawn at random on
# every run, so repeated runs are intentionally not identical.
training_args = TrainingArguments(
    output_dir="/tmp/bb",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    fp16=True,
    seed=np.random.randint(1e6),
)

# The tokenization step applies no padding, so batches are padded by this
# collator when they are assembled.
data_collector = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collector,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["validation"],
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [15]:
# Fine-tune the model with the configured Trainer; the returned training
# summary is displayed as the cell output.
trainer.train()

  0%|          | 0/8148 [00:00<?, ?it/s]

  0%|          | 0/302 [00:00<?, ?it/s]

{'eval_loss': 0.131100133061409, 'eval_runtime': 6.7752, 'eval_samples_per_second': 11401.778, 'eval_steps_per_second': 44.575, 'epoch': 1.0}


  0%|          | 0/302 [00:00<?, ?it/s]

{'eval_loss': 0.11025826632976532, 'eval_runtime': 7.1807, 'eval_samples_per_second': 10757.857, 'eval_steps_per_second': 42.057, 'epoch': 2.0}


  0%|          | 0/302 [00:00<?, ?it/s]

{'eval_loss': 0.1121121272444725, 'eval_runtime': 7.0396, 'eval_samples_per_second': 10973.431, 'eval_steps_per_second': 42.9, 'epoch': 3.0}
{'train_runtime': 569.3705, 'train_samples_per_second': 3663.177, 'train_steps_per_second': 14.311, 'train_loss': 0.11309030164850653, 'epoch': 3.0}


TrainOutput(global_step=8148, training_loss=0.11309030164850653, metrics={'train_runtime': 569.3705, 'train_samples_per_second': 3663.177, 'train_steps_per_second': 14.311, 'total_flos': 2.112621903741342e+16, 'train_loss': 0.11309030164850653, 'epoch': 3.0})

In [16]:
# Score the trained model on the held-out synonym test split: take the
# highest-scoring class per example and report the fraction matching the labels.
test_logits, labels, _ = trainer.predict(dataset_tokenized["test"])
predictions = test_logits.argmax(axis=1)
accuracy = (predictions == labels).mean()
print("Accuracy:", accuracy)

  0%|          | 0/755 [00:00<?, ?it/s]

Accuracy: 0.966471971085635


In [17]:
# Load the previously saved homophone dataset. NOTE: this is a
# machine-specific absolute path; adjust it when running elsewhere.
homophone_dataset_dir = "/home/toure215/BERT_phonetic/DATASETS/homophones_data/hf_dataset"
homophone_dataset = load_from_disk(homophone_dataset_dir)

In [18]:
# Tokenize the homophone splits exactly like the synonym data: same function,
# batched, 15 worker processes, raw text columns removed afterwards.
tokenized_dataset = homophone_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=15,
    remove_columns=["word1", "word2"],
)

Map (num_proc=15):   0%|          | 0/22787 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/2814 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/2532 [00:00<?, ? examples/s]

In [19]:
# Apply the synonym-trained model to the homophone test split and report the
# fraction of argmax predictions that equal that split's labels.
homophone_logits, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = homophone_logits.argmax(axis=1)
accuracy = (predictions == labels).mean()
print("Accuracy on homophones:", accuracy)

  0%|          | 0/11 [00:00<?, ?it/s]

Accuracy on homophones: 0.35678749111584934
