# Training of the Phonetic model

In [1]:
from src.pairing.dataset.phonetic_pair_dataset import PhoneticPairDataset
from src.pairing.dataset.phonetic_triplet_dataset import PhoneticTripletDataset
from src.pairing.model.phonetic_siamese import PhoneticSiamese
from src.pairing.training.config import CONFIG, LossType
from torch.utils.data import Dataset
from pathlib import Path

from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.utilities.seed import seed_everything
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import mlflow
import torch

seed_everything(0)

  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 0


0

In [2]:
def get_dataset() -> Dataset:
    """Returns a Dataset object given loss type."""
    if CONFIG.loss_type == LossType.Pair:
        dataset = PhoneticPairDataset(
            best_pairs_path=CONFIG.best_pairs_dataset, worst_pairs_path=CONFIG.worst_pairs_dataset
        )
    elif CONFIG.loss_type == LossType.Triplet:
        dataset = PhoneticTripletDataset(
            best_pairs_path=CONFIG.best_pairs_dataset, worst_pairs_path=CONFIG.worst_pairs_dataset
        )
    else:
        raise ValueError(f'Unknown loss type given: {CONFIG.loss_type}')
    return dataset

dataset = get_dataset()

In [3]:
third = len(dataset)//3
train_set, val_set, test_set = torch.utils.data.random_split(
    dataset, [third, third, len(dataset)-2*third]
)

In [4]:
def instanciate(kwargs):
    train_dataloader = DataLoader(
        train_set, batch_size=kwargs["batch_size"], shuffle=True, num_workers=4
    )
    validation_dataloader = DataLoader(
        val_set, batch_size=kwargs["batch_size"], num_workers=4
    )
    test_dataloader = DataLoader(
        test_set, batch_size=kwargs["batch_size"], num_workers=4
    )
    model = PhoneticSiamese(
        embedding_dim=kwargs["embedding_dim"],
        dim_feedforward=kwargs["dim_feedforward"],
        nhead=kwargs["nhead"],
        dropout=kwargs["dropout"],
        loss_type=CONFIG.loss_type,
        batch_size=kwargs["batch_size"],
        weight_decay=kwargs["weight_decay"],
        lr=kwargs["lr"],
        margin=kwargs["margin"]
    )
    return {
        "train_dataloader": train_dataloader,
        "validation_dataloader": validation_dataloader,
        "test_dataloader": test_dataloader,
        "model": model,
    }

In [5]:
def train(
        dropout,
        lr,
        weight_decay,
        dim_feedforward,
        batch_size,
        nhead,
        embedding_dim,
        margin
):
    mlf_logger = MLFlowLogger(
        experiment_name=CONFIG.experiment_name, tracking_uri=CONFIG.log_folder
    )
    trainer = Trainer(
        max_epochs=CONFIG.max_epochs,
        logger=mlf_logger,
        callbacks=[EarlyStopping(monitor="validation_loss", mode="min")],
        accelerator="gpu", devices=1
    )
    instance = instanciate(
        {
            "dropout": dropout,
            "lr": lr,
            "weight_decay": weight_decay,
            "dim_feedforward": dim_feedforward,
            "batch_size": batch_size,
            "nhead": nhead,
            "embedding_dim": embedding_dim,
            "margin": margin,
            "model": "phonetic_siamese"
        }
    )

    mlflow.pytorch.autolog()

    with mlflow.start_run():
        model = fit_model(
            instance["model"],
            instance["train_dataloader"],
            instance["validation_dataloader"],
            trainer,
        )

        test_loss = test_model(model, instance["test_dataloader"], trainer)[0]["test_loss"]

        torch.save(model.state_dict(), "model_dict")
        mlflow.log_artifact("model_dict", "model_dict")

    return model, test_loss

def fit_model(model, train_dataloader, validation_dataloader, trainer):
    trainer.fit(model, train_dataloader, validation_dataloader)
    return model

def test_model(model, test_dataloader, trainer):
    return trainer.test(model, test_dataloader, verbose=False)

In [6]:
model, test_loss = train(
    dropout=0.2,
    lr=1e-3,
    weight_decay=1e-3,
    dim_feedforward=16,
    batch_size=16,
    nhead=2,
    embedding_dim=16,
    margin=0.2
)

print(f'Final test loss: {test_loss}')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type                    | Params
---------------------------------------------------------------
0 | embedding          | Embedding               | 9.4 K 
1 | encoder            | TransformerEncoderLayer | 1.7 K 
2 | cos                | CosineSimilarity        | 0     
3 | p_enc_1d_model     | PositionalEncoding1D    | 0     
4 | p_enc_1d_model_sum | Summer                  | 0     
5 | triplet_loss       | TripletMarginLoss       | 0     
---------------------------------------------------------------
11.1 K    Trainable params
0         Non-trainable params
11.1 K    Total params
0.044     Total estimated model params size (MB)


Epoch 11: 100%|██████████| 110/110 [00:14<00:00,  7.35it/s, loss=0, v_num=0b1c, validation_loss=0.014, training_loss=0.0116]         


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 55/55 [00:00<00:00, 166.67it/s]
Final test loss: 0.004024611786007881


## Inference example

In [7]:
device = torch.device('cuda:0')

In [8]:
import numpy as np
model.eval().to(device)

for i in np.random.randint(0, len(test_set), 10):
    sample = test_set[i]
    anchor_match = sample['anchor_phonetic']
    positive_match = sample['similar_phonetic']
    negative_match = sample['distant_phonetic']

    anchor_embedding = model.encode([anchor_match])
    positive_embedding = model.encode([positive_match])
    negative_embedding = model.encode([negative_match])

    loss = model.triplet_loss(anchor_embedding, positive_embedding, negative_embedding)

    positive_dist = torch.cdist(anchor_embedding, positive_embedding, p=2)
    negative_dist = torch.cdist(anchor_embedding, negative_embedding, p=2)

    print(f"Loss: {loss}")
    print(f"Positive: {positive_dist}")
    print(f"Negative: {negative_dist}")

Loss: 0.0
Positive: tensor([[11.8748]], device='cuda:0', grad_fn=<CdistBackward0>)
Negative: tensor([[21.9532]], device='cuda:0', grad_fn=<CdistBackward0>)
Loss: 0.0
Positive: tensor([[6.4286]], device='cuda:0', grad_fn=<CdistBackward0>)
Negative: tensor([[9.6256]], device='cuda:0', grad_fn=<CdistBackward0>)
Loss: 0.0
Positive: tensor([[8.8924]], device='cuda:0', grad_fn=<CdistBackward0>)
Negative: tensor([[10.4454]], device='cuda:0', grad_fn=<CdistBackward0>)
Loss: 0.4046163558959961
Positive: tensor([[8.4165]], device='cuda:0', grad_fn=<CdistBackward0>)
Negative: tensor([[8.2118]], device='cuda:0', grad_fn=<CdistBackward0>)
Loss: 0.0
Positive: tensor([[8.1106]], device='cuda:0', grad_fn=<CdistBackward0>)
Negative: tensor([[9.7890]], device='cuda:0', grad_fn=<CdistBackward0>)
Loss: 0.9210586547851562
Positive: tensor([[8.8377]], device='cuda:0', grad_fn=<CdistBackward0>)
Negative: tensor([[8.1167]], device='cuda:0', grad_fn=<CdistBackward0>)
Loss: 0.0
Positive: tensor([[5.1002]], devi

In [9]:
from eng_to_ipa import convert
words = ['dog', 'parade', 'cascade', "palace", "table"]
ipas = [convert(x) for x in words]

device = torch.device('cuda:0')
model.eval().to(device)
embdedings = model.encode(ipas)

a = torch.cdist(embdedings[1].view(1, -1), embdedings[0].view(1, -1), p=2)
b = torch.cdist(embdedings[1].view(1, -1), embdedings[2].view(1, -1), p=2)
c = torch.cdist(embdedings[1].view(1, -1), embdedings[3].view(1, -1), p=2)
d = torch.cdist(embdedings[1].view(1, -1), embdedings[4].view(1, -1), p=2)

a, b, c, d

(tensor([[8.9612]], device='cuda:0', grad_fn=<CdistBackward0>),
 tensor([[13.3801]], device='cuda:0', grad_fn=<CdistBackward0>),
 tensor([[5.5410]], device='cuda:0', grad_fn=<CdistBackward0>),
 tensor([[7.6848]], device='cuda:0', grad_fn=<CdistBackward0>))