In [1]:
from src.dataset.phonetic_pair_dataset_v2 import PhoneticPairDataset
from src.model.sound_siamese_v2 import SoundSiamese
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.utilities.seed import seed_everything

import torch
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer 
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from typing import List
from eng_to_ipa import convert

# Seed the run through Lightning's helper so repeated executions are comparable.
# The call stays the last expression of the cell, so its return value is displayed.
GLOBAL_SEED = 0
seed_everything(GLOBAL_SEED)

  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 0


0

# Model Training

In [2]:
# Experiment tracking: runs are logged through MLflow to the local ./mlruns store.
mlf_logger = MLFlowLogger(
    experiment_name="lightning_logs",
    tracking_uri="file:./mlruns",
)

# Tunable settings for the split and the data loaders.
VAL_SIZE = 100      # number of samples held out for validation
BATCH_SIZE = 4
NUM_WORKERS = 4

# Dataset preparation: random train/validation split with a fixed validation size.
dataset = PhoneticPairDataset(
    best_pairs_path='best_pairs.csv',
    worst_pairs_path='worst_pairs.csv',
)
split_lengths = [len(dataset) - VAL_SIZE, VAL_SIZE]
train_set, val_set = torch.utils.data.random_split(dataset, split_lengths)

# Only the training loader shuffles; validation keeps the split order.
train_dataloader = DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
validation_dataloader = DataLoader(
    val_set,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

# Early stopping is currently disabled; the original callback was:
#   callbacks=[EarlyStopping(monitor="validation_loss", mode="min")]
trainer = Trainer(max_epochs=10, logger=mlf_logger)

# Model training
# NOTE(review): the recorded output of this cell ends in
# "mat1 and mat2 shapes cannot be multiplied (200x16 and 32x32)" during the
# sanity check; presumably embedding_dim=16 disagrees with a layer size inside
# SoundSiamese -- confirm against the model definition.
model = SoundSiamese(embedding_dim=16, dropout=0, add_positional=False)
trainer.fit(model, train_dataloader, validation_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name               | Type                 | Params
------------------------------------------------------------
0 | embedding          | Embedding            | 9.4 K 
1 | cos                | CosineSimilarity     | 0     
2 | p_enc_1d_model     | PositionalEncoding1D | 0     
3 | p_enc_1d_model_sum | Summer               | 0     
4 | self_attn          | MultiheadAttention   | 1.1 K 
5 | linear1            | Linear               | 1.1 K 
------------------------------------------------------------
11.6 K    Trainable params
0         Non-trainable params
11.6 K    Total params
0.046     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (200x16 and 32x32)

# Saving model

In [None]:
import torch
from pathlib import Path

# Persist only the model's state dict (not the whole module object) to "model_dict".
trained_state = model.state_dict()
torch.save(trained_state, "model_dict")

# Visualization

In [None]:
import umap
import matplotlib.pyplot as plt
import numpy as np
# Single UMAP instance, constructed with the library's default arguments.
reducer = umap.UMAP()

## Embedding visualization

In [None]:
# Pull the embedding weight matrix out of the model as a NumPy array
# (detached from autograd) and reduce it with UMAP.
embedding_w = model.embedding.weight.detach().numpy()
embedding_u = reducer.fit_transform(embedding_w)

# Scatter the first two UMAP components against each other.
x_coords, y_coords = embedding_u[:, 0], embedding_u[:, 1]
plt.scatter(x_coords, y_coords)
plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Embeddings', fontsize=24)

## Encoding visualization

In [None]:
# Walk the full dataset one item per batch, in order.
viz_dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# Take the single 'english_phonetic' entry of each batch, encode all of them in
# one call, then stack the detached encodings into one NumPy array.
phonetic_words = [batch['english_phonetic'][0] for batch in viz_dataloader]
word_encodings = model.encode(phonetic_words)
embedding_w = np.array([encoding.detach().numpy() for encoding in word_encodings])

embedding_u = reducer.fit_transform(embedding_w)

# Scatter the first two UMAP components against each other.
x_coords, y_coords = embedding_u[:, 0], embedding_u[:, 1]
plt.scatter(x_coords, y_coords)
plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Word Encodings', fontsize=24)

# Inference

In [None]:
def load_model(path: str = "model_dict", **model_kwargs):
    """Rebuild a SoundSiamese and restore weights from a saved state dict.

    The previous version built and loaded the model but never returned it, so
    the restored model was discarded and callers received ``None``.

    Args:
        path: File written by ``torch.save(model.state_dict(), ...)``.
            Defaults to "model_dict", the name used by the saving cell.
        **model_kwargs: Forwarded to ``SoundSiamese(...)``. These must match
            the hyperparameters the weights were trained with (for example
            ``embedding_dim``); otherwise ``load_state_dict`` raises on the
            mismatching parameter shapes.

    Returns:
        The restored model, switched to evaluation mode for inference.
    """
    model = SoundSiamese(**model_kwargs)
    model.load_state_dict(torch.load(path))
    # eval() returns the module itself, so the restored model is handed back.
    return model.eval()

def convert_eng_to_ipa(word: str):
    """Transcribe English text to IPA and strip stress marks and spaces.

    Args:
        word: Text handed to ``convert`` with ``keep_punct=False``.

    Returns:
        The transcription with every "ˈ", "ˌ" and " " character removed.
    """
    transcription = convert(word, keep_punct=False)
    # One translation table deletes both stress marks and the space in a single pass.
    return transcription.translate(str.maketrans("", "", "ˈˌ "))

def encode_english(words: List[str]):
    """Encode English words with the notebook-level ``model``.

    Each word is transcribed with ``convert_eng_to_ipa``; the resulting list is
    passed to ``model.encode`` in a single call.

    Args:
        words: English words to encode.

    Returns:
        Whatever ``model.encode`` returns for the list of transcriptions.
    """
    return model.encode(list(map(convert_eng_to_ipa, words)))

In [None]:
# Encode a handful of sample words; a..e follow the order of the list.
sample_words = ['cat', 'dog',  'dodge', 'chat', 'god']
a, b, c, d, e = encode_english(sample_words)
(a, b, c, d, e)

In [None]:
# Cosine similarity for selected pairs of the encodings above; each encoding is
# reshaped to a single row before being handed to model.cos.
labelled_pairs = [
    ('a x b', a, b),
    ('b x c', b, c),
    ('a x d', a, d),
    ('b x d', b, d),
]
for label, left, right in labelled_pairs:
    print(label, model.cos(left.view(1, -1), right.view(1, -1)))