In [1]:
import os
import random
from typing import Tuple, Dict

import numpy as np
import torch
from torch import optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from dotenv import load_dotenv
import torch.nn.functional as F

import MD.main as main
import pickle
import json

In [2]:
load_dotenv()
data_path = r'{}'.format(os.environ['DATASET_PATH'])

# Получение данных
id_list = os.listdir(data_path)
batch_size = 16
mfcc_count = 40
target_sr = 16000

In [4]:
with open('voice_params_50pers_5_40mfccs.pkl', 'rb') as f:
    voice_params = pickle.load(f)

In [3]:
max_voices = 3
segment_length = 0
clear = True

# Получение голосовых признаков
voice_params = {}
pickle_file = r'voice_params_100pers_3_40mfccs_no_segm.pkl'
for person_id in id_list:
    files = main.get_audio_for_id(data_path, person_id)
    person_params = []
    voice_cnt = 0
    for file in files:
        normilize_audio = main.preprocess_audio(file, target_sr=target_sr, segment_length=segment_length, clear=clear)
        if segment_length:
            person_params.extend(
                [main.get_mfccs(audio, sample_rate=target_sr, n_mfcc=mfcc_count) for audio in normilize_audio])
        else:
            person_params.append(main.get_mfccs(normilize_audio, sample_rate=target_sr, n_mfcc=mfcc_count))
        voice_cnt += 1
        if voice_cnt >= max_voices:
            break
    voice_params[person_id] = person_params
    print(f'Person {person_id} saved.')
with open(pickle_file, 'wb') as f:
    pickle.dump(voice_params, f)

  power_ratio = min(power_ratio, 1 / power_ratio)
  power_ratio = phrases[next_i]['power'] / phrases[i]['power']


Person id10001 saved.
Person id10002 saved.
Person id10003 saved.
Person id10004 saved.
Person id10005 saved.
Person id10006 saved.
Person id10007 saved.
Person id10008 saved.
Person id10009 saved.
Person id10010 saved.
Person id10011 saved.
Person id10012 saved.
Person id10013 saved.
Person id10014 saved.
Person id10015 saved.
Person id10016 saved.
Person id10017 saved.
Person id10018 saved.
Person id10019 saved.
Person id10020 saved.
Person id10021 saved.
Person id10022 saved.
Person id10023 saved.
Person id10024 saved.
Person id10025 saved.
Person id10026 saved.
Person id10027 saved.
Person id10028 saved.
Person id10029 saved.
Person id10030 saved.
Person id10031 saved.
Person id10032 saved.
Person id10033 saved.
Person id10034 saved.
Person id10035 saved.
Person id10036 saved.
Person id10037 saved.
Person id10038 saved.
Person id10039 saved.
Person id10040 saved.
Person id10041 saved.
Person id10042 saved.
Person id10043 saved.
Person id10044 saved.
Person id10045 saved.
Person id1

ValueError: operands could not be broadcast together with shapes (0,) (6080,) 

In [34]:
voice_params['id10004'][0].shape

(40, 286)

In [4]:
class VoiceEmbeddingModel(nn.Module):
    def __init__(self, input_size: int = 40, channels_size: int = 128, lstm_out_size: int = 128):
        super(VoiceEmbeddingModel, self).__init__()
        self.conv1 = nn.Conv2d(input_size, channels_size, 5, padding=2)
        self.bn1 = nn.BatchNorm1d(channels_size)
        self.pool = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(channels_size, channels_size, 5, padding=2)
        self.flatten = nn.Flatten()
        # num_channels = channels_size * (128//4)
        self.lstm1 = nn.LSTM(channels_size, lstm_out_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(lstm_out_size * 2, 256)
        self.fc2 = nn.Linear(256, 128)  # Предположим, размерность эмбеддинга 128

    def forward(self, x):
        x = self.pool(nn.functional.relu(self.bn1(self.conv1(x))))
        x = self.pool(nn.functional.relu(self.bn1(self.conv2(x))))
        x = x.transpose(1, 2)
        x, _ = self.lstm1(x)
        x = x[:, -1, :]
        # x = self.flatten(x)
        x = self.dropout(nn.functional.relu(self.fc1(x)))
        x = self.fc2(x)
        return x


class VoiceEmbeddingCNN(nn.Module):
    def __init__(self):
        super(VoiceEmbeddingCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=(3, 3), padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=(3, 3), padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(32 * 5 * 8, 128)

    def forward(self, x):
        x = x.unsqueeze(1)  # Добавляем размерность канала
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = x.view(-1, 32 * 5 * 8)  # Выравниваем в одномерный вектор
        x = F.relu(self.fc1(x))
        return x

In [5]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = F.relu(self.bn1(out))
        out = self.conv2(out)
        out = self.bn2(out)
        out += identity  # Add the residual (skip) connection
        return F.relu(out)


class ResCNN(nn.Module):
    def __init__(self, in_channels=1):
        super(ResCNN, self).__init__()

        # Первоначальный свёрточный слой
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=5, stride=2, padding=2)
        self.bn1 = nn.BatchNorm2d(64)

        # Residual блоки
        self.res_block1 = ResidualBlock(64, 64)
        self.res_block2 = ResidualBlock(128, 128)
        self.res_block3 = ResidualBlock(256, 256)
        self.res_block4 = ResidualBlock(256, 512)

        # Последующие свёрточные слои
        self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2)
        self.bn2 = nn.BatchNorm2d(128)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=2, padding=2)
        self.bn3 = nn.BatchNorm2d(256)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=5, stride=2, padding=2)
        self.bn4 = nn.BatchNorm2d(512)

        # Временной пуллинг
        self.pool = nn.AvgPool2d((1, 1))

        # Полносвязный слой
        self.fc = nn.Linear(512, 512)

        # Нормализация длины
        self.ln = nn.BatchNorm1d(512)

    def forward(self, x):
        # Начальный свёрточный слой
        x = self.conv1(x)
        x = F.relu(self.bn1(x))

        # Пропуск через рес-блоки
        x = self.res_block1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.res_block2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.res_block3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.res_block4(x)

        # Временной пуллинг
        x = self.pool(x)

        # Полносвязный слой и нормализация длины
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        x = self.ln(x)

        return x

In [6]:
class ClippedReLU(nn.Module):
    def __init__(self, max_value=20):
        super(ClippedReLU, self).__init__()
        self.max_value = max_value

    def forward(self, x):
        return x.clamp(min=0, max=self.max_value)


class ResBlock(nn.Module):
    def __init__(self, filters: int):
        super(ResBlock, self).__init__()
        self.conv = nn.Conv2d(filters, filters, kernel_size=3, padding=1)
        self.clipped_relu = ClippedReLU(max_value=20)
        self.bn = nn.BatchNorm2d(filters)
        self.identity = nn.Identity()

    def forward(self, x):
        x = self.conv(x)
        x = self.clipped_relu(self.bn(x))
        x = self.conv(x)
        x = self.bn(x)
        x = self.clipped_relu(x)
        x = self.identity(x)  # Add the residual (skip) connection
        out = self.clipped_relu(x)
        return F.relu(out)


class ConvResBlock(nn.Module):
    def __init__(self, filters: int):
        in_channel = filters // 2 if filters > 64 else 1
        super(ConvResBlock, self).__init__()
        self.conv = nn.Conv2d(in_channel, filters, kernel_size=5, padding=2, stride=2)
        self.clipped_relu = ClippedReLU(max_value=20)
        self.res_block = ResBlock(filters)
        self.bn = nn.BatchNorm2d(filters)
        self.identity = nn.Identity()

    def forward(self, x):
        x = self.conv(x)
        x = self.clipped_relu(self.bn(x))
        x = self.res_block(x)
        x = self.res_block(x)
        out = self.res_block(x)
        return out


class DeepSpeakerModel(nn.Module):
    def __init__(self, include_softmax=False):
        super(DeepSpeakerModel, self).__init__()
        self.include_softmax = include_softmax

        # Convolution and Residual blocks setup
        self.conv1 = ConvResBlock(64)
        self.conv2 = ConvResBlock(128)
        self.conv3 = ConvResBlock(256)
        self.conv4 = ConvResBlock(512)
        self.adaptive_avg_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Dense and output layers
        self.flatten = nn.Flatten()
        self.dense1 = nn.Linear(512, 512)
        if include_softmax:
            self.dropout = nn.Dropout(0.5)
        else:
            self.norm = lambda x: F.normalize(x, p=2, dim=1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.adaptive_avg_pool(x)

        # x = x.view(-1, 2048)

        x = self.flatten(x)
        # x = torch.mean(x, dim=1) 

        x = self.dense1(x)
        if self.include_softmax:
            x = self.dropout(x)
            x = self.output(x)
            x = F.softmax(x, dim=1)
        else:
            x = self.norm(x)
        return x


In [7]:
class VoicePairsDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        mfcc1, mfcc2, label = self.pairs[idx]
        return torch.tensor(mfcc1).t(), torch.tensor(mfcc2).t(), torch.tensor([label])

    @staticmethod
    def collate_fn(batch) -> Tuple:
        mfcc1s, mfcc2s, labels = zip(*batch)
        mfcc1s_padded = pad_sequence(mfcc1s, batch_first=True, padding_value=0).transpose(1, 2)
        mfcc2s_padded = pad_sequence(mfcc2s, batch_first=True, padding_value=0).transpose(1, 2)

        labels = torch.stack(labels)

        return mfcc1s_padded, mfcc2s_padded, labels


class VoiceTripletsDataset(Dataset):
    def __init__(self, pairs):
        self.triplets = pairs

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        mfcc1, mfcc2, mfcc3 = self.triplets[idx]
        return torch.tensor(mfcc1).t(), torch.tensor(mfcc2).t(), torch.tensor(mfcc3).t()

    @staticmethod
    def collate_fn(batch) -> Tuple:
        mfcc1s, mfcc2s, mfcc3 = zip(*batch)
        mfcc1s_padded = pad_sequence(mfcc1s, batch_first=True, padding_value=0).transpose(1, 2).unsqueeze(1)
        mfcc2s_padded = pad_sequence(mfcc2s, batch_first=True, padding_value=0).transpose(1, 2).unsqueeze(1)
        mfcc3_padded = pad_sequence(mfcc3, batch_first=True, padding_value=0).transpose(1, 2).unsqueeze(1)

        return mfcc1s_padded, mfcc2s_padded, mfcc3_padded

In [8]:
class ConstrastiveLoss(nn.Module):
    def __init__(self, margin: float = 1.0):
        super(ConstrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((1 - label) * torch.pow(euclidean_distance, 0.5) +
                                      (label) * torch.pow(
            torch.clamp(self.margin - euclidean_distance, min=0.0), 0.5))
        loss_contrastive *= 1000
        return loss_contrastive


class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        positive_distance = F.pairwise_distance(anchor, positive)
        negative_distance = F.pairwise_distance(anchor, negative)
        losses = F.relu(positive_distance - negative_distance + self.margin)
        return losses.mean()

In [9]:
def cosine_similarity(vec1, vec2):
    return torch.nn.functional.cosine_similarity(vec1.unsqueeze(0), vec2.unsqueeze(0))

In [28]:
def triplet_accuracy(anchor, positive, negative):
    pos_dist = torch.norm(anchor - positive, dim=1)
    neg_dist = torch.norm(anchor - negative, dim=1)
    return (pos_dist < neg_dist).float().mean()


def train_model(model: nn.Module, dataloaders: Dict, criterion, lr=0.001,
                epoches: int = 25, device: str = 'cuda', save_name: str = None) -> nn.Module:
    """
    Обучает модель и выводит информацию о процессе обучения.
   :param model: torch.nn.Module -  Модель для обучения.
   :param dataloaders: dict - Словарь содержащий 'train' и 'val' DataLoader.
   :param criterion: torch.nn.modules.loss - Функция потерь.
   :param lr: float
   :param epoches: int - Количество эпох обучения.
   :param device: str - Устройство для обучения ('cuda' или 'cpu').
    :return: nn.Module - Обученная модель.
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_model_wts = model.state_dict()
    best_acc = 0.0
    stat = {}
    for epoch in range(epoches):
        print(f'Epoch {epoch + 1}/{epoches}')
        print('-' * 10)

        # Каждая эпоха имеет фазу обучения и валидации
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Установка модели в режим обучения
            else:
                model.eval()  # Установка модели в режим оценки

            running_loss = 0.0
            running_corrects = 0

            # Итерация по данным.
            for inputs1, inputs2, inputs3 in dataloaders[phase]:
                inputs1 = inputs1.to(device)
                inputs2 = inputs2.to(device)
                inputs3 = inputs3.to(device)

                # Обнуление градиентов параметров
                optimizer.zero_grad()

                # Прямой проход
                with torch.set_grad_enabled(phase == 'train'):
                    outputs1 = model(inputs1)
                    outputs2 = model(inputs2)
                    outputs3 = model(inputs3)
                    loss = criterion(outputs1, outputs2, outputs3)

                    # Обратное распространение и оптимизация только в фазе обучения
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                acc = triplet_accuracy(outputs1, outputs2, outputs3)
                # Статистика
                running_loss += loss.item() * inputs1.size(0)
                running_corrects += acc.item() * inputs1.size(0)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects / len(dataloaders[phase].dataset)
            stat[f'{epoch}'] = {'epoch_loss': epoch_loss,
                                'epoch_acc': epoch_acc}
            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Копирование модели, если она показала лучшую точность
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = model.state_dict()
                if save_name:
                    torch.save(model.state_dict(), os.path.join('models', f'{save_name}_epo{epoch}.pth'))
                    json.dump(stat, open(os.path.join('models', f'{save_name}.json'), 'w'))
        print()
    print(f'Лучшая точность валидации: {best_acc:.4f}')

    # Загрузка лучших весов модели
    model.load_state_dict(best_model_wts)
    return model, stat

In [29]:
data_pairs = main.create_triplets(voice_params)
random.shuffle(data_pairs)
# Разделение на обучающую и валидационную выборки
val_size = int(0.2 * len(data_pairs))
data_train = data_pairs[val_size:]
data_val = data_pairs[:val_size]

dataset_train = VoiceTripletsDataset(data_train)
dataset_val = VoiceTripletsDataset(data_val)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True,
                              collate_fn=VoiceTripletsDataset.collate_fn)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True,
                            collate_fn=VoiceTripletsDataset.collate_fn)
dataloaders = {'train': dataloader_train,
               'val': dataloader_val}

In [30]:
model_name = 'DeepSpeaker_100p_3_40mfcc_001'
model = DeepSpeakerModel()
model, stat = train_model(model, dataloaders, TripletLoss(), epoches=50, save_name=model_name)
torch.save(model.state_dict(), os.path.join('models', f'{model_name}___end.pth'))

Epoch 1/30
----------
train Loss: 0.9895 Acc: 0.5250
val Loss: 1.0018 Acc: 0.4000

Epoch 2/30
----------
train Loss: 0.8058 Acc: 0.6000
val Loss: 1.0599 Acc: 0.5500

Epoch 3/30
----------
train Loss: 0.8442 Acc: 0.4750
val Loss: 0.9913 Acc: 0.5000

Epoch 4/30
----------
train Loss: 0.8841 Acc: 0.6750
val Loss: 0.9724 Acc: 0.6000

Epoch 5/30
----------
train Loss: 0.8139 Acc: 0.6875
val Loss: 0.9288 Acc: 0.5500

Epoch 6/30
----------
train Loss: 0.8147 Acc: 0.7500
val Loss: 0.9481 Acc: 0.6500

Epoch 7/30
----------
train Loss: 0.8074 Acc: 0.8750
val Loss: 0.9439 Acc: 0.6500

Epoch 8/30
----------
train Loss: 0.7953 Acc: 0.6375
val Loss: 0.8605 Acc: 0.5500

Epoch 9/30
----------
train Loss: 0.8466 Acc: 0.7500
val Loss: 1.0149 Acc: 0.4500

Epoch 10/30
----------
train Loss: 0.7746 Acc: 0.6500
val Loss: 0.9588 Acc: 0.8000

Epoch 11/30
----------
train Loss: 0.8553 Acc: 0.5875
val Loss: 0.9659 Acc: 0.8500

Epoch 12/30
----------
train Loss: 0.7893 Acc: 0.7000
val Loss: 0.9974 Acc: 1.0000

E

In [None]:
torch.save(model.state_dict(), f'speak_rec_20_256_128_15epo_triplets_001.pth')

In [24]:
len(voice_params['id10004'])

12

In [14]:
model = VoiceEmbeddingModel(15).to('cuda')
model.load_state_dict(torch.load(f'speak_rec_15_256_128_10epo.pth'))

<All keys matched successfully>

In [11]:
v1 = main.get_voice_mfccs(main.get_audio_for_id(data_path, id_list[10])[0], n_mfcc=15)


In [12]:
model.eval()

VoiceEmbeddingModel(
  (conv1): Conv1d(15, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (lstm1): LSTM(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
)

In [27]:
roma1 = main.get_voice_mfccs(r'D:\University\Диссерт\val_data\Roma1.ogg', clear=True, clear_output=r'output/Roma1.wav',
                             n_mfcc=15)
roma1 = torch.tensor(roma1, dtype=torch.float32).unsqueeze(0).to('cuda')
roma1_emb = model(roma1)

In [28]:
roma2 = main.get_voice_mfccs(r'D:\University\Диссерт\val_data\Roma2.ogg', clear=True, clear_output=r'output/Roma2.wav',
                             n_mfcc=15)
roma2 = torch.tensor(roma2, dtype=torch.float32).unsqueeze(0).to('cuda')
roma2_emb = model(roma2)

In [29]:
rad1 = main.get_voice_mfccs(r'D:\University\Диссерт\val_data\Rad1.ogg', clear=True, clear_output=r'output/Rad1.wav',
                            n_mfcc=15)
rad1 = torch.tensor(rad1, dtype=torch.float32).unsqueeze(0).to('cuda')
rad1_emb = model(rad1)

In [38]:
rad2 = main.get_voice_mfccs(r'D:\University\Диссерт\val_data\Rad2.ogg', clear=True, clear_output=r'output/Rad2.wav',
                            n_mfcc=15)
rad2 = torch.tensor(rad2, dtype=torch.float32).unsqueeze(0).to('cuda')
rad2_emb = model(rad2)

In [32]:
cosine_similarity(roma2_emb, roma1_emb)

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.]], device='cuda:0', grad_fn=<SumBackward1>)

In [20]:
embedding1 = torch.tensor([1.0, 2.0, 3.0])
embedding2 = torch.tensor([4.0, 5.0, 6.0])
cosine_similarity(embedding1, embedding2)

tensor([0.9746])

In [42]:
torch.norm(rad1_emb - roma2_emb)

tensor(7.5912, device='cuda:0', grad_fn=<LinalgVectorNormBackward0>)

In [43]:
rad2_emb

tensor([[ -1.3423,  -2.7562,  -8.6419, -27.2519,  18.2805, -27.3712,  23.5874,
         -28.8533,  14.0961, -18.4135,  27.2655, -32.8835, -13.7695,   1.5922,
         -13.4152,  29.1782, -32.8263,   1.3149,  20.3020,  -8.8157, -28.6114,
           0.4546,   2.9151,  -3.2998, -28.4159,  34.9340,  -0.6875,  -8.6741,
         -33.7749, -18.3857,  24.1713,  35.5761,  28.9477,  -5.7407, -26.5015,
          28.8841,  21.9906, -12.5852,  -2.6302,   6.4142, -20.8082, -30.0826,
         -29.2466,  -7.0453, -29.8049,   3.6251,  -1.7733,  23.2055,  21.0578,
           9.0684,   6.8325, -23.0020,  22.9182,  23.0082,  -4.6133, -22.5401,
          24.2017,  10.3489, -12.7686,  14.3108,   1.7776,  15.4300, -22.2818,
           2.1289,  32.1716,  11.0348,  25.2061,  12.7681, -36.9366,  -1.0287,
         -24.0626,   6.5162, -14.1568,  26.4261,  -0.5287,  16.0960,  20.7529,
         -20.9946, -27.2554,  23.9601,  24.5917, -35.4519,   0.2399,   7.1683,
           0.3806,  -4.1184,  18.4323, -20.9762, -20

In [48]:
F.pairwise_distance(rad2_emb, roma1_emb)

tensor([3.9091], device='cuda:0', grad_fn=<NormBackward1>)

0
