In [1]:
!pip install torch torchaudio pandas tqdm
!pip install -U gdown
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
!pip show torch

Name: torch
Version: 2.7.0+cu118
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: C:\Users\Samurai\miniconda3\Lib\site-packages
Requires: filelock, fsspec, jinja2, networkx, setuptools, sympy, typing-extensions
Required-by: asteroid-filterbanks, julius, lightning, nnAudio, pyannote.audio, pytorch-lightning, pytorch-metric-learning, speechbrain, torch-audiomentations, torch_pitch_shift, torchaudio, torchmetrics, torchvision


In [3]:
import os
import glob
import torch
import torchaudio
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [4]:
import torch
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.backends.cudnn.version())

11.8
True
90100


In [5]:
# Download data set
file_path = 'C:/Users/Samurai/'

In [6]:
import os
import zipfile
#
# def extract_archives(file_path):
#     archives = {
#         'voxconverse-master.zip': 'voxconverse-master/',
#         'voxconverse_dev_wav.zip': 'dev_wav/',
#         'voxconverse_test_wav.zip': 'test_wav/'
#     }
#
#     for archive_name, extract_folder in archives.items():
#         zip_path = os.path.join(file_path, archive_name)
#         extract_path = os.path.join(file_path, extract_folder)
#         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#             zip_ref.extractall(extract_path)
#
# extract_archives(file_path)

In [7]:
def read_rttm(rttm_path):
    cols = ['TYPE', 'FILE', 'CHANNEL', 'START', 'DURATION', 'NA1', 'NA2', 'SPEAKER', 'NA3', 'NA4']
    with open(rttm_path, 'r') as f:
        lines = [line.strip().split() for line in f if line.startswith('SPEAKER')]
    df = pd.DataFrame(lines, columns=cols)
    df['START'] = df['START'].astype(float)
    df['DURATION'] = df['DURATION'].astype(float)
    return df

In [8]:
def extract_segments(rttm_dir, wav_dir):
    rttm_files = sorted(glob.glob(os.path.join(rttm_dir, '*.rttm')))
    segments = []

    for rttm_path in tqdm(rttm_files):
        df = read_rttm(rttm_path)
        audio_id = os.path.splitext(os.path.basename(rttm_path))[0]
        audio_path = os.path.join(wav_dir, audio_id + '.wav')
        if not os.path.exists(audio_path):
            continue
        for _, row in df.iterrows():
            segments.append({
                'audio_path': audio_path,
                'start': row['START'],
                'end': row['START'] + row['DURATION'],
                'speaker': row['SPEAKER']
            })

    return pd.DataFrame(segments)


segments_df = extract_segments(
    file_path + 'voxconverse-master/voxconverse-master/dev',
    file_path + 'dev_wav/audio'
)

segments_df_test = extract_segments(
    file_path + 'voxconverse-master/voxconverse-master/test',
    file_path + 'test_wav/voxconverse_test_wav'
)

print(segments_df.head())
print(segments_df_test.head())

100%|██████████| 216/216 [00:01<00:00, 147.84it/s]
100%|██████████| 232/232 [00:01<00:00, 129.68it/s]

                                 audio_path   start     end speaker
0  C:/Users/Samurai/dev_wav/audio\abjxc.wav    0.40    7.04   spk00
1  C:/Users/Samurai/dev_wav/audio\abjxc.wav    8.68   64.64   spk00
2  C:/Users/Samurai/dev_wav/audio\afjiv.wav   41.12   80.48   spk00
3  C:/Users/Samurai/dev_wav/audio\afjiv.wav  140.64  141.64   spk01
4  C:/Users/Samurai/dev_wav/audio\afjiv.wav  142.20  144.32   spk01
                                          audio_path  start    end speaker
0  C:/Users/Samurai/test_wav/voxconverse_test_wav...   2.50   6.16   spk00
1  C:/Users/Samurai/test_wav/voxconverse_test_wav...   6.20  11.54   spk01
2  C:/Users/Samurai/test_wav/voxconverse_test_wav...  11.68  14.62   spk00
3  C:/Users/Samurai/test_wav/voxconverse_test_wav...  14.96  18.40   spk01
4  C:/Users/Samurai/test_wav/voxconverse_test_wav...  19.09  25.44   spk01





In [9]:
class VoxConverseSegmentDataset(Dataset):
    def __init__(self, segments, segment_duration=3.0, sample_rate=16000):
        self.segments = segments
        self.segment_duration = segment_duration
        self.sample_rate = sample_rate
        self.speaker_to_idx = {s: i for i, s in enumerate(sorted(segments['speaker'].unique()))}

    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        row = self.segments.iloc[idx]
        speaker_idx = self.speaker_to_idx[row['speaker']]
        audio_segment = self._load_segment(row['audio_path'], row['start'], row['end'])
        return audio_segment, speaker_idx

    def _load_segment(self, audio_path, start, end):
        waveform, sr = torchaudio.load(audio_path)
        if sr != self.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sample_rate)

        start_sample = int(start * self.sample_rate)
        end_sample = int(end * self.sample_rate)
        audio_segment = waveform[:, start_sample:end_sample]

        target_len = int(self.segment_duration * self.sample_rate)
        segment_len = audio_segment.shape[1]

        if segment_len < target_len:
            padding = target_len - segment_len
            audio_segment = torch.nn.functional.pad(audio_segment, (0, padding))
        else:
            audio_segment = audio_segment[:, :target_len]

        return audio_segment

In [10]:
def collate_fn(batch):
    audios, labels = zip(*batch)
    audios = torch.stack(audios)
    labels = torch.stack(labels)
    return audios, labels

In [11]:
dataset = VoxConverseSegmentDataset(segments_df)
loader = DataLoader(dataset, batch_size=4, pin_memory=True, shuffle=True)
batch = next(iter(loader))

In [12]:
BATCH_SIZE = 8

dataset_train = VoxConverseSegmentDataset(segments_df)
train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, collate_fn = collate_fn, pin_memory=True, shuffle=True)

dataset_test = VoxConverseSegmentDataset(segments_df_test)
test_loader = DataLoader(dataset_test, batch_size=BATCH_SIZE, pin_memory=True, shuffle=False)


In [13]:
import torch.nn as nn

class SimpleSpeakerClassifier(nn.Module):
    def __init__(self, input_size, num_speakers, hidden_size=512):
        super().__init__()
        self.flatten = nn.Flatten()
        self.classifier = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_speakers)
        )

    def forward(self, x):
        x = self.flatten(x)
        return self.classifier(x)

In [14]:
num_speakers = len(dataset_train.speaker_to_idx)
input_size = 16000 * 3
model = SimpleSpeakerClassifier(input_size=input_size, num_speakers=num_speakers)

In [15]:
import torch.optim as optim

print(f"Dataset size: {len(train_loader.dataset)}")
print(f"Number of batches: {len(train_loader)}")

device = torch.device("cuda")
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch_idx, (audio_batch, labels) in enumerate(tqdm(train_loader)):
        audio_batch = audio_batch.to(device).squeeze(1)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(audio_batch)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if batch_idx % 50 == 0:
            print(f"[Epoch {epoch} | Batch {batch_idx}] Loss: {loss.item():.4f}")

    avg_loss = running_loss / len(train_loader)
    print(f"[Epoch {epoch}] Average Loss: {avg_loss:.4f}")


Dataset size: 8268
Number of batches: 1034


  0%|          | 0/1034 [00:00<?, ?it/s]


TypeError: expected Tensor as element 0 in argument 0, but got int

In [15]:
torch.save(model.state_dict(), 'speaker_model.pth')

In [16]:
model.eval()
total_samples = 0
correct_predictions = 0
total_loss = 0.0

with torch.no_grad():
    for audio_batch, labels in tqdm(test_loader):
        audio_batch = audio_batch.to(device).squeeze(1)
        labels = labels.to(device)

        outputs = model(audio_batch)

        predictions = outputs.argmax(dim=1)
        correct_predictions += (predictions == labels.argmax(dim=1)).sum().item()
        total_samples += labels.size(0)

accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0

print(f"Test Accuracy: {accuracy:.4f}")


100%|██████████| 2435/2435 [00:15<00:00, 155.50it/s]

Test Accuracy: 0.2140





In [17]:
model.eval()
speaker_embeddings = []
total_batches = len(test_loader)

with torch.no_grad():
    for batch_idx, (audio_batch, labels) in enumerate(test_loader):
        print(f"Processing batch {batch_idx + 1}/{total_batches}...", end="\r")

        audio_batch = audio_batch.to(device).squeeze(1)
        emb = model(audio_batch, return_embedding=True)
        speaker_embeddings.append(emb.cpu())

speaker_embeddings = torch.cat(speaker_embeddings, dim=0)
print(f"Extracted {speaker_embeddings.shape[0]} embeddings")


Extracted 19478 embeddings...


In [18]:
import torch.nn.functional as F

e1 = speaker_embeddings[0]
e2 = speaker_embeddings[1]

similarity = F.cosine_similarity(e1.unsqueeze(0), e2.unsqueeze(0)).item()

print(f"Cosine similarity: {similarity:.4f}")


Cosine similarity: 0.8387
