<a href="https://colab.research.google.com/github/Sazim2019331087/voice_model/blob/main/customized_voice_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing necessary libraries

In [95]:
!pip install torch torchaudio pandas scikit-learn pydub



# Importing libraries

In [96]:
import os
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import Dataset, DataLoader, random_split

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from pydub import AudioSegment
from google.colab import files

# Loading Speaker Details

In [97]:
csv_file = "main_data.csv"
df = pd.read_csv(csv_file)

# Extracting MFCC features

In [98]:
import torch.nn.functional as F

def extract_mfcc(file_path, n_mfcc=13, n_mels=40, fixed_length=100):
    try:
        waveform, sample_rate = torchaudio.load(file_path)

        # Convert to mono if multi-channel
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Handle empty audio files
        if waveform.shape[1] == 0:
            print(f"Warning: Empty audio file {file_path}")
            return torch.zeros(n_mfcc, fixed_length)

        # Extract MFCC features
        mfcc = torchaudio.transforms.MFCC(
            sample_rate=sample_rate,
            n_mfcc=n_mfcc,
            melkwargs={'n_mels': n_mels}
        )(waveform)

        # Convert shape from (1, n_mfcc, time_steps) → (n_mfcc, time_steps)
        mfcc = mfcc.squeeze(0)

        # Fix shape: Padding or truncation
        if mfcc.shape[1] < fixed_length:
            mfcc = F.pad(mfcc, (0, fixed_length - mfcc.shape[1]), "constant", 0)
        else:
            mfcc = mfcc[:, :fixed_length]

        return mfcc

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return torch.zeros(n_mfcc, fixed_length)  # Return zeros if error occurs

# Applying Label Encoding for speakers

In [99]:
label_encoder = LabelEncoder()
df['speaker_id'] = label_encoder.fit_transform(df['email'])

# Saving Label Encodings

In [100]:
torch.save(label_encoder, "label_encoder.pth")

# Creating Custom Dataset of Speakers

In [101]:
class SpeakerDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.labels = torch.tensor(dataframe['speaker_id'].values, dtype=torch.long)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        audio_file = os.path.join("voices", self.dataframe.iloc[idx]["audio_file"])  # Fix column name
        mfcc = extract_mfcc(audio_file)  # Extract MFCC features
        return mfcc, self.labels[idx]  # Do not flatten, let collate_batch handle padding

# Creation of dataset

In [102]:
dataset = SpeakerDataset(df)

# Spliting Dataset into Train & Test

In [103]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Creating Custom Collating Batch for Padding

In [114]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    features, labels = zip(*batch)

    # Convert features to tensors and pad sequences
    features = [torch.tensor(f, dtype=torch.float32) for f in features]
    features_padded = pad_sequence(features, batch_first=True, padding_value=0)

    # Flatten the feature dimension to match model input (batch_size, time_steps * n_mfcc)
    features_padded = features_padded.view(features_padded.shape[0], -1)

    labels = torch.tensor(labels, dtype=torch.long)
    return features_padded, labels

# Creating Data Loaders

In [116]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [117]:
sample_features, _ = next(iter(train_loader))  # Get a batch of data
input_size = sample_features.shape[-1]  # Extract feature dimension (MFCC coefficients)
num_classes = len(df["speaker_id"].unique())  # Get number of unique speakers
print(f"Updated input_size: {input_size}, Number of classes: {num_classes}")

Updated input_size: 1300, Number of classes: 38


  features = [torch.tensor(f, dtype=torch.float32) for f in features]


# Defining Neural Network Model

In [118]:
import torch.nn as nn

class SpeakerClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SpeakerClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Getting Input Size from sample

In [119]:
sample_mfcc, _ = dataset[0]
input_size = sample_mfcc.numel()  # Fix input_size calculation
num_classes = len(df['speaker_id'].unique())

# Model Training

In [126]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SpeakerClassifier(input_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for features, labels in train_loader:
        features = features.to(device).float()
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = model(features)

        # Debugging Check
        print(f"Output Shape: {outputs.shape}, Label Shape: {labels.shape}")

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

  features = [torch.tensor(f, dtype=torch.float32) for f in features]


Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [1/30], Loss: 44.9646
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [2/30], Loss: 50.4056
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [3/30], Loss: 37.4692
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [4/30], Loss: 32.3845
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [5/30], Loss: 27.2090
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [6/30], Loss: 19.6908
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [7/30], Loss: 16.0204
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [8/30], Loss: 12.5201
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [9/30], Loss: 10.9184
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size([30])
Epoch [10/30], Loss: 9.6930
Output Shape: torch.Size([30, 38]), Label Shape: torch.Size(

# Saving the trained Model

In [130]:
# Define the directory where "main_data.csv" is stored
DATA_PATH = "/content"  # Same directory as main_data.csv

# Ensure the directory exists
os.makedirs(DATA_PATH, exist_ok=True)

# Save model
model_path = os.path.join(DATA_PATH, 'speaker_classifier.pth')
torch.save(model.state_dict(), model_path)

# Load model (if needed)
model.load_state_dict(torch.load(model_path))
model.to(device)
#model.eval()

print(f"Model saved at: {model_path}")

Model saved at: /content/speaker_classifier.pth


  model.load_state_dict(torch.load(model_path))


# Model Evaluation

In [131]:
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for features, labels in test_loader:
        features = torch.stack([torch.tensor(f, dtype=torch.float32) for f in features]).to(device)
        labels = labels.to(device).long()

        outputs = model(features)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 0.00%


  features = [torch.tensor(f, dtype=torch.float32) for f in features]
  features = torch.stack([torch.tensor(f, dtype=torch.float32) for f in features]).to(device)


# Speaker Predictions

In [139]:
def predict_speaker(file_path, model, label_encoder, fixed_length=1300, device="cuda" if torch.cuda.is_available() else "cpu"):
    model.to(device)
    model.eval()  # Set model to evaluation mode

    try:
        # Extract MFCC using your existing function
        mfcc = extract_mfcc(file_path).to(device)  # Move to correct device

        # Flatten MFCC to match model input
        mfcc = mfcc.view(1, -1)  # Shape: (1, feature_size)

        # Ensure fixed input size (truncate or pad)
        current_size = mfcc.shape[1]
        if current_size < fixed_length:
            pad_size = fixed_length - current_size
            mfcc = F.pad(mfcc, (0, pad_size), "constant", 0)  # Pad with zeros
        else:
            mfcc = mfcc[:, :fixed_length]  # Truncate

        # Predict speaker
        with torch.no_grad():
            output = model(mfcc)
            _, predicted = torch.max(output, 1)

        # Decode predicted label
        speaker_email = label_encoder.inverse_transform([predicted.item()])[0]

        print(f"Predicted Speaker: {speaker_email}")
        return speaker_email

    except Exception as e:
        print(f"Error predicting speaker: {e}")
        return None  # Return None if prediction fails


# Testing

In [145]:
# Example usage
query_file = os.path.join("/content", "6759b4871227a.wav")  # Ensure correct path
predict_speaker("/content/voices/6759b4871227a.wav", model, label_encoder)

Predicted Speaker: royt26850@gmail.com




'royt26850@gmail.com'