In [2]:
# Import the necessary libraries
import numpy as np
import librosa
import sounddevice as sd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import Dataset, DataLoader

In [3]:
#This class handles audio feature extraction and real-time audio recording.
class AudioProcessor:
    """Converts raw audio into MFCC-based features and records from the microphone.

    Args:
        sample_rate: Sampling rate in Hz used for both feature extraction and
            recording. Defaults to 16000.
    """

    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def extract_features(self, audio):
        """Compute 20 MFCCs plus their first- and second-order deltas.

        Args:
            audio: Waveform handed to ``librosa.feature.mfcc`` as ``y``.

        Returns:
            The MFCC, delta and delta-delta arrays concatenated, in that
            order, along the first axis.
        """
        base = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=20)
        # order=1 is the change over time, order=2 the change of that change.
        derivatives = [librosa.feature.delta(base, order=k) for k in (1, 2)]
        return np.concatenate([base, *derivatives], axis=0)

    def record_audio(self, duration=5):
        """Record ``duration`` seconds of single-channel audio.

        Args:
            duration: Recording length in seconds. Defaults to 5.

        Returns:
            The recording flattened to one dimension.
        """
        n_frames = int(duration * self.sample_rate)
        captured = sd.rec(n_frames, samplerate=self.sample_rate, channels=1)
        sd.wait()  # block until the capture has finished
        return captured.flatten()

In [4]:
class SpeakerDataset(Dataset):
    """In-memory dataset pairing one feature array with one integer label.

    Args:
        features: Sequence of equally-shaped numeric arrays (or a single array
            whose first axis indexes samples). Stored as a float32 tensor.
        labels: Sequence of integer class labels, one per entry in
            ``features``. Stored as an int64 tensor.

    Raises:
        ValueError: If the feature arrays cannot be stacked into one
            rectangular array, or if ``features`` and ``labels`` differ in
            length.
    """

    def __init__(self, features, labels):
        # Stack through NumPy first: handing a Python list of ndarrays straight
        # to a tensor constructor converts it element by element, which is
        # very slow. np.array copies, so the tensors never alias caller data.
        self.features = torch.from_numpy(np.array(features, dtype=np.float32))
        self.labels = torch.from_numpy(np.array(labels, dtype=np.int64))

        # __len__ is driven by the labels, so a mismatch would otherwise
        # surface later as silently dropped samples or an IndexError.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.features[idx], self.labels[idx]

In [5]:
class SpeakerVerificationModel(nn.Module):
    """Single-layer LSTM followed by a linear layer producing two logits.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Width of the LSTM hidden state. Defaults to 128.
    """

    def __init__(self, input_size, hidden_size=128):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # Two outputs for the binary target / non-target decision.
        self.fc = nn.Linear(in_features=hidden_size, out_features=2)

    def forward(self, x):
        """Return logits computed from the LSTM output at the last timestep."""
        sequence_states, _ = self.lstm(x)
        final_state = sequence_states[:, -1, :]
        return self.fc(final_state)

In [6]:
class SpeakerVerifier:
    """Trains a binary target / non-target speaker classifier and applies it.

    Args:
        sample_rate: Sampling rate in Hz passed to the audio processor.
            Defaults to 16000.
    """

    def __init__(self, sample_rate=16000):
        # Feature extractor / recorder shared by training and inference.
        self.processor = AudioProcessor(sample_rate)
        # Set by train(); None means no model is available yet.
        self.model = None

    def _require_model(self):
        """Raise a clear error when inference is attempted before train()."""
        if self.model is None:
            raise RuntimeError(
                "SpeakerVerifier has no trained model; call train() first."
            )

    def _sequence_features(self, audio):
        """Extract features for one recording, transposed so the feature axis is last."""
        return self.processor.extract_features(audio).T

    def train(self, target_recordings, non_target_recordings, epochs=50):
        """Fit a fresh model on labelled recordings.

        Args:
            target_recordings: Iterable of waveforms labelled 1 (target).
            non_target_recordings: Iterable of waveforms labelled 0.
            epochs: Number of passes over the data. Defaults to 50.

        Raises:
            ValueError: If no recordings are supplied, or if the recordings do
                not all produce feature matrices of the same shape (they are
                stacked into a single tensor, so lengths must match).
        """
        # Targets first, then non-targets, each paired with its class label.
        labelled = [(audio, 1) for audio in target_recordings]
        labelled.extend((audio, 0) for audio in non_target_recordings)
        if not labelled:
            raise ValueError("train() requires at least one recording.")

        features = [self._sequence_features(audio) for audio, _ in labelled]
        labels = [label for _, label in labelled]

        # Fail early with an actionable message instead of an opaque stacking
        # error from the tensor constructor.
        if len({feat.shape for feat in features}) > 1:
            raise ValueError(
                "All recordings must yield feature matrices of the same shape; "
                "trim or pad the audio to a common length before training."
            )

        # Create dataset and dataloader
        dataset = SpeakerDataset(features, labels)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

        # The last axis of each (transposed) feature matrix is the model input width.
        input_size = features[0].shape[1]
        self.model = SpeakerVerificationModel(input_size)

        # Training setup
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters())

        # Training loop
        self.model.train()
        for epoch in range(epochs):
            total_loss = 0.0
            for batch_features, batch_labels in dataloader:
                optimizer.zero_grad()
                outputs = self.model(batch_features)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            if (epoch + 1) % 10 == 0:  # Print loss every 10 epochs
                print(f'Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}')

    def verify_speaker(self, audio, threshold=0.5):
        """Decide whether ``audio`` belongs to the target speaker.

        Args:
            audio: Waveform to classify.
            threshold: The speaker is accepted when the target-class
                probability is strictly greater than this value.

        Returns:
            Tuple ``(is_target, probability)`` where ``probability`` is the
            softmax probability of class 1.

        Raises:
            RuntimeError: If called before ``train()``.
        """
        self._require_model()
        self.model.eval()
        with torch.no_grad():
            features = self._sequence_features(audio)
            # Add a leading batch axis of size 1.
            batch = torch.FloatTensor(features).unsqueeze(0)
            output = torch.softmax(self.model(batch), dim=1)
            probability = output[0][1].item()  # Probability of being target speaker
            return probability > threshold, probability

    def real_time_verification(self, duration=5):
        """Interactively record clips and print the verification result.

        Loops until the user enters 'q'. Each iteration records ``duration``
        seconds of audio and classifies it.

        Raises:
            RuntimeError: If called before ``train()``.
        """
        # Check up front so the user is not asked to record a clip that can
        # never be classified.
        self._require_model()

        print("Press 'q' and hit Enter to quit real-time verification.")
        while True:
            # Prompt the user to continue or exit
            user_input = input("Press Enter to record, or 'q' to quit: ").strip().lower()
            if user_input == 'q':
                print("Exiting real-time verification.")
                break

            print("Recording...")
            audio = self.processor.record_audio(duration)
            is_target, confidence = self.verify_speaker(audio)
            result = "Target" if is_target else "Non-target"
            print(f"Speaker: {result} (confidence: {confidence:.2f})")


In [7]:
def evaluate_system(verifier, test_target, test_non_target):
    """Score a verifier on held-out target and non-target recordings.

    Args:
        verifier: Object whose ``verify_speaker(audio)`` returns a pair whose
            first element is truthy when the audio is accepted as the target.
        test_target: Iterable of recordings whose true label is 1.
        test_non_target: Iterable of recordings whose true label is 0.

    Returns:
        Dict with keys ``'accuracy'`` and ``'f1'`` holding the corresponding
        scores of the predictions against the true labels.
    """
    expected = []
    predicted = []

    # Targets are scored first, then non-targets.
    for label, recordings in ((1, test_target), (0, test_non_target)):
        for audio in recordings:
            accepted, _ = verifier.verify_speaker(audio)
            expected.append(label)
            predicted.append(int(bool(accepted)))

    accuracy = accuracy_score(expected, predicted)
    f1 = f1_score(expected, predicted)
    return {'accuracy': accuracy, 'f1': f1}

In [9]:
if __name__ == "__main__":
    # Demo driver: trains on synthetic noise, then starts the interactive
    # microphone loop. Replace the synthetic clips with real recordings for
    # actual use.

    # Seed both RNGs so the synthetic clips (NumPy) and the weight
    # initialisation / DataLoader shuffling (torch) are repeatable from a
    # fresh kernel; without this every run prints different losses.
    SEED = 42
    rng = np.random.default_rng(SEED)
    torch.manual_seed(SEED)

    SAMPLE_RATE = 16000   # Hz
    CLIP_SECONDS = 5      # length of every synthetic and recorded clip
    CLIPS_PER_CLASS = 10

    verifier = SpeakerVerifier(SAMPLE_RATE)

    # Simulated training data (replace with real data for actual use)
    clip_samples = SAMPLE_RATE * CLIP_SECONDS
    target_recordings = [rng.standard_normal(clip_samples) for _ in range(CLIPS_PER_CLASS)]
    non_target_recordings = [rng.standard_normal(clip_samples) for _ in range(CLIPS_PER_CLASS)]

    # Train the model
    verifier.train(target_recordings, non_target_recordings)

    # Start real-time verification (comment this out if not testing real-time)
    verifier.real_time_verification(CLIP_SECONDS)

Epoch 10, Loss: 0.6750
Epoch 20, Loss: 0.6198
Epoch 30, Loss: 0.5007
Epoch 40, Loss: 0.3295
Epoch 50, Loss: 0.1962
Press 'q' and hit Enter to quit real-time verification.
Recording...
Speaker: Non-target (confidence: 0.46)
Exiting real-time verification.
