# Voice Recognition Security System
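Notebook interface for voice-based access control: a selected WAV recording is denoised, split into non-silent segments, and each sufficiently long segment is classified by a ResNet-18 speaker model. Access is granted when the majority of classified segments are attributed to an authorized speaker.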

In [None]:
import tkinter as tk
from tkinter import filedialog
import torch
import torch.nn as nn
from torchvision.models import resnet18
from IPython.display import Audio, display
import os
import numpy as np
import librosa
from PIL import Image
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader
from df.enhance import init_df, enhance
from resample_audio_and_clear_of_noise import re_sample_audio, is_valid_wav_file

# Labels of the authorized speakers. process_file() indexes this list with the
# predicted class index, so entry i is the label reported for class i.
LOCATORS_SPEAKERS_LIST = ["f1", "f7", "f8", "m3", "m6", "m8"]

# Initialize models
# init_df() comes from the `df` speech-enhancement package (presumably
# DeepFilterNet - confirm). It returns three values; the third is unused here.
model_df, df_state, _ = init_df()

# Load classification model
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# ResNet-18 created without pretrained weights; its final fully connected layer
# is replaced by a 20-output layer before the checkpoint is loaded.
model = resnet18()
# 20 is presumably the class count the checkpoint was trained with - the state
# dict will fail to load if it does not match.
model.fc = nn.Linear(model.fc.in_features, 20)  # Original number of classes
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. map_location remaps stored tensors onto `device`.
model.load_state_dict(torch.load('trained_model3.pth', map_location=device))
model.to(device)
# Switch to evaluation mode for inference (affects layers such as batch norm).
model.eval()

In [None]:
def _segment_to_tensor(segment):
    """Convert one audio segment into a classifier input tensor.

    The segment's STFT magnitude is converted to dB, min-max normalised to
    [0, 1], resized to 224x224 and replicated to three channels.

    Args:
        segment: 1-D array of audio samples.

    Returns:
        A tensor of shape (1, 3, 224, 224), or ``None`` when the spectrogram
        has no dynamic range (constant or non-finite values), in which case
        min-max normalisation is undefined.
    """
    spec = librosa.stft(segment)
    spec_db = librosa.amplitude_to_db(np.abs(spec))

    db_min = spec_db.min()
    db_range = spec_db.max() - db_min
    # A flat or NaN spectrogram would make the division below produce NaN/inf,
    # which the classifier would silently turn into an arbitrary prediction.
    if not np.isfinite(db_range) or db_range <= 0:
        return None
    spec_norm = (spec_db - db_min) / db_range

    # Convert to a PIL image so it can be resized to the fixed input size.
    spec_image = Image.fromarray(spec_norm).resize((224, 224))
    spec_tensor = transforms.ToTensor()(spec_image)
    # Single channel -> 3 channels, then add the batch dimension.
    return spec_tensor.repeat(3, 1, 1).unsqueeze(0)


def process_file(file_path):
    """Process an audio file and print an access decision for the speaker.

    The file is validated, resampled, denoised, and split into non-silent
    segments. Every segment of at least 4 seconds is classified; a segment
    counts as authorized when its predicted class index falls inside
    ``LOCATORS_SPEAKERS_LIST``. Access is granted when strictly more than half
    of the classified segments are authorized. If no segment could be
    classified, access is denied explicitly (fail closed).

    All results are printed; errors are caught and printed rather than raised.

    Args:
        file_path: Path of the WAV file to check.

    Returns:
        None.
    """
    try:
        if not is_valid_wav_file(file_path):
            print("Invalid WAV file")
            return

        # Project helper; its return value is not used and the same path is
        # re-read below, so it presumably resamples the file in place to
        # 48 kHz - TODO confirm.
        re_sample_audio(file_path)

        # Load at 48 kHz (so sr == 48000) and denoise with the enhancement model.
        audio, sr = librosa.load(file_path, sr=48000)
        audio_tensor = torch.from_numpy(audio).float().unsqueeze(0)
        enhanced = enhance(model_df, df_state, audio_tensor)
        enhanced_numpy = enhanced.squeeze(0).cpu().numpy()

        # Non-silent intervals: anything more than 60 dB below the peak is
        # treated as silence. Each row is a (start, end) sample index pair.
        segments = librosa.effects.split(enhanced_numpy, top_db=60)
        print(f"Found {len(segments)} segments")

        # Class indices below this bound map to an entry of
        # LOCATORS_SPEAKERS_LIST; deriving it from the list keeps the
        # authorization test and the label lookup in sync.
        authorized_class_count = len(LOCATORS_SPEAKERS_LIST)
        authorized_count = 0
        processed_segments = 0

        for seg_no, (start, end) in enumerate(segments, start=1):
            segment = enhanced_numpy[start:end]
            if len(segment) / sr < 4:  # Skip segments shorter than 4 seconds
                continue

            try:
                spec_tensor = _segment_to_tensor(segment)
                if spec_tensor is None:
                    print(f"Segment {seg_no}: skipped (spectrogram has no dynamic range)")
                    continue

                with torch.no_grad():
                    output = model(spec_tensor.to(device))
                    probs = torch.nn.functional.softmax(output, dim=1)
                    # Arg-max over the class dimension (batch size is 1).
                    pred_idx = output.argmax(dim=1).item()
                    confidence = probs[0][pred_idx].item() * 100

                if pred_idx < authorized_class_count:
                    authorized_count += 1
                    speaker = LOCATORS_SPEAKERS_LIST[pred_idx]
                else:
                    speaker = f"unauthorized (class {pred_idx})"

                print(f"Segment {seg_no}: {speaker} (Confidence: {confidence:.2f}%)")
                processed_segments += 1

            except Exception as e:
                # Best effort: one bad segment must not abort the whole file.
                print(f"Error processing segment: {str(e)}")
                continue

        if processed_segments == 0:
            # Fail closed: without any classified segment there is no evidence
            # of an authorized speaker, so say so instead of ending silently.
            print("\nAccess Decision:")
            print("❌ ACCESS DENIED")
            print("No segment of at least 4 seconds could be classified")
            return

        # Strict majority of the classified segments must be authorized.
        final_authorized = authorized_count > processed_segments / 2
        print("\nAccess Decision:")
        print("✅ ACCESS GRANTED" if final_authorized else "❌ ACCESS DENIED")
        print(f"Authorized segments: {authorized_count}/{processed_segments}")

        display(Audio(file_path))

    except Exception as e:
        # Top-level boundary for the notebook UI: report instead of raising.
        print(f"Error processing file: {str(e)}")

In [None]:
from ipywidgets import Button, Output

# Output widget: the click handler runs inside `with output:` so that everything
# printed or displayed while processing a file is shown in this area.
output = Output()
# Button the user clicks to choose a WAV file; its handler is registered below.
button = Button(description='Upload and Process WAV File')

def on_button_click(b):
    """Handle a button click: ask for a WAV file and process it.

    Clears the output area, opens a native file-selection dialog restricted to
    ``*.wav`` files and, if a file was chosen, passes its path to
    ``process_file``. Cancelling the dialog does nothing.

    Args:
        b: The object passed by ``button.on_click``; not used.
    """
    with output:
        output.clear_output()
        # The file dialog needs a Tk root window; it is hidden right away so
        # only the dialog itself is shown.
        root = tk.Tk()
        try:
            root.withdraw()
            file_path = filedialog.askopenfilename(
                title='Select WAV File',
                filetypes=[('WAV files', '*.wav')]
            )
        finally:
            # Destroy the hidden root; otherwise every click leaves another
            # Tk instance alive for the lifetime of the kernel.
            root.destroy()
        if file_path:
            process_file(file_path)

# Register the click handler, then render the button followed by the output area.
button.on_click(on_button_click)
display(button, output)