# Voice Recognition Security System
Interface for voice-based access control using ML classification.

In [1]:
import os
import torch
import torch.nn as nn
import librosa
import numpy as np
from PIL import Image
from IPython.display import Audio, display
from torchvision import transforms
import tkinter as tk
from tkinter import filedialog
from resample_audio_and_clear_of_noise import re_sample_audio, is_valid_wav_file
from torchvision.models import resnet18
from silence_removal import process_audio_file
from create_spectrogram import process_audio_file as specotgram_process
from df.enhance import enhance, init_df, load_audio, save_audio

# Constants
LOCATORS_SPEAKERS_LIST = ["f1", "f7", "f8", "m3", "m6", "m8"]
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODEL_PATH = 'trained_model9.pth'

# Initialize DeepFilter for noise reduction
model_df, df_state, _ = init_df()

# Transform for spectrogram processing
spec_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

  from torchaudio.backend.common import AudioMetaData


[32m2025-01-29 00:13:21[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on torch 2.5.1+cu124[0m
[32m2025-01-29 00:13:21[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on host rafal-IdeaPad-Gaming-3-15ACH6[0m
[32m2025-01-29 00:13:21[0m | [1mINFO    [0m | [36mDF[0m | [1mGit commit: 270ef51, branch: main[0m
[32m2025-01-29 00:13:21[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2025-01-29 00:13:21[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at /home/rafal/.cache/DeepFilterNet/DeepFilterNet3[0m
[32m2025-01-29 00:13:21[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2025-01-29 00:13:21[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint /home/rafal/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2025-01-29 00:13:22[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2025-01-29 00:13:2

  latest = torch.load(latest, map_location="cpu")


In [2]:
def initialize_model():
    """Initialize and load the ResNet18 model"""
    model = resnet18()
    model.fc = nn.Linear(model.fc.in_features, len(LOCATORS_SPEAKERS_LIST) + 1)
    state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()
    return model

# Initialize the classification model
classification_model = initialize_model()

  state_dict = torch.load(MODEL_PATH, map_location=DEVICE)


In [3]:
def delete_noise_for_file(audio_path, model, df_state):
    """Process and remove noise from a single audio file."""
    try:
        if not is_valid_wav_file(audio_path):
            print(f"Skipping invalid WAV file: {audio_path}")
            return
        
        audio, _ = load_audio(audio_path, sr=df_state.sr())
        enhanced = enhance(model, df_state, audio)
        
        enhanced_audio_path = audio_path.replace('.wav', '_enhanced.wav') 
        save_audio(enhanced_audio_path, enhanced, df_state.sr())
        
        print(f"Processed: {audio_path}")
        return enhanced_audio_path
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

def classify_segment_from_path(spectrogram_path, model):
    """Classify a spectrogram image from a file path using the ResNet model."""
    try:
        spec_image = Image.open(spectrogram_path).convert('RGB')
        spec_tensor = spec_transform(spec_image).unsqueeze(0).to(DEVICE)
        
        with torch.no_grad():
            output = model(spec_tensor)
            probs = torch.nn.functional.softmax(output, dim=1)
            pred_idx = torch.argmax(probs, dim=1).item()
            confidence = probs[0][pred_idx].item() * 100
        
        return pred_idx, confidence
    
    except Exception as e:
        print(f"Error during classification: {e}")
        return None, None

In [4]:
def process_file(file_path):
    """Process an audio file through the complete pipeline: enhance, split, and classify."""
    try:
        print("Starting audio processing...")
        
        # Resample the audio
        print("Resampling audio...")
        resampled_path = re_sample_audio(file_path)
        
        # Remove noise
        print("Removing noise...")
        enhanced_path = delete_noise_for_file(resampled_path, model_df, df_state)
        
        # Split into segments
        print("Splitting into segments...")
        audio_paths = process_audio_file(enhanced_path)
        
        # Process each segment
        authorized_count = 0
        total_segments = 0
        
        print("\nAnalyzing segments:")
        for audio_path in audio_paths:
            spectrogram_path = specotgram_process(audio_path, "temp")
            predicted_class, confidence = classify_segment_from_path(spectrogram_path, classification_model)
            
            if predicted_class is not None:
                total_segments += 1
                speaker = LOCATORS_SPEAKERS_LIST[predicted_class] if predicted_class < 6 else "Unauthorized"
                print(f"Segment {total_segments}: {speaker} (Confidence: {confidence:.2f}%)")
                
                if predicted_class < 6:
                    authorized_count += 1
        
        # Make final decision
        if total_segments > 0:
            print("\nAccess Decision:")
            if authorized_count > total_segments / 2:
                print("✅ ACCESS GRANTED")
            else:
                print("❌ ACCESS DENIED")
            print(f"Authorized segments: {authorized_count}/{total_segments}")
            
            # Play the original audio
            display(Audio(file_path))
        else:
            print("No valid segments found for analysis")
            
    except Exception as e:
        print(f"Error in processing: {str(e)}")

In [5]:
from ipywidgets import Button, Output

output = Output()
button = Button(description='Upload and Process WAV File')

def on_button_click(b):
    with output:
        output.clear_output()
        root = tk.Tk()
        root.withdraw()
        file_path = filedialog.askopenfilename(
            title='Select WAV File',
            filetypes=[('WAV files', '*.wav')]
        )
        if file_path:
            process_file(file_path)

button.on_click(on_button_click)
display(button, output)

Button(description='Upload and Process WAV File', style=ButtonStyle())

Output()