# Voice Recognition Security System
Interface for voice-based access control using ML classification.

In [2]:
import tkinter as tk
from tkinter import filedialog
import librosa
import soundfile as sf
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import resnet18
import numpy as np
from IPython.display import Audio, display
import os
from df.enhance import enhance, init_df, load_audio, save_audio

# Initialize DeepFilterNet once for noise reduction; `process_audio` reuses
# `model_df` and `df_state` on every call. The third value returned by
# init_df() is not needed in this notebook and is discarded.
model_df, df_state, _ = init_df()

[32m2024-11-25 17:49:11[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on torch 2.5.1+cu124[0m
[32m2024-11-25 17:49:11[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on host rafal-IdeaPad-Gaming-3-15ACH6[0m
[32m2024-11-25 17:49:11[0m | [1mINFO    [0m | [36mDF[0m | [1mGit commit: 89ba0c3, branch: feature/add-audio-upload-interface[0m
[32m2024-11-25 17:49:11[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2024-11-25 17:49:11[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at /home/rafal/.cache/DeepFilterNet/DeepFilterNet3[0m
[32m2024-11-25 17:49:11[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m


  from torchaudio.backend.common import AudioMetaData


[32m2024-11-25 17:49:11[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint /home/rafal/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2024-11-25 17:49:12[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2024-11-25 17:49:12[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


  latest = torch.load(latest, map_location="cpu")


In [3]:
from ipywidgets import Button, Output, VBox
from IPython.display import display

In [None]:
def process_audio(audio_path, min_segment_length=4):
    """Denoise a recording and split it into sufficiently long non-silent segments.

    The file is loaded with librosa at 48 kHz, passed through the
    module-level DeepFilterNet model (`model_df` / `df_state`) and then split
    on silence (anything more than 60 dB below the peak).

    Args:
        audio_path: Path of the audio file to load.
        min_segment_length: Minimum duration, in seconds, a non-silent
            segment must have to be kept.

    Returns:
        A tuple `(segments, sr)`: `segments` is a list (possibly empty) of
        1-D NumPy arrays holding the enhanced audio of each kept segment,
        and `sr` is the sample rate the audio was loaded at.
    """
    # NOTE(review): 48 kHz presumably matches df_state.sr() for the loaded
    # DeepFilterNet model - confirm if a different model is ever used.
    audio, sr = librosa.load(audio_path, sr=48000)

    # `enhance` works on torch tensors shaped [channels, samples], while
    # librosa returns a 1-D NumPy array: add a channel axis on the way in and
    # drop it again on the way out so the librosa calls below get NumPy data.
    audio_tensor = torch.from_numpy(audio).unsqueeze(0)
    enhanced = enhance(model_df, df_state, audio_tensor)
    enhanced = enhanced.squeeze(0).detach().cpu().numpy()

    intervals = librosa.effects.split(enhanced, top_db=60)

    # Keep only the segments that are long enough to classify.
    segments = [
        enhanced[start:end]
        for start, end in intervals
        if (end - start) / sr >= min_segment_length
    ]
    return segments, sr

def create_spectrogram(audio, sr):
    """Return the decibel-scaled magnitude spectrogram of `audio`.

    Args:
        audio: Audio samples to transform with a short-time Fourier transform.
        sr: Sample rate of `audio`. Accepted for interface compatibility;
            the computation itself does not use it.

    Returns:
        The STFT magnitudes of `audio` converted to decibels.
    """
    magnitudes = np.abs(librosa.stft(audio))
    return librosa.amplitude_to_db(magnitudes)

def prepare_for_model(spectrogram):
    """Convert a spectrogram into a normalized image tensor with a batch axis.

    The spectrogram is min-max scaled to the 0-255 range, turned into a PIL
    image, resized to 224x224, converted back to a float tensor and
    normalized with a single-channel mean/std.

    Args:
        spectrogram: NumPy array such as the one returned by
            `create_spectrogram` (assumed 2-D, frequency x time - TODO
            confirm if other callers appear).

    Returns:
        A float tensor with a leading batch dimension of size 1. For a 2-D
        input this is a single-channel image, i.e. shape (1, 1, 224, 224).
    """
    # NOTE(review): 0.485 / 0.229 look like the ImageNet red-channel
    # statistics - confirm they match what was used during training.
    # NOTE(review): torchvision's stock resnet18 stem takes 3 input channels;
    # callers feeding this single-channel tensor to such a model must
    # replicate the channel first.
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485], std=[0.229])
    ])

    spec = torch.from_numpy(spectrogram)
    low = spec.min()
    span = spec.max() - low
    if span > 0:
        spec = (spec - low) / span
    else:
        # A constant spectrogram has zero range; dividing would yield NaNs
        # whose uint8 cast is undefined, so map it to an all-zero image.
        spec = torch.zeros_like(spec)
    spec = (spec * 255).type(torch.uint8)
    return transform(spec).unsqueeze(0)

In [5]:
def load_model(model_path='trained_model3.pth', num_classes=20):
    """Build a ResNet-18 classifier and load trained weights into it.

    Args:
        model_path: Path to the saved state dict.
        num_classes: Number of output classes of the final linear layer; it
            must match the checkpoint being loaded.

    Returns:
        The model in evaluation mode, with its weights on the CPU.

    Raises:
        FileNotFoundError: If `model_path` does not exist.
        RuntimeError: If the checkpoint does not match the architecture.
    """
    model = resnet18()
    model.fc = nn.Linear(model.fc.in_features, num_classes)

    # map_location='cpu' lets a checkpoint saved on a GPU load on machines
    # without CUDA; the model is only ever run on the CPU in this notebook.
    state_dict = torch.load(model_path, map_location='cpu', weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Load the classifier once; `process_uploaded_file` reads this module-level
# `model` for every segment it classifies.
model = load_model()
print("Model loaded successfully!")

Model loaded successfully!


In [6]:
def process_uploaded_file(file_path, authorized_classes=(0, 1, 2, 3, 4, 5)):
    """Classify every valid segment of an audio file and print the access decision.

    Each segment is converted to a spectrogram and classified by the
    module-level `model`. Access is granted when strictly more than half of
    the segments are predicted as one of `authorized_classes`. The reported
    confidence is the mean top-class probability over all segments.

    Any exception raised while processing is caught and reported with a
    printed message, so the notebook UI keeps working after a bad file.

    Args:
        file_path: Path of the audio file to evaluate.
        authorized_classes: Class indices that count as authorized speakers.

    Returns:
        None. Results are printed and the audio is shown as a player widget.
    """
    try:
        segments, sr = process_audio(file_path)
        print(f"Found {len(segments)} valid segments")

        authorized = set(authorized_classes)
        # The network's first convolution fixes how many input channels it accepts.
        expected_channels = model.conv1.in_channels

        authorized_count = 0
        confidences = []
        for segment in segments:
            spec = create_spectrogram(segment, sr)
            model_input = prepare_for_model(spec)

            # The spectrogram image has one channel while a stock ResNet-18
            # expects three, so replicate the channel when they disagree.
            # NOTE(review): this assumes training used the same grayscale
            # image copied across channels - confirm against the training
            # pipeline.
            if model_input.shape[1] == 1 and expected_channels > 1:
                model_input = model_input.expand(-1, expected_channels, -1, -1)

            with torch.no_grad():
                output = model(model_input)
            probs = torch.nn.functional.softmax(output, dim=1)
            predicted = int(torch.argmax(probs, dim=1).item())
            confidences.append(probs[0, predicted].item() * 100)

            if predicted in authorized:
                authorized_count += 1

        # Majority vote; with no segments this is 0 > 0, i.e. access denied.
        final_authorized = authorized_count > len(segments) / 2
        print("\nAccess Decision:")
        print("✅ ACCESS GRANTED" if final_authorized else "❌ ACCESS DENIED")
        if confidences:
            confidence = sum(confidences) / len(confidences)
            print(f"Confidence: {confidence:.2f}%")
        else:
            # Nothing was classified, so there is no confidence to report.
            print("Confidence: n/a (no segment was long enough to classify)")

        display(Audio(file_path))

    except Exception as e:
        print(f"Error processing file: {e}")

In [None]:
def upload_and_process():
    """Ask the user for a WAV file and process it.

    Opens a native file dialog restricted to `*.wav` files. If the user
    picks a file it is handed to `process_uploaded_file`; cancelling the
    dialog does nothing.
    """
    # A Tk root is required for the dialog; keep it hidden.
    root = tk.Tk()
    try:
        root.withdraw()
        file_path = filedialog.askopenfilename(
            title='Select WAV File',
            filetypes=[('WAV files', '*.wav')]
        )
    finally:
        # Destroy the hidden root so repeated clicks do not pile up Tk
        # instances for the lifetime of the kernel.
        root.destroy()

    if file_path:
        process_uploaded_file(file_path)

# Build the notebook UI: a button that starts the upload flow and an output
# area that collects everything printed while a file is processed.
button = Button(description='Upload and Process WAV File')
output = Output()


def on_button_click(b):
    """Clear the previous results and run the upload flow inside `output`."""
    with output:
        output.clear_output()
        upload_and_process()


# Wire the handler to the button, then render both widgets.
button.on_click(on_button_click)
display(button, output)

Button(description='Upload and Process WAV File', style=ButtonStyle())

Output()