## Setup

In [10]:
!pip install faster-whisper pyttsx3 pyaudio ollama

^C


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.0%2Bcu118-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp313-cp313-win_amd64.whl.metadata (29 kB)
Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp313-cp313-win_amd64.whl (5.5 MB)
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   - -------------------------------------- 0.3/5.5 MB ? eta -:--:--
   --------------- ------------------------ 2.1/5.5 MB 7.1 MB/s eta 0:00:01
   ---------------------------------------- 5.5/5.5 MB 11.2 MB/s eta 0:00:00
Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp313-cp313-win_amd64.whl (2908.4 MB)
   -

  You can safely remove it manually.


## Complete Offline Voice Assistant



In [None]:
import pyaudio
import wave
import numpy as np
from faster_whisper import WhisperModel
import ollama
import pyttsx3
import time
from collections import deque

class PremiumAssistant:
    """Offline voice assistant: microphone -> Whisper STT -> Ollama LLM -> pyttsx3 TTS.

    Typical use is ``PremiumAssistant().run()``, which blocks in a
    listen / transcribe / respond / speak loop until the user says one of
    ``EXIT_COMMANDS`` or the loop is interrupted with ``KeyboardInterrupt``.
    """

    def __init__(self, model_size="medium.en", device="cuda",
                 compute_type="float16", llm_model="mistral"):
        """Load the speech model, configure TTS and open the microphone stream.

        Args:
            model_size: faster-whisper model name passed to ``WhisperModel``.
            device: Inference device for Whisper (default ``"cuda"``); pass
                ``"cpu"`` on machines without a usable GPU.
            compute_type: Whisper compute type (default ``"float16"``); on CPU
                a type such as ``"int8"`` is usually required.
            llm_model: Name of the Ollama model used by ``generate_response``.
        """
        # Audio Configuration
        self.CHUNK = 1024
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000
        self.SILENCE_THRESHOLD = 300  # RMS below this counts as silence
        self.MIN_VOICE_DURATION = 0.8  # Seconds (not referenced by any method in this class)
        self.MAX_RECORD_SECONDS = 10  # Safety limit

        # Conversation Control
        self.EXIT_COMMANDS = {
            "exit", "quit", "bye", "goodbye", 
            "stop", "end", "terminate", "i'm done",
            "that's all", "close", "shutdown"
        }
        self.llm_model = llm_model

        # Context memory: alternating "User:" / "Assistant:" lines.
        # 10 entries keeps roughly the last five exchanges.
        self.conversation_history = deque(maxlen=10)

        print("🚀 Loading Premium Whisper model...")
        self.stt_model = WhisperModel(
            model_size,
            device=device,
            compute_type=compute_type
        )

        print("🔊 Initializing TTS Engine...")
        self.tts = pyttsx3.init()
        self._configure_tts()
        self._init_audio()

    def _configure_tts(self):
        """Select a voice and set speaking rate and volume on the TTS engine."""
        voices = self.tts.getProperty('voices')
        # Prefer the second installed voice, but do not crash on systems that
        # only ship one (or none): the engine default is kept in that case.
        if voices and len(voices) > 1:
            self.tts.setProperty('voice', voices[1].id)
        self.tts.setProperty('rate', 150)  # Slightly slower
        self.tts.setProperty('volume', 0.9)

    def _init_audio(self):
        """Create the PyAudio instance and open the input stream."""
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
            input_device_index=self._get_best_mic()
        )

    def _get_best_mic(self):
        """Return the device index of the first input-capable device.

        Only devices of host API 0 are scanned.

        Returns:
            The *global* device index (the value ``PyAudio.open`` expects), or
            ``None`` when no input device is found so that the system default
            input device is used.
        """
        info = self.p.get_host_api_info_by_index(0)
        num_devices = info.get('deviceCount') or 0

        for i in range(num_devices):
            dev = self.p.get_device_info_by_host_api_device_index(0, i)
            if dev.get('maxInputChannels', 0) > 0:
                print(f"🎤 Found mic: {dev['name']}")
                # `i` is relative to the host API; open() needs the global index.
                return dev.get('index', i)
        return None

    def _dynamic_silence_detection(self, audio_chunk):
        """Return True when the chunk's RMS amplitude is below SILENCE_THRESHOLD.

        Args:
            audio_chunk: 1-D array of PCM samples (int16 as produced by
                ``record_voice``).

        Returns:
            bool: True for silence (an empty chunk counts as silence).
        """
        samples = np.asarray(audio_chunk, dtype=np.float64)
        if samples.size == 0:
            return True
        # Square in float64: squaring int16 in place wraps around for any
        # |sample| > 181 and yields a meaningless (even NaN) RMS.
        rms = np.sqrt(np.mean(np.square(samples)))
        return bool(rms < self.SILENCE_THRESHOLD)

    def record_voice(self):
        """Record from the microphone until the speaker stops or the time limit hits.

        Recording ends after more than five consecutive silent chunks that
        follow detected speech, or after MAX_RECORD_SECONDS.

        Returns:
            bytes: Raw PCM frames for everything captured.
        """
        print("\n🔴 Recording... (Speak now)")
        frames = []
        silent_chunks = 0
        voice_detected = False
        start_time = time.time()

        while (time.time() - start_time) < self.MAX_RECORD_SECONDS:
            data = self.stream.read(self.CHUNK, exception_on_overflow=False)
            audio_chunk = np.frombuffer(data, dtype=np.int16)
            frames.append(data)

            # Voice activity detection
            if self._dynamic_silence_detection(audio_chunk):
                if voice_detected:
                    silent_chunks += 1
                    if silent_chunks > 5:  # End of speech
                        break
            else:
                voice_detected = True
                silent_chunks = 0

        return b''.join(frames)

    def transcribe(self, audio_bytes):
        """Transcribe raw PCM audio to text with the Whisper model.

        The audio is written to ``temp.wav`` in the working directory and then
        passed to the model.

        Args:
            audio_bytes: Raw 16-bit PCM frames at ``self.RATE`` / ``self.CHANNELS``.

        Returns:
            str: Stripped transcript; empty when nothing confident was heard.
        """
        with wave.open("temp.wav", "wb") as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(2)  # 16-bit samples
            wf.setframerate(self.RATE)
            wf.writeframes(audio_bytes)

        segments, info = self.stt_model.transcribe(
            "temp.wav",
            beam_size=5,
            best_of=5,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=500)
        )

        # Drop whole segments the model considers likely to be non-speech
        # (the filter is per segment, not per word).
        clean_text = " ".join(
            segment.text for segment in segments
            if segment.no_speech_prob < 0.4
        )
        return clean_text.strip()

    def _is_exit_command(self, text):
        """Return True when `text` contains an exit command as whole word(s).

        Matching is case-insensitive and ignores punctuation, but unlike a
        plain substring test it does not fire on words that merely contain a
        command ("weekend", "friend", "closet", ...).
        """
        if not text:
            return False
        lowered = text.lower().replace("\u2019", "'")
        # Keep letters, digits and apostrophes; everything else separates words.
        words = "".join(
            ch if (ch.isalnum() or ch == "'") else " " for ch in lowered
        ).split()
        padded = " " + " ".join(words) + " "
        return any(f" {cmd} " in padded for cmd in self.EXIT_COMMANDS)

    def generate_response(self, text):
        """Produce the assistant's reply for one user utterance.

        Args:
            text: Transcribed user speech.

        Returns:
            str: A canned prompt-to-repeat for empty input, a canned goodbye
            for exit commands, otherwise the LLM's reply.
        """
        if not text:
            return "I didn't catch that. Could you repeat?"

        # Check exit commands
        if self._is_exit_command(text):
            return "Goodbye! It was great chatting with you."

        # Add to conversation history
        self.conversation_history.append(f"User: {text}")

        # Craft prompt with context
        prompt = (
            "Respond concisely (1-2 sentences max). "
            "Current conversation:\n" +
            "\n".join(self.conversation_history) +
            "\nAssistant:"
        )

        # `options` must be a keyword of chat(); nested inside the message
        # dict it never reaches the model.
        response = ollama.chat(
            model=getattr(self, "llm_model", "mistral"),
            messages=[{
                'role': 'user',
                'content': prompt,
            }],
            options={
                'temperature': 0.7,
                'num_ctx': 2048
            }
        )

        reply = response['message']['content']
        # Remember our own answer so the next prompt carries both sides.
        self.conversation_history.append(f"Assistant: {reply.strip()}")
        return reply

    def run(self):
        """Run the interactive loop until an exit command or KeyboardInterrupt.

        The audio stream and PyAudio instance are always released on exit.
        """
        print("\n" + "🌟"*30)
        print("   PREMIUM VOICE ASSISTANT ACTIVATED")
        print("🌟"*30 + "\n")
        print("Say one of these to exit:")
        print(", ".join(sorted(self.EXIT_COMMANDS)))

        try:
            while True:
                # Record & process
                audio = self.record_voice()
                text = self.transcribe(audio)

                if not text:
                    print("🔇 No speech detected")
                    continue

                print(f"\nYou: {text}")

                # Decide on exit from what the user said, not from the reply:
                # an LLM answer that happens to contain "exit" or "goodbye"
                # must not end the session.
                wants_exit = self._is_exit_command(text)

                # Generate & speak response
                response = self.generate_response(text)
                print(f"🤖 Ro: {response}")

                self.tts.say(response)
                self.tts.runAndWait()

                if wants_exit:
                    break

        except KeyboardInterrupt:
            self.tts.say("Assistant shutting down")
            self.tts.runAndWait()
        finally:
            self.stream.stop_stream()
            self.stream.close()
            self.p.terminate()

# Launch the premium assistant.
# Constructing PremiumAssistant loads the Whisper model, initializes the TTS
# engine and opens the microphone stream; run() then blocks in the
# record/transcribe/respond loop until that loop breaks or a
# KeyboardInterrupt is raised.
assistant = PremiumAssistant()
assistant.run()