## Setup

In [None]:
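# Install required Python packages (run once).
# Note: the `ollama` package is only a client library; the Ollama server must be
# installed and running locally with the `mistral` model pulled.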
!pip install pyaudio faster-whisper ollama pyttsx3

## Complete Offline Voice Assistant



In [None]:
import io
import time
import wave

import numpy as np
import ollama
import pyaudio
import pyttsx3
from faster_whisper import WhisperModel

class OfflineAssistant:
    """Voice assistant loop: microphone -> Whisper -> Ollama chat -> TTS.

    Audio is captured with PyAudio, transcribed with a faster-whisper
    ``WhisperModel``, answered through ``ollama.chat`` and spoken with
    ``pyttsx3``.

    Args:
        stt_model_name: Whisper model size/name passed to ``WhisperModel``.
        stt_device: Device passed to ``WhisperModel`` ("cpu", "cuda" or
            "auto"). Defaults to "cpu", which needs no GPU libraries.
        llm_model: Name of the Ollama model used to generate replies.
    """

    # Spoken words that end the session; matched against whole words only.
    _EXIT_WORDS = frozenset({"exit", "quit", "bye", "goodbye"})

    def __init__(self, stt_model_name="base.en", stt_device="cpu", llm_model="mistral"):
        # Audio config
        self.CHUNK = 1024  # Samples read from the stream per iteration
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000
        self.SILENCE_LIMIT = 2  # Seconds of silence to stop recording
        self.SILENCE_THRESHOLD = 500  # Mean |amplitude|; adjust based on your mic

        self.llm_model = llm_model

        # Initialize models
        print("Loading Whisper...")
        # An empty device string is not a valid device name; use an explicit one.
        self.stt_model = WhisperModel(stt_model_name, device=stt_device)
        print("Loading TTS...")
        self.tts = pyttsx3.init()
        self._setup_audio()

    def _setup_audio(self):
        """Open the PyAudio input stream using the configured audio settings."""
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK
        )

    def record_voice(self):
        """Record from the input stream until sustained silence is detected.

        Recording stops once more than ``SILENCE_LIMIT`` seconds of
        consecutive quiet chunks have been read. Leading and trailing silence
        is discarded, but quiet chunks *between* loud chunks are kept so that
        pauses inside an utterance are not cut out of the audio.

        Returns:
            bytes: Raw 16-bit PCM audio, or ``b""`` if nothing loud was heard.
        """
        print("\nSpeak now...")
        frames = []
        pending_silence = []  # Quiet chunks seen since the last loud chunk
        silent_chunks = 0
        # Number of consecutive quiet chunks that make up SILENCE_LIMIT seconds.
        max_silent_chunks = self.SILENCE_LIMIT * (self.RATE / self.CHUNK)

        while True:
            data = self.stream.read(self.CHUNK, exception_on_overflow=False)
            # Widen before abs(): abs(-32768) wraps to -32768 in int16.
            samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)

            if np.abs(samples).mean() < self.SILENCE_THRESHOLD:
                silent_chunks += 1
                if silent_chunks > max_silent_chunks:
                    break
                if frames:
                    # Might be a pause mid-utterance; keep it only if speech resumes.
                    pending_silence.append(data)
            else:
                silent_chunks = 0
                frames.extend(pending_silence)
                pending_silence.clear()
                frames.append(data)

        return b"".join(frames)

    def transcribe(self, audio_bytes):
        """Convert recorded speech to text.

        Args:
            audio_bytes: Raw 16-bit PCM audio as returned by ``record_voice``.

        Returns:
            str: The transcription, or ``""`` when ``audio_bytes`` is empty.
        """
        if not audio_bytes:
            # Nothing was recorded; skip the model call entirely.
            return ""

        # Build the WAV container in memory instead of writing a fixed
        # "temp.wav" into the working directory on every call.
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, "wb") as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(2)  # 16-bit = 2 bytes
            wf.setframerate(self.RATE)
            wf.writeframes(audio_bytes)
        wav_buffer.seek(0)

        segments, _ = self.stt_model.transcribe(wav_buffer)
        return " ".join(segment.text.strip() for segment in segments).strip()

    @staticmethod
    def _is_exit_command(text):
        """Return True if ``text`` contains an exit word as a whole word."""
        words = {
            word.strip(".,!?;:'\"")
            for word in text.lower().replace("-", " ").split()
        }
        return not words.isdisjoint(OfflineAssistant._EXIT_WORDS)

    def respond(self, text):
        """Produce the assistant's reply to ``text``.

        Args:
            text: The transcribed user utterance.

        Returns:
            str: A fixed message for empty input or an exit command, otherwise
            the content of the Ollama chat response.
        """
        if not text:
            return "I didn't hear that"

        if self._is_exit_command(text):
            return "Goodbye!"

        response = ollama.chat(
            model=self.llm_model,
            messages=[{'role': 'user', 'content': text}]
        )
        return response['message']['content']

    def run(self):
        """Run the record -> transcribe -> respond -> speak loop until exit.

        The loop ends when the user speaks an exit command. The audio stream
        and PyAudio instance are always released, even if an error occurs.
        """
        print("Ro Assistant Ready! (Say 'exit' to quit)")
        try:
            while True:
                # Record
                audio = self.record_voice()

                # Transcribe
                text = self.transcribe(audio)
                if not text:
                    print("No speech detected")
                    continue

                print(f"You: {text}")

                # Respond
                reply = self.respond(text)
                print(f"Ro: {reply}")
                self.tts.say(reply)
                self.tts.runAndWait()

                # Decide on the user's words, not the reply: an LLM answer
                # that merely contains "goodbye" must not end the session.
                if self._is_exit_command(text):
                    break

        finally:
            self.stream.stop_stream()
            self.stream.close()
            self.p.terminate()

# Start the assistant only when this code is executed directly (script or
# notebook cell), so importing it does not open the microphone or load models.
if __name__ == "__main__":
    assistant = OfflineAssistant()
    assistant.run()