In [None]:
import os                                  # For file path operations
import csv                                 # For CSV reading/writing
import io                                  # For in-memory binary streams
import whisper                             # For transcription using the Whisper model
import sounddevice as sd                   # For audio recording from the microphone
import numpy as np                         # For numerical operations (arrays)
import time                                # For timekeeping and delays
import webrtcvad                           # For Voice Activity Detection (VAD)
import re                                  # For regular expression text cleaning
import pandas as pd                        # For DataFrame operations (CSV processing)
import requests                            # For HTTP requests (LLM API calls)
from concurrent.futures import ThreadPoolExecutor  # For concurrent processing (used later)
import simpleaudio as sa                   # For audio playback
import soundfile as sf                     # For reading audio files
import threading                           # For multithreading (monitoring mic)
from TTS.api import TTS                    # For text-to-speech conversion
import queue                               # For thread-safe task queue

########################################
# Global Settings and Utility Functions
########################################

# CSV file on the user's Desktop that all three phases read from and append to
csv_path = os.path.join(os.path.expanduser("~"), "Desktop", "transcriptions.csv")

# Lock held by the code paths that append rows to the CSV with csv.writer
csv_lock = threading.Lock()

# (The transcription_queue and background worker below remain for legacy or future use.)
transcription_queue = queue.Queue()

# Load the Whisper "small" model once at import time; it is shared by every transcription call
whisper_model = whisper.load_model("small")
# Select audio device index 0 for all sounddevice streams.
# NOTE(review): index 0 is the first enumerated device, which is not necessarily
# the system's default microphone -- confirm on the target machine.
sd.default.device = 0

# WebRTC VAD instance used by is_speech() during the transcription phase
# (the playback phase creates its own instance, vad_playback)
vad = webrtcvad.Vad()
vad.set_mode(2)  # Aggressiveness 0-3: 0 is least aggressive about filtering out non-speech, 3 is most aggressive

# Audio recording settings
SAMPLE_RATE = 16000                      # Audio sample rate in Hz
FRAME_DURATION = 30                      # Frame duration in milliseconds
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION / 1000)  # Samples per frame (480 at 16 kHz / 30 ms)

def is_speech(frame):
    """
    Report whether the module-level VAD classifies one audio frame as speech.

    The frame is serialized with ``tobytes()`` and evaluated at SAMPLE_RATE.
    """
    raw_pcm = frame.tobytes()
    verdict = vad.is_speech(raw_pcm, SAMPLE_RATE)
    return verdict

def record_audio(timeout=None):
    """
    Capture one utterance from the microphone.

    Frames of FRAME_SIZE samples are read continuously. Only frames the VAD
    flags as speech are stored. Once speech has started, more than 100
    consecutive non-speech frames (about 3 seconds at 30 ms per frame) end
    the capture.

    timeout: seconds to wait for the *first* speech frame. It is ignored once
             speech has started; None waits indefinitely.

    Returns the voiced frames concatenated as float32 scaled by 1/32768, or
    None when the timeout expires before any speech (or nothing was stored).
    """
    print("🎤 Transcription Phase: Listening...")
    voiced_frames = []          # Speech frames collected so far
    quiet_streak = 0            # Consecutive non-speech frames since the last speech frame
    speech_started = False      # Becomes True at the first speech frame
    has_deadline = timeout is not None
    began_at = time.time() if has_deadline else None

    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16") as mic:
        while True:
            # The deadline only applies while we are still waiting for speech.
            if has_deadline and not speech_started:
                if time.time() - began_at > timeout:
                    print("⏰ Timeout reached without speech detection.")
                    return None

            block, _ = mic.read(FRAME_SIZE)
            samples = block[:, 0]  # Take the single channel as a 1-D array

            if is_speech(samples):
                if not speech_started:
                    print("🎙️ Speech detected, recording...")
                    speech_started = True
                voiced_frames.append(samples)
                quiet_streak = 0
                continue

            if not speech_started:
                # Silence before any speech: keep waiting.
                continue

            quiet_streak += 1
            if quiet_streak > 100:
                print("🔇 Speech ended, processing segment...")
                break

    if not voiced_frames:
        return None
    return np.concatenate(voiced_frames).astype(np.float32) / 32768.0

def clean_text(text):
    """
    Strip every character that is not a word character, whitespace, or one of
    , . ! ? - and return the result.

    If fewer than three whitespace-separated words remain, the fixed prompt
    "Please provide more details." is returned instead.
    """
    stripped = re.sub(r'[^\w\s,.!?-]', '', text)
    word_count = len(stripped.split())
    return stripped if word_count >= 3 else "Please provide more details."

########################################
# Utility Function for Dynamic RMS Filtering (Removing Long Silences Only)
########################################

def filter_audio_by_rms(audio, frame_size=FRAME_SIZE, ema_alpha=0.1, multiplier=0.5, allowed_silence_frames=3):
    """
    Remove long quiet stretches from ``audio`` while keeping short pauses.

    The audio is processed in consecutive frames of ``frame_size`` samples
    (the last frame may be shorter). For each frame the RMS is computed and
    folded into an exponential moving average (EMA); the frame counts as
    "loud" when its RMS is at least ``EMA * multiplier``. The EMA is seeded
    with the first frame's RMS, so with the default multiplier the first
    frame is always kept.

    Loud frames are always kept. Quiet frames are buffered, and when the next
    loud frame arrives (or the audio ends) only the last
    ``allowed_silence_frames`` of the buffered quiet frames are kept.

    Parameters:
        audio: 1-D NumPy array of samples.
        frame_size: number of samples per analysis frame.
        ema_alpha: weight of the current frame's RMS in the EMA update.
        multiplier: fraction of the EMA used as the loudness threshold.
        allowed_silence_frames: maximum number of quiet frames kept per quiet
            stretch. Zero or a negative value keeps none.

    Returns:
        The kept frames concatenated, or an empty float32 array when no frame
        is kept (including empty input).
    """
    # Slicing with [-0:] returns the WHOLE list, so a zero allowance used to
    # keep every quiet frame instead of none. Handle the tail length explicitly.
    keep = max(allowed_silence_frames, 0)

    def _kept_tail(quiet_frames):
        """Return the trailing quiet frames that are allowed to survive."""
        return quiet_frames[-keep:] if keep else []

    processed_frames = []
    silence_buffer = []
    ema = None  # Seeded with the first frame's RMS

    for start in range(0, len(audio), frame_size):
        frame = audio[start:start + frame_size]
        rms = np.sqrt(np.mean(frame**2))

        if ema is None:
            ema = rms
        else:
            ema = ema_alpha * rms + (1 - ema_alpha) * ema

        # The threshold follows the signal level instead of being a fixed constant.
        dynamic_threshold = ema * multiplier

        if rms >= dynamic_threshold:
            # A loud frame closes the current quiet stretch: keep only its tail.
            if silence_buffer:
                processed_frames.extend(_kept_tail(silence_buffer))
                silence_buffer = []
            processed_frames.append(frame)
        else:
            silence_buffer.append(frame)

    # Trailing quiet stretch at the end of the audio.
    if silence_buffer:
        processed_frames.extend(_kept_tail(silence_buffer))

    if processed_frames:
        return np.concatenate(processed_frames)
    return np.array([], dtype=np.float32)

########################################
# (Legacy) Background Transcription Worker
########################################

def transcription_worker():
    """
    Background worker that continuously processes audio segments from
    transcription_queue.

    Each segment is transcribed with the shared Whisper model and appended to
    the CSV as a row of [timestamp, transcription, False]. A ``None`` item is
    the shutdown sentinel and ends the loop.

    ``task_done()`` is called for every item taken from the queue, including
    the sentinel and items whose processing raised, so that
    ``transcription_queue.join()`` cannot hang on an unacknowledged item.
    An exception raised while processing a segment still propagates and ends
    the worker thread.
    """
    while True:
        segment = transcription_queue.get()
        try:
            if segment is None:
                break  # Sentinel to shut down
            result = whisper_model.transcribe(segment, fp16=False)
            transcription = result.get("text", "").strip()
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
            with csv_lock:
                # Explicit UTF-8: the same file is read and rewritten by pandas,
                # whose default CSV encoding is UTF-8.
                with open(csv_path, "a", newline="", encoding="utf-8") as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow([timestamp, transcription, False])
            print("📝 Transcription:", transcription, "\n")
        finally:
            # Runs on normal completion, on the sentinel `break`, and on errors.
            transcription_queue.task_done()

# Start the legacy worker at import time. It is a daemon thread, so it does not
# keep the interpreter alive; it blocks on transcription_queue.get() until an
# item (or the None sentinel) is queued.
worker_thread = threading.Thread(target=transcription_worker, daemon=True)
worker_thread.start()

########################################
# Phase 1: Transcription (Modified for Concurrent Processing)
########################################

def transcribe_segment(segment):
    """Run the shared Whisper model on one audio segment and return its stripped text ("" when absent)."""
    output = whisper_model.transcribe(segment, fp16=False)
    text = output.get("text", "")
    return text.strip()

def transcribe_audio(max_duration=60):
    """
    Run one transcription session and append its result to the CSV.

    A session captures at most two audio segments:
      1. a primary segment, waiting indefinitely for speech to start;
      2. an additional segment, abandoned if no speech starts within 3 seconds.
    Each captured segment is RMS-filtered and, if anything remains, submitted
    to a thread pool for transcription so that listening for the next segment
    starts without waiting for the previous transcription. When capture is
    over, all transcriptions are awaited, the non-blank ones are joined with
    spaces, and a single row [timestamp, text, False] is appended to the CSV.

    The CSV is created with the header Timestamp, Transcription, Played if it
    does not exist yet.

    max_duration: accepted for backward compatibility. NOTE: it is not
                  enforced; the session length is bounded only by
                  record_audio's silence and timeout rules.
    """
    # Create the file under the lock so header creation cannot interleave with
    # another thread appending rows. UTF-8 matches pandas' default CSV encoding.
    with csv_lock:
        if not os.path.exists(csv_path):
            with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
                csv.writer(csvfile).writerow(["Timestamp", "Transcription", "Played"])

    # (speech-start timeout, message when nothing was recorded,
    #  message when the RMS filter removed everything)
    capture_plan = (
        (
            None,
            "No primary audio captured.",
            "No frames passed the RMS filter for the primary segment; skipping.",
        ),
        (
            3,
            "🛑 Silence detected for 3 seconds. Ending transcription session.",
            "No frames passed the RMS filter for the additional segment; ending session.",
        ),
    )

    transcription_futures = []
    transcribed_segments = []

    with ThreadPoolExecutor(max_workers=4) as executor:
        for timeout, no_audio_message, filtered_out_message in capture_plan:
            audio = record_audio(timeout=timeout)
            if audio is None:
                print(no_audio_message)
                continue
            filtered = filter_audio_by_rms(audio)
            if filtered.size == 0:
                print(filtered_out_message)
                continue
            # Transcribe in the background while the next segment is recorded.
            transcription_futures.append(executor.submit(transcribe_segment, filtered))

        # Collect results in submission order so the text keeps spoken order.
        for future in transcription_futures:
            transcription = future.result()
            if transcription.strip():
                transcribed_segments.append(transcription)
            else:
                print("A segment produced a blank transcription; skipping.")

    aggregated_text = " ".join(transcribed_segments)
    if not aggregated_text:
        print("No valid transcriptions captured in this session.")
        return

    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    with csv_lock:
        with open(csv_path, "a", newline="", encoding="utf-8") as csvfile:
            csv.writer(csvfile).writerow([timestamp, aggregated_text, False])
    print("📝 Aggregated Transcription:", aggregated_text, "\n")

########################################
# Phase 2: LLM Response Generation (with Conciseness Step)
########################################

def run_llm_response_generation():
    """
    Generate assistant responses for every CSV row that does not have one yet.

    Each pending transcription goes through two chat-completion calls against
    the local LLM endpoint:
      1. a summarizer prompt that makes the transcription concise;
      2. the assistant prompt, fed with the concise text, that produces the
         final response.
    Rows with a missing Transcription are dropped, pending rows are processed
    concurrently in a thread pool, and the CSV is rewritten with the
    AssistantResponse column filled in.

    Failure handling (per row, never raises for LLM problems):
      - summarizer failure: falls back to the original transcription;
      - assistant request/HTTP failure: stores "Error generating response.";
      - assistant reply with an unexpected shape: stores "No response generated.".
    """
    gemma2b_local_api = "http://192.168.0.193:1234/v1/chat/completions"
    # (connect, read) timeouts in seconds. Without a timeout a stalled server
    # would block this phase, and therefore the whole pipeline, forever.
    request_timeout = (10, 120)

    def chat_completion(system_prompt, user_text, max_tokens):
        """
        POST one system+user exchange and return the stripped reply text.

        Raises requests.RequestException on transport/HTTP errors and
        ValueError when the body is not JSON or lacks the expected fields.
        """
        payload = {
            "model": "gemma-2-2b-it",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text},
            ],
            "temperature": 0.7,
            "max_tokens": max_tokens,
            "stream": False,
        }
        response = requests.post(
            gemma2b_local_api,
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=request_timeout,
        )
        response.raise_for_status()
        data = response.json()  # May raise a JSON decode error (a ValueError)
        try:
            return data["choices"][0]["message"]["content"].strip()
        except (KeyError, IndexError, TypeError, AttributeError) as exc:
            # Missing keys, empty choices, non-dict body, or content of None.
            raise ValueError("unexpected chat completion payload") from exc

    def make_concise(transcription):
        """Return a concise version of the transcription, or the original on any failure."""
        try:
            return chat_completion(
                "You are an expert summarizer. Make the following transcription concise while preserving its meaning.",
                transcription,
                300,
            )
        except requests.RequestException as e:
            print(f"Error generating concise transcription for: '{transcription}': {e}")
            return transcription  # Fallback to original if error occurs
        except ValueError:
            return transcription

    def generate_assistant_response(transcription):
        """Return the assistant's reply, or a fixed placeholder string on failure."""
        try:
            return chat_completion(
                "You are Satya, a personal assistant. Provide a concise, helpful, and friendly response. "
                "Do not include any emojis in the output. There should be no symbols or special characters in your output.",
                transcription,
                500,
            )
        except requests.RequestException as e:
            print(f"Error generating assistant response for transcription: '{transcription}': {e}")
            return "Error generating response."
        except ValueError:
            return "No response generated."

    def process_transcription(transcription):
        # pandas may parse a purely numeric transcription as a NumPy number,
        # which is not JSON serializable; always send text.
        concise = make_concise(str(transcription))
        return generate_assistant_response(concise)

    df = pd.read_csv(csv_path)
    df = df.dropna(subset=["Transcription"])

    # Keep the response column as object dtype. Rows appended by csv.writer
    # have no value in this column, so an all-empty column is parsed as
    # float64 and assigning strings into it is a deprecated silent upcast.
    if "AssistantResponse" in df.columns:
        df["AssistantResponse"] = df["AssistantResponse"].astype(object)
    else:
        df["AssistantResponse"] = pd.Series(index=df.index, dtype=object)

    needs_response = df["AssistantResponse"].isnull() | (df["AssistantResponse"] == "")
    df_to_process = df[needs_response]

    if not df_to_process.empty:
        with ThreadPoolExecutor() as executor:
            responses = list(executor.map(process_transcription, df_to_process["Transcription"]))
        df.loc[df_to_process.index, "AssistantResponse"] = responses

    df.to_csv(csv_path, index=False)
    print("LLM Response Generation: Completed and CSV updated.")

########################################
# Phase 3: TTS Playback
########################################

# Text-to-speech model used by run_tts_playback, loaded once at import time (gpu=False)
tts_playback = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)
# Separate VAD instance used by the playback monitors to detect the user speaking
vad_playback = webrtcvad.Vad()
vad_playback.set_mode(2)
# Set to True by the monitor functions when user speech is detected; read by run_tts_playback
# and reset to False by the main loop before each playback phase
global_stop = False
# Set to make monitor_continuous leave its loop; cleared at the start of run_tts_playback
monitor_stop_event = threading.Event()

def monitor_continuous(amp_threshold=2100, required_consecutive=5):
    """
    Listen on the microphone for the whole playback phase and flag an interruption.

    A frame counts as user speech when its mean absolute amplitude exceeds
    ``amp_threshold`` AND the playback VAD classifies it as speech. After
    ``required_consecutive`` such frames in a row, ``global_stop`` is set to
    True and ``monitor_stop_event`` is set, ending the loop. The loop also
    ends when ``monitor_stop_event`` is set from elsewhere.
    """
    global global_stop
    streak = 0  # Consecutive frames judged to be user speech
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16") as mic:
        while not monitor_stop_event.is_set():
            block, _ = mic.read(FRAME_SIZE)
            mono = block[:, 0]
            loud_enough = np.abs(mono).mean() > amp_threshold
            # The VAD is consulted only for frames that pass the amplitude gate.
            if not (loud_enough and vad_playback.is_speech(mono.tobytes(), SAMPLE_RATE)):
                streak = 0
                time.sleep(0.01)
                continue

            streak += 1
            if streak >= required_consecutive:
                print("Continuous user speech detected. Interrupting TTS phase.")
                global_stop = True
                monitor_stop_event.set()
                break
            time.sleep(0.01)

def monitor_mic(play_obj, amp_threshold=2100, required_consecutive=5):
    """
    Watch the microphone while one clip is playing and stop the clip on user speech.

    Runs for as long as ``play_obj.is_playing()`` is true. A frame counts as
    user speech when its mean absolute amplitude exceeds ``amp_threshold``
    AND the playback VAD classifies it as speech. After
    ``required_consecutive`` such frames in a row, playback is stopped and
    ``global_stop`` is set to True.
    """
    global global_stop
    hits_in_a_row = 0  # Consecutive frames judged to be user speech
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16") as mic:
        while play_obj.is_playing():
            block, _ = mic.read(FRAME_SIZE)
            mono = block[:, 0]
            level = np.abs(mono).mean()
            # Short-circuit: the VAD runs only when the amplitude gate passes.
            voiced = level > amp_threshold and vad_playback.is_speech(mono.tobytes(), SAMPLE_RATE)
            hits_in_a_row = hits_in_a_row + 1 if voiced else 0
            if voiced and hits_in_a_row >= required_consecutive:
                print("User speech detected during clip playback. Stopping playback.")
                play_obj.stop()
                global_stop = True
                break
            time.sleep(0.01)

def run_tts_playback():
    """
    Speak every unplayed assistant response from the CSV, allowing the user to interrupt.

    A continuous microphone monitor runs for the whole phase, and a per-clip
    monitor runs while each clip plays. When either sets ``global_stop``, the
    current row and all later rows are marked as played, the CSV is saved,
    and the phase ends. Otherwise each row is marked as played after its clip
    finishes. An error while synthesizing or playing one row is printed and
    the loop moves on to the next row.

    Note: rows without an AssistantResponse are dropped before playback, and
    the CSV is rewritten from that filtered frame.

    The continuous monitor thread is always stopped and joined, even when
    reading the CSV or processing rows raises, so a failure here cannot leave
    a non-daemon thread holding the microphone open.
    """
    # global_stop is only read here; the monitors and the main loop write it.
    monitor_stop_event.clear()
    continuous_monitor_thread = threading.Thread(target=monitor_continuous)
    continuous_monitor_thread.start()

    try:
        df = pd.read_csv(csv_path)
        df = df.dropna(subset=["AssistantResponse"])
        if "Played" not in df.columns:
            df["Played"] = False

        for idx, row in df.iterrows():
            if global_stop:
                print("User speech detected; marking remaining rows as played.")
                df.loc[idx:, "Played"] = True
                df.to_csv(csv_path, index=False)
                break
            if row["Played"]:
                print(f"Skipping row {idx} as it has already been played.")
                continue
            text = clean_text(str(row["AssistantResponse"]).strip())
            if not text:
                print(f"Skipping row {idx} due to empty response after cleaning")
                continue
            print(f"Speaking for row {idx}: {text}")
            try:
                # Synthesize into memory, then decode to 16-bit PCM for playback.
                audio_buffer = io.BytesIO()
                tts_playback.tts_to_file(text=text, file_path=audio_buffer)
                audio_buffer.seek(0)
                data, samplerate = sf.read(audio_buffer, dtype='int16')
                audio_array = np.array(data, dtype=np.int16).tobytes()
                wave_obj = sa.WaveObject(audio_array, num_channels=1, bytes_per_sample=2, sample_rate=samplerate)
                play_obj = wave_obj.play()
                # Per-clip monitor: stops this clip as soon as the user speaks.
                monitor_thread = threading.Thread(target=monitor_mic, args=(play_obj,))
                monitor_thread.start()
                play_obj.wait_done()
                monitor_thread.join()
                if global_stop:
                    print("User speech detected during playback; marking remaining rows as played.")
                    df.loc[idx:, "Played"] = True
                    df.to_csv(csv_path, index=False)
                    break
                df.at[idx, "Played"] = True
                # Save after every clip so progress survives an abrupt exit.
                df.to_csv(csv_path, index=False)
            except Exception as e:
                print(f"Error processing row {idx}: {e}")
        df.to_csv(csv_path, index=False)
    finally:
        # Always release the continuous monitor (and its mic stream).
        monitor_stop_event.set()
        continuous_monitor_thread.join()

########################################
# Main Execution Pipeline (Continuous Loop)
########################################

if __name__ == '__main__':
    try:
        # Run the three phases back-to-back in an endless loop; Ctrl+C ends it.
        while True:
            print("\n--- Starting Transcription Phase ---")
            # Blocks until a session's speech has been captured and written to the CSV
            transcribe_audio(max_duration=60)
            
            print("\n--- Running LLM Response Generation Phase ---")
            run_llm_response_generation()
            
            # Clear any interruption flag left by the previous cycle's playback monitors
            global_stop = False
            print("\n--- Running TTS Playback Phase ---")
            run_tts_playback()
            
            print("\n--- Cycle complete. Restarting in 1 second... ---\n")
            time.sleep(1)
    except KeyboardInterrupt:
        print("\n🛑 Transcription interrupted by user.")
        # Send the shutdown sentinel to the legacy worker and wait for it to exit
        transcription_queue.put(None)
        worker_thread.join()
