In [None]:
# Imports: vision/AI, camera, text-to-speech and web-server modules
import os
import subprocess
import threading
import time
import uuid

import cv2
import google.generativeai as genai
from flask import Flask, jsonify, request
from gtts import gTTS
from PIL import Image

In [None]:
# --- CONFIGURATION ---
app = Flask(__name__)

# Read the Gemini API key from the environment so the real key never has to be
# saved inside the notebook, e.g.:  export GOOGLE_API_KEY="..."
# The placeholder is used only when the variable is not set; replace it with
# your own key if you prefer to configure the notebook by hand.
api_key = os.environ.get("GOOGLE_API_KEY", "Won't give u my key")
genai.configure(api_key=api_key)

# Model used for every vision request
model = genai.GenerativeModel('gemini-2.5-flash')

In [None]:
# --- SHARED STATE (read and written by the route handlers) ---

# Name of the vision mode whose prompt the next scan will use.
current_mode: str = "object_detection"

# Time of the most recent button press. Each request remembers the value it
# set here and compares against it later to find out whether it is still the
# newest request.
latest_request_time: float = 0

In [None]:
def stop_current_audio():
    """Stop any audio that is currently playing by killing ``mpg321`` processes.

    This is best-effort: a missing ``pkill`` binary or any other OS-level
    failure to launch it is reported on stdout and otherwise ignored, so a
    request is never aborted just because the old audio could not be stopped.

    Returns:
        None.
    """
    try:
        # No shell is involved. The exit status is deliberately ignored
        # (check=False) because "nothing was playing" is a normal situation.
        subprocess.run(["pkill", "mpg321"], check=False)
    except OSError as e:
        # Only launch failures are handled here; unlike a bare ``except`` this
        # lets KeyboardInterrupt / SystemExit propagate.
        print(f"Audio Error: {e}")



In [None]:
def speak_text_google(text, request_time):
    """Speak ``text`` aloud, but only while this is still the latest request.

    The text is stripped of ``*`` and ``#`` characters, synthesized to a
    uniquely named MP3 file with gTTS and played with ``mpg321``. The
    freshness check (``request_time == latest_request_time``) is made before
    synthesis and again before playback, so a newer request silently cancels
    an older one.

    Args:
        text: The text to speak.
        request_time: The identifier of the request that asked for the speech;
            compared against the module-level ``latest_request_time``.

    Returns:
        None. Synthesis and playback errors are printed, not raised.
    """
    global latest_request_time
    if request_time != latest_request_time:
        print(" -> Old request. Speech cancelled.")
        return

    # Cleaning text
    clean_text = text.replace("*", "").replace("#", "").strip()
    if not clean_text:
        # Nothing left to synthesize once the markup characters are removed.
        print(" -> Nothing to speak.")
        return
    print(f"Speaking: {clean_text}")

    filename = f"speech_{uuid.uuid4()}.mp3"
    try:
        tts = gTTS(text=clean_text, lang='en', slow=False)
        tts.save(filename)

        # A newer request may have arrived while the audio was being generated.
        if request_time != latest_request_time:
            return

        # Play audio (blocks until mpg321 exits or is killed by a newer request)
        subprocess.run(["mpg321", filename, "-q"], check=False)

    except Exception as e:
        print(f"Audio Error: {e}")

    finally:
        # Always remove the temporary file, including when synthesis or
        # playback failed part-way through.
        try:
            if os.path.exists(filename):
                os.remove(filename)
        except OSError as e:
            print(f"Audio Error: {e}")

In [None]:
def run_vision_task(request_time):
    """Capture one camera frame, describe it with the AI model and speak the result.

    The prompt sent to the model depends on the module-level ``current_mode``.
    The task gives up as soon as ``request_time`` stops matching the
    module-level ``latest_request_time``; this is checked at the start, after
    the camera capture, before the API call and before speaking.

    Args:
        request_time: The identifier of the request that triggered this task.

    Returns:
        str: The model's answer on success; otherwise one of ``"Cancelled"``,
        ``"Camera error."``, ``"Capture failed."`` or ``"API Error: ..."``.
    """
    global current_mode, latest_request_time

    # CHECK 1: Start
    if request_time != latest_request_time: return "Cancelled"

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened(): return "Camera error."

        # Read and discard two frames before the one that is kept
        # (presumably so the camera can settle - TODO confirm).
        for _ in range(2): cap.read()
        ret, frame = cap.read()
    finally:
        # Release the device even if a read raises, so that the next request
        # can open the camera again.
        cap.release()

    if not ret: return "Capture failed."

    # CHECK 2: After Camera
    if request_time != latest_request_time: return "Cancelled"

    # The frame is resized to 640x480 and converted from BGR to RGB before it
    # is wrapped in a PIL image for the model.
    frame_resized = cv2.resize(frame, (640, 480))
    rgb_frame = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)

    # --- PROMPTS ---
    if current_mode == "currency_detection":
        prompt = """
        Analyze the currency in this image.
        RULES:
        1. Identify the value of every note visible.
        2. If multiple notes, list them and calculate the Total Sum.
        3. OUTPUT FORMAT: Just the numbers and currency. No sentences.

        Example Single: "500 Rupees"
        Example Multiple: "500 Rupees, 100 Rupees. Total 600 Rupees"
        """

    elif current_mode == "text_read":
        prompt = """
        Read the text in the image. 
        RULES:
        1. Read exactly what is written.
        2. Do not say "The text says". Just read it.
        3. If it is a long paragraph, summarize it in 15 words max.
        """

    elif current_mode == "scene_summary":
        prompt = """
        Describe the scene for a blind person.
        RULES:
        1. Max 15 words.
        2. Focus on obstacles or main activity.
        3. Tell in one line, what is in front, right and left.
        4. Be direct. (e.g. "A messy desk with a laptop and coffee mug.")
        """

    elif current_mode == "face_recognition":
        prompt = """
        Identify people.
        RULES:
        1. Format: [Name], [Gender], [Approx Age], [Emotion],[NUmber of people].
        2. Example: "Male, 20s, Happy. Female, 40s, Neutral."
        3. No filler words.
        """

    elif current_mode == "multilingual":
        prompt = """
        Detect text.
        RULES:
        1. If English, read it.
        2. If not English, translate to English and read ONLY the translation.
        3. If it is a long paragraph, summarize it in 20 words max in English
        """

    else:
        # Default Object Prompt (also used for any unrecognized mode name)
        prompt = """Look at this image. Identify the main object. 
        I need precise details:
        1. The Brand Name (e.g., Tropicana, Apple)
        2. The Product Type (e.g., Orange Juice, iPhone 15)
        3. Any specific labels, flavors, or variants visible.
        4. NO full big sentences.

        Return the answer in a single, clear sentence like:
        'This is [Brand] [Product] [Variant]."""

    print(f"Analyzing in {current_mode} mode...")

    try:
        # CHECK 3: Before API Call (Expensive operation)
        if request_time != latest_request_time: return "Cancelled"

        response = model.generate_content([prompt, pil_image])
        # Take the answer from the response object (the original code read
        # ``result_text`` before assigning it, so every call failed), drop the
        # filler phrases and trim the whitespace they leave behind.
        result_text = (
            response.text
            .replace("The value is", "")
            .replace("The total is", "")
            .strip()
        )

        # CHECK 4: Before Speaking
        if request_time != latest_request_time: return "Cancelled"

        speak_text_google(result_text, request_time)
        return result_text

    except Exception as e:
        print(f"API Error: {e}")
        return f"API Error: {e}"

In [None]:
# --- ROUTES ---

@app.route('/scan', methods=['GET'])
def scan_trigger():
    """Handle ``GET /scan``: run one vision task in the current mode.

    Any audio that is still playing is stopped, the request is stamped with
    the current time and registered as the latest one, and the outcome of
    ``run_vision_task`` is returned as JSON under the ``message`` key.
    """
    global latest_request_time

    # Silence whatever the previous request was still saying.
    stop_current_audio()

    # A manual scan counts as a brand-new request with its own stamp.
    request_stamp = time.time()
    latest_request_time = request_stamp

    payload = {"status": "success", "message": run_vision_task(request_stamp)}
    return jsonify(payload)

@app.route('/mode', methods=['POST'])
def switch_mode():
    """Handle ``POST /mode``: switch the vision mode, announce it and run a scan.

    The JSON body may contain a ``"mode"`` string. A missing body, a body that
    is not a JSON object, or a missing ``"mode"`` key selects
    ``"object_detection"``.

    Returns:
        A JSON response with ``status``, ``mode`` and ``result`` on success;
        HTTP 400 when ``"mode"`` is present but is not a string; HTTP 500 with
        the error message when anything else fails.
    """
    global current_mode, latest_request_time

    # 1. STOP EVERYTHING OLD
    stop_current_audio() # Kill previous voice immediately

    # 2. GENERATE NEW ID
    this_req_time = time.time()
    latest_request_time = this_req_time

    try:
        # silent=True yields None instead of raising on a missing or malformed
        # JSON body, so such requests use the default mode rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        # Validate BEFORE touching the shared state, so a bad value can never
        # be left behind in ``current_mode``.
        requested_mode = data.get("mode", "object_detection")
        if not isinstance(requested_mode, str):
            return jsonify({"status": "error", "message": "'mode' must be a string"}), 400
        current_mode = requested_mode

        # 3. Speak Mode Name (Background Thread)
        # split()/join collapses the double space left behind when
        # "detection" is removed (e.g. "object  mode" -> "object mode").
        clean_name = " ".join(current_mode.replace("_", " ").replace("detection", "").split())
        threading.Thread(target=speak_text_google, args=(f"{clean_name} mode", this_req_time)).start()

        # 4. Run Vision Task (It will check 'this_req_time' constantly)
        scan_result = run_vision_task(this_req_time)

        return jsonify({"status": "success", "mode": current_mode, "result": scan_result})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500

if __name__ == "__main__":
    # The server listens on every interface (0.0.0.0). Flask's debug mode
    # enables an interactive debugger that can execute arbitrary code, so it
    # must not be on by default for a network-reachable server. Opt in
    # explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes")
    app.run(host='0.0.0.0', port=8000, debug=debug_enabled)

