In [None]:
# ==========================================================
# Visual → Text → Google Voice (Jetson CSI-only)
# ==========================================================
# - Captures one JPEG from CSI (nvarguscamerasrc)
# - Describes with OpenAI Vision
# - Speaks with Google Cloud Text-to-Speech (MP3)
# - Plays audio via the notebook audio widget; falls back to mpg123/ffplay/play/paplay/aplay
# ==========================================================


import os, time, base64, subprocess, shutil, glob
from datetime import datetime
# Paste a real key into OPENAI_API_KEY_INLINE for quick experiments, or leave
# the placeholder untouched and export OPENAI_API_KEY before starting the
# kernel. The placeholder itself is never written to the environment, so it
# cannot overwrite a real key or hide the fact that no key was configured.
OPENAI_API_KEY_INLINE = "Place your Key"
if OPENAI_API_KEY_INLINE != "Place your Key":
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY_INLINE
# Default service-account path; an already exported value takes precedence.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "/home/tin/gcloud/tts-sa.json")
import shlex  # ✅ Add this import near the top
# ---------- Dependencies ----------
# If needed, install Google TTS client (uncomment):
# !pip install --quiet google-cloud-texttospeech

# Import the third-party packages this cell depends on. Any failure is
# re-raised as a RuntimeError carrying an install hint; `from e` chains the
# original exception so the underlying cause stays visible in the traceback.
try:
    import cv2
except Exception as e:
    raise RuntimeError("OpenCV is required. Install python3-opencv.") from e

try:
    from openai import OpenAI
except Exception as e:
    raise RuntimeError("Install openai: pip install --quiet openai") from e

try:
    # Aliased to `gtts`; this is the Google Cloud client library module.
    from google.cloud import texttospeech as gtts
except Exception as e:
    raise RuntimeError("Install google-cloud-texttospeech and set GOOGLE_APPLICATION_CREDENTIALS.") from e

from IPython.display import Audio, display

# ---------- API Keys / Auth ----------
# Stop early when either credential variable is missing or empty.
if os.environ.get("OPENAI_API_KEY", "") == "":
    raise RuntimeError("OPENAI_API_KEY not set. Do:  import os; os.environ['OPENAI_API_KEY']='sk-proj-…'")
if os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "") == "":
    raise RuntimeError("GOOGLE_APPLICATION_CREDENTIALS not set. Point it to your Google JSON key file.")

# Built with no explicit arguments; uses env key for OpenAI.
openai_client = OpenAI()

# ---------- Vision / TTS Config ----------
# Values sent with the OpenAI chat-completions request in describe_image_bgr().
MODEL        = "gpt-4o-mini"
PROMPT       = "Describe the scene succinctly. Note people, objects, text, and any safety issues."
TEMPERATURE  = 0.2

# Values sent to Google Cloud Text-to-Speech in synth_gcloud_tts().
VOICE_LANG   = "en-US"
VOICE_NAME   = "en-US-Standard-F"   # try Neural2/Standard/WaveNet variants, e.g., en-US-Neural2-F
AUDIO_FMT    = gtts.AudioEncoding.MP3
SPEAKING_RATE= 1.0
PITCH        = 0.0
VOLUME_GAIN  = 0.0   # forwarded as volume_gain_db

# ---------- CSI Capture Preferences ----------
# csi_snapshot() walks sensor -> flip -> mode in the order listed here and
# stops at the first combination that produces a non-empty JPEG.
SENSOR_IDS   = [0, 1]     # try both sockets
# Originally annotated "180° then none".
# NOTE(review): nvvidconv documents flip-method=6 as vertical-flip and 2 as rotate-180 — confirm which is intended.
FLIP_METHODS = [6, 0]
MODES = [
    {"w":1920, "h":1080, "fps":"30/1"},
    {"w":1920, "h":1080, "fps":"15/1"},
    {"w":1280, "h":720,  "fps":"30/1"},
]
JPEG_QUALITY = 85     # used for the nvjpegenc capture and for OpenCV JPEG encoding
MAX_SIDE     = 1024   # longest image side, in pixels, after _resize_longest()
GST_TIMEOUT  = 20     # seconds allowed for each gst-launch attempt
QUIET        = True   # True -> gst-launch "-q", False -> "-v"

# ---------- Helpers ----------
def _run(cmd, timeout=GST_TIMEOUT):
    """Run *cmd* via ``bash -lc`` and return ``(returncode, stdout, stderr)``.

    Both streams are captured as text and stripped of surrounding whitespace.
    ``subprocess.TimeoutExpired`` propagates when *timeout* seconds elapse.
    """
    completed = subprocess.run(
        ["bash", "-lc", cmd],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=timeout,
    )
    captured_out = completed.stdout.strip()
    captured_err = completed.stderr.strip()
    return completed.returncode, captured_out, captured_err

def _has_gst():
    """Return True when the ``gst-launch-1.0`` executable is found on PATH."""
    located = shutil.which("gst-launch-1.0")
    return located is not None

def _resize_longest(img_bgr, max_side=MAX_SIDE):
    """Downscale *img_bgr* so that its longest side is at most *max_side*.

    Args:
        img_bgr: Image array whose first two ``shape`` entries are
            (height, width).
        max_side: Upper bound, in pixels, for the longer of height and width.

    Returns:
        The input object itself when it already fits; otherwise a new array
        produced by ``cv2.resize`` with the aspect ratio preserved.
    """
    h, w = img_bgr.shape[:2]
    longest = max(h, w)
    if longest <= max_side:
        return img_bgr
    scale = max_side / float(longest)
    # Clamp to 1 px: truncation can push the short side of a very elongated
    # image down to 0, and cv2.resize rejects a zero-sized target.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    return cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)

def _discard_partial(path):
    """Delete a leftover capture file from a failed attempt, if one exists."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: a missing or undeletable file must not mask
        # the capture error that is being collected by the caller.
        pass


def csi_snapshot():
    """Capture one JPEG frame from a CSI camera through ``gst-launch-1.0``.

    Every combination of SENSOR_IDS, FLIP_METHODS and MODES is tried in that
    nesting order (sensor outermost, mode innermost). The first attempt that
    exits with status 0 and leaves a non-empty file wins. Files left behind by
    failed or timed-out attempts are removed so /tmp does not fill up.

    Returns:
        A ``(path, description)`` tuple: the JPEG path under /tmp and a short
        string naming the sensor id, mode and flip method that worked.

    Raises:
        RuntimeError: If gst-launch-1.0 is not on PATH, or if every attempt
            failed (the message lists each attempt and its error).
    """
    if not _has_gst():
        raise RuntimeError("gst-launch-1.0 not found in PATH.")
    qflag = "-q" if QUIET else "-v"  # loop-invariant, so computed once
    tried = []
    for sid in SENSOR_IDS:
        for flip in FLIP_METHODS:
            for m in MODES:
                out_path = f"/tmp/csi_{sid}_{m['w']}x{m['h']}_{m['fps'].replace('/','-')}_f{flip}_{int(time.time())}.jpg"
                cmd = (
                    f"gst-launch-1.0 {qflag} "
                    f"nvarguscamerasrc sensor-id={sid} num-buffers=1 ! "
                    f"'video/x-raw(memory:NVMM),width={m['w']},height={m['h']},framerate={m['fps']}' ! "
                    f"nvvidconv flip-method={flip} ! "
                    f"nvjpegenc quality={JPEG_QUALITY} ! "
                    f"filesink location={out_path} sync=false -e"
                )
                try:
                    rc, out, err = _run(cmd, timeout=GST_TIMEOUT)
                except subprocess.TimeoutExpired:
                    tried.append(f"sid={sid}, {m['w']}x{m['h']}@{m['fps']}, flip={flip}: TIMEOUT({GST_TIMEOUT}s)")
                    _discard_partial(out_path)
                    continue
                ok = (rc == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > 0)
                if ok:
                    return out_path, f"sensor-id={sid}, {m['w']}x{m['h']}@{m['fps']}, flip={flip}"
                tried.append(f"sid={sid}, {m['w']}x{m['h']}, {m['fps']}, flip={flip}: rc={rc}, err='{(err or out)[:200]}'")
                _discard_partial(out_path)
    raise RuntimeError("All CSI attempts failed:\n- " + "\n- ".join(tried))

def describe_image_bgr(img_bgr):
    """Ask the OpenAI vision model to describe an image and return the text.

    The image is downscaled with ``_resize_longest``, JPEG-encoded at
    JPEG_QUALITY, base64-embedded in a ``data:`` URL and sent together with
    PROMPT to MODEL at TEMPERATURE.

    Args:
        img_bgr: Image array accepted by ``cv2.imencode``.
            NOTE(review): the name suggests BGR channel order — confirm.

    Returns:
        The model's reply with surrounding whitespace stripped.

    Raises:
        RuntimeError: If JPEG encoding fails, or if the response carries no
            choices or no text content.
    """
    img_bgr = _resize_longest(img_bgr, MAX_SIDE)
    ok, buf = cv2.imencode(".jpg", img_bgr, [int(cv2.IMWRITE_JPEG_QUALITY), JPEG_QUALITY])
    if not ok:
        raise RuntimeError("JPEG encode failed.")
    b64 = base64.b64encode(buf.tobytes()).decode("utf-8")

    # image_url must be an object with {"url": "..."}
    resp = openai_client.chat.completions.create(
        model=MODEL,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": PROMPT},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }],
        temperature=TEMPERATURE,
    )
    if not resp.choices:
        raise RuntimeError("OpenAI returned no choices.")
    # message.content may be None; fail with a clear error instead of an
    # AttributeError from calling .strip() on None.
    content = resp.choices[0].message.content
    description = content.strip() if content else ""
    if not description:
        raise RuntimeError("OpenAI returned an empty description.")
    return description

def synth_gcloud_tts(text, out_mp3="speech_output.mp3"):
    """Synthesize *text* with Google Cloud Text-to-Speech and save the audio.

    Voice selection uses VOICE_LANG and VOICE_NAME; the audio settings use
    AUDIO_FMT, SPEAKING_RATE, PITCH and VOLUME_GAIN.

    Args:
        text: The text to speak.
        out_mp3: Destination path for the returned audio bytes.

    Returns:
        *out_mp3*, after the audio content has been written to it.
    """
    tts_client = gtts.TextToSpeechClient()

    # Assemble the three request parts in one place, then send them together.
    request_parts = {
        "input": gtts.SynthesisInput(text=text),
        # Set name to None to let Google pick the best voice for the language.
        "voice": gtts.VoiceSelectionParams(
            language_code=VOICE_LANG,
            name=VOICE_NAME,
        ),
        "audio_config": gtts.AudioConfig(
            audio_encoding=AUDIO_FMT,
            speaking_rate=SPEAKING_RATE,
            pitch=PITCH,
            volume_gain_db=VOLUME_GAIN,
        ),
    }
    result = tts_client.synthesize_speech(**request_parts)

    with open(out_mp3, "wb") as sink:
        sink.write(result.audio_content)
    return out_mp3

from IPython.display import Audio, display

def try_play(filepath):
    """Prefer the Jupyter audio widget; fall back to CLI players if available."""
    # 1) Jupyter widget (non-blocking, no timeout)
    try:
        display(Audio(filename=filepath, autoplay=True))
        return True, "Displayed notebook audio widget."
    except Exception:
        pass

    # 2) CLI fallbacks without a small timeout (let the player finish)
    import shlex, shutil, subprocess

    players = [
        ("mpg123", f"mpg123 -q {shlex.quote(filepath)}"),
        ("ffplay", f"ffplay -autoexit -nodisp -loglevel error {shlex.quote(filepath)}"),
        ("play",   f"play -q {shlex.quote(filepath)}"),  # sox
        ("paplay", f"paplay {shlex.quote(filepath)}"),
        ("aplay",  f"aplay {shlex.quote(filepath)}"),    # works natively for WAV, may need plugin for MP3
    ]
    for exe, cmd in players:
        if shutil.which(exe):
            # Run without timeout so it can finish naturally
            rc = subprocess.call(["bash","-lc",cmd])
            if rc == 0:
                return True, f"Played with {exe}"
    return False, "No audio player found and widget failed."


# ---------- Main ----------
# Pipeline: capture a CSI frame -> describe it with OpenAI -> synthesize the
# description with Google TTS -> play the resulting MP3.
print("=== CSI → OpenAI → Google TTS ===")
print("gst-launch-1.0:", "OK" if _has_gst() else "NOT FOUND")

img_path, meta = csi_snapshot()
print(f"CSI snapshot OK → {img_path}  ({meta})")

img = cv2.imread(img_path)
if img is None:
    raise RuntimeError(f"OpenCV failed to read: {img_path}")

# Save a copy next to notebook, too
save_copy = f"capture_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jpg"
# cv2.imwrite reports failure through its return value, so only claim success
# when the file was actually written. A failed copy is not fatal.
if cv2.imwrite(save_copy, img, [int(cv2.IMWRITE_JPEG_QUALITY), JPEG_QUALITY]):
    print("Saved copy:", save_copy)
else:
    print("WARNING: could not save copy:", save_copy)

print("Describing scene with OpenAI…")
text = describe_image_bgr(img)
print("\n--- Scene Description ---\n", text)

print("\nSynthesizing with Google Cloud TTS…")
mp3_path = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
mp3_path = synth_gcloud_tts(text, out_mp3=mp3_path)
print("Audio saved:", mp3_path)

# msg already states whether playback succeeded and through which path.
ok, msg = try_play(mp3_path)
print("Playback:", msg)


=== CSI → OpenAI → Google TTS ===
gst-launch-1.0: OK
CSI snapshot OK → /tmp/csi_0_1920x1080_30-1_f6_1762338083.jpg  (sensor-id=0, 1920x1080@30/1, flip=6)
Saved copy: capture_20251105_052125.jpg
Describing scene with OpenAI…

--- Scene Description ---
 The scene depicts a cluttered room with wooden flooring. There are no visible people. Objects include a large black bag, several cardboard boxes, a trash can, and a rolled-up item on the floor. In the background, there are office chairs, a desk, and storage units. The lighting is dim, creating a somewhat disorganized atmosphere. Safety issues may include tripping hazards from the scattered items on the floor.

Synthesizing with Google Cloud TTS…
Audio saved: speech_20251105_052128.mp3


Playback: Displayed notebook audio widget.
