In [None]:
!pip install groq

In [1]:
from groq import Groq
import base64
from IPython.display import Image
import os

In [2]:
# Read the Colab secret named 'groq' and expose it as the GROQ_API_KEY
# environment variable, which the Groq client calls in later cells read.
from google.colab import userdata
os.environ["GROQ_API_KEY"] = userdata.get('groq')

In [3]:
def encode_image(image_path):
    """Read a file from disk and return its contents as base64 text.

    Args:
        image_path: Path to the image file to encode.

    Returns:
        str: The file's raw bytes, base64-encoded and decoded to a UTF-8
        string (an empty string for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the handle is closed promptly, even if
    # read() fails, instead of lingering until garbage collection.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

In [4]:
query="What sort of eye disease is this?"

In [5]:
model="meta-llama/llama-4-scout-17b-16e-instruct"

In [6]:
def analyze_image_with_query(query, model, encoded_image):
    """Ask a Groq chat model a question about a single image.

    Args:
        query: Text prompt sent alongside the image.
        model: Model identifier passed to the chat completions call.
        encoded_image: Base64 text of the image. It is embedded in a data
            URL that is always labelled ``image/jpeg``.

    Returns:
        The message content of the first choice in the response.

    Raises:
        KeyError: If the GROQ_API_KEY environment variable is not set.
    """
    groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

    # One user turn carrying two parts: the question and the inline image.
    text_part = {"type": "text", "text": query}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    user_turn = {"role": "user", "content": [text_part, image_part]}

    completion = groq_client.chat.completions.create(
        messages=[user_turn],
        model=model,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
encoded_image = encode_image("/content/drive/MyDrive/eye_diseases/dataset/cataract/1144_left.jpg")

In [8]:
result = analyze_image_with_query(query, model, encoded_image)
print(result)

The image appears to show a fundus photograph of the retina, which is the inner lining at the back of the eye. There are several abnormalities visible in this image, including:

*   **Cotton wool spots:** These are small, white or grayish patches on the retina that are indicative of localized areas of retinal ischemia or nerve fiber layer infarcts. They appear as fluffy white spots.
*   **Flame-shaped hemorrhages:** These are small, irregularly shaped areas of bleeding in the retina.

Based on these findings, the eye disease depicted in the image could be **retinopathy**, possibly **diabetic retinopathy** or **hypertensive retinopathy**. However, without more information about the patient's medical history and other diagnostic test results, it is difficult to provide a definitive diagnosis.

A definitive diagnosis can only be made by a qualified medical professional, such as an ophthalmologist, after a comprehensive eye examination and review of the patient's medical history.


In [10]:
# Required for audio processing and transcription
!pip install -q pydub
!pip install -q speechrecognition
!pip install -q groq
!apt-get install -y ffmpeg  # Enables audio decoding for many formats

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [9]:
from google.colab import files

# Open Colab's upload prompt; the result is keyed by the filename each
# upload was saved under.
uploaded = files.upload()
# Only the first uploaded file is used.
# NOTE(review): indexing [0] raises IndexError if `uploaded` is empty
# (e.g. the upload was cancelled) — confirm whether a guard is wanted.
audio_filepath = list(uploaded.keys())[0]
print(f"Uploaded audio file: {audio_filepath}")

Saving chatbot_testing.mp3 to chatbot_testing (2).mp3
Uploaded audio file: chatbot_testing (2).mp3


In [10]:
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
stt_model = "whisper-large-v3"

In [11]:
def transcribe_with_groq(stt_model, audio_filepath, GROQ_API_KEY):
    """Transcribe an audio file to text through Groq's transcription API.

    Args:
        stt_model: Speech-to-text model identifier to request.
        audio_filepath: Path of the audio file to send; opened in binary mode.
        GROQ_API_KEY: API key used to construct the Groq client.

    Returns:
        The ``text`` attribute of the transcription response.

    Note:
        The request language is fixed to ``"en"``.
    """
    groq_client = Groq(api_key=GROQ_API_KEY)

    # Keep the file open only for the duration of the request.
    with open(audio_filepath, "rb") as audio_stream:
        result = groq_client.audio.transcriptions.create(
            file=audio_stream,
            language="en",
            model=stt_model,
        )

    return result.text

In [12]:
transcribed_text = transcribe_with_groq(stt_model, audio_filepath, GROQ_API_KEY)
print("📝 Transcribed Text:", transcribed_text)

📝 Transcribed Text:  Hello Doctor, what's wrong with my hand?


In [18]:
!pip install -q gTTS
!pip install -q elevenlabs

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m754.8/754.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
from gtts import gTTS
from IPython.display import Audio

def text_to_speech_with_gtts(input_text, output_filepath="gTTS_output.mp3"):
    """Synthesize English speech with gTTS and save it to a file.

    Args:
        input_text: Text to speak.
        output_filepath: Where the audio is written (default
            ``"gTTS_output.mp3"``).

    Returns:
        An IPython ``Audio`` object built from ``output_filepath``.
    """
    # Build the English, normal-speed utterance and write it straight to disk.
    gTTS(text=input_text, lang='en', slow=False).save(output_filepath)
    print(f"🔊 gTTS audio saved to {output_filepath}")

    player = Audio(output_filepath)
    return player

In [14]:
input_text = "Hi! I'm Saihaj, a future Data Scientist speaking."

# Using gTTS
audio_gtts = text_to_speech_with_gtts(input_text)
display(audio_gtts)

🔊 gTTS audio saved to gTTS_output.mp3


In [15]:
import os
from elevenlabs.client import ElevenLabs
import elevenlabs

# The ElevenLabs key is looked up in the environment once, when this cell runs;
# it is None if ELEVEN_API_KEY is not set at that moment.
ELEVENLABS_API_KEY = os.environ.get("ELEVEN_API_KEY")

def text_to_speech_with_elevenlabs(input_text, output_filepath="elevenlabs_output.mp3"):
    """Synthesize speech with ElevenLabs and save it to a file.

    Args:
        input_text: Text to speak.
        output_filepath: Where the audio is written (default
            ``"elevenlabs_output.mp3"``).

    Returns:
        An IPython ``Audio`` object built from ``output_filepath``.

    Raises:
        ValueError: If the module-level ``ELEVENLABS_API_KEY`` is None.
    """
    # Fail early with a clear message rather than inside the SDK call.
    if ELEVENLABS_API_KEY is None:
        raise ValueError("Please set ELEVEN_API_KEY in your environment.")

    speech_stream = ElevenLabs(api_key=ELEVENLABS_API_KEY).generate(
        model="eleven_turbo_v2",
        output_format="mp3_22050_32",
        voice="Aria",
        text=input_text,
    )

    elevenlabs.save(speech_stream, output_filepath)
    print(f"🔊 ElevenLabs audio saved to {output_filepath}")

    player = Audio(output_filepath)
    return player

In [16]:
# Using ElevenLabs
# audio_eleven = text_to_speech_with_elevenlabs(input_text)
# display(audio_eleven)

In [None]:
!pip uninstall -y gradio click
!pip install gradio==4.26.0 click==8.1.7

In [17]:
import gradio as gr
import os
from IPython.display import Audio

In [18]:
# Instruction text that process_inputs prepends to the transcribed patient
# question before sending it, together with the image, to the vision model.
# NOTE(review): the prompt asks for both "one long paragraph" and
# "max 2 sentences" — confirm which response length is intended.
system_prompt="""You have to act as a professional doctor, i know you are not but this is for learning purpose.
            What's in this image?. Do you find anything wrong with it medically?
            If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
            your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
            Donot say 'In the image I see' but say 'With what I see, I think you have ....'
            Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
            Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""

In [26]:
# # 🧩 The main function connecting everything
# def process_inputs(audio_filepath, image_filepath):
#     # Step 1: Transcribe the audio
#     transcribed_text = transcribe_with_groq(
#         stt_model="whisper-large-v3",
#         audio_filepath=audio_filepath,
#         GROQ_API_KEY=os.environ.get("GROQ_API_KEY")
#     )

#     # Step 2: Generate doctor-style response using Groq Vision LLM
#     if image_filepath:
#         query = system_prompt + " " + transcribed_text
#         encoded_img = encode_image(image_filepath)
#         doctor_response = analyze_image_with_query(
#             query=query,
#             model="meta-llama/llama-4-scout-17b-16e-instruct",
#             encoded_image=encoded_img
#         )
#     else:
#         doctor_response = "No image provided for me to analyze."

#     # Step 3: Convert doctor response to speech using gTTS
#     audio_output_path = "final_gtts.mp3"
#     tts_audio = text_to_speech_with_gtts(
#         input_text=doctor_response,
#         output_filepath=audio_output_path
#     )

#     return transcribed_text, doctor_response, tts_audio



In [19]:
def process_inputs(audio_filepath, image_filepath):
    """Run the voice + image pipeline behind the Gradio interface.

    The patient's audio is transcribed with Groq Whisper; the transcript,
    prefixed with ``system_prompt``, is sent with the image to the vision
    model; the reply is then synthesized to ``final_response.mp3`` with gTTS.

    Each stage is guarded separately so a late failure does not discard the
    results of earlier stages or get reported as a transcription error.

    Args:
        audio_filepath: Path to the recorded/uploaded audio, or None/empty
            when the user supplied nothing.
        image_filepath: Path to the uploaded image, or None/empty when the
            user supplied nothing.

    Returns:
        tuple: ``(transcript, doctor_response, voice_path)``. ``voice_path``
        is None whenever no audio reply could be produced; the two text
        fields then carry whatever was obtained plus a short error message.
    """
    # Without audio there is nothing to transcribe; say so explicitly instead
    # of letting open(None) fail and surface as a transcription error.
    if not audio_filepath:
        return (
            "No audio provided",
            "Please record or upload audio so I can hear your question",
            None,
        )

    # Step 1: Transcribe speech
    try:
        speech_to_text_output = transcribe_with_groq(
            GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
            audio_filepath=audio_filepath,
            stt_model="whisper-large-v3"
        )
    except Exception as e:
        print("❌ Error in process_inputs (transcription):", e)
        return "Error in transcription", "Skipped because transcription failed", None

    # Step 2: Handle image + query to LLaMA Vision
    try:
        if image_filepath:
            doctor_response = analyze_image_with_query(
                query=system_prompt + " " + speech_to_text_output,
                encoded_image=encode_image(image_filepath),
                model="meta-llama/llama-4-scout-17b-16e-instruct"
            )
        else:
            doctor_response = "No image provided for me to analyze"
    except Exception as e:
        print("❌ Error in process_inputs (image analysis):", e)
        # Keep the transcript: that stage succeeded.
        return speech_to_text_output, "Error in image analysis", None

    # Step 3: Convert doctor's text to speech using gTTS
    voice_output_path = "final_response.mp3"
    try:
        text_to_speech_with_gtts(
            input_text=doctor_response,
            output_filepath=voice_output_path
        )
    except Exception as e:
        print("❌ Error in process_inputs (text to speech):", e)
        # The text reply is still valid; only the audio is unavailable.
        return speech_to_text_output, doctor_response, None

    return speech_to_text_output, doctor_response, voice_output_path


In [25]:
# 🎛️ Launch Gradio interface
# Wires process_inputs to a web UI: two inputs (audio, image) map to its two
# parameters, and the three outputs map to the three values it returns.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        # Both components hand process_inputs a path on disk (type="filepath").
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎤 Speak or Upload Audio"),
        gr.Image(type="filepath", label="🖼️ Upload image of condition")
    ],
    outputs=[
        gr.Textbox(label="📝 Transcribed Patient Voice"),
        gr.Textbox(label="👨‍⚕️ Doctor's Response"),
        # Receives the mp3 path returned by process_inputs (None on failure).
        gr.Audio(label="🔊 Doctor's Voice Reply")
    ],
    title="🩺 AI Doctor Chatbot !!!!",
    description="Speak your symptoms and upload a medical image. The AI Doctor will analyze and respond with voice.",
)

# debug=False: exceptions raised inside process_inputs are not shown in the
# notebook output (the Colab log above says to set debug=True to see them).
iface.launch(debug=False)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://326dfbdc8ea4d2b01d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [33]:
from google.colab import files
from groq import Groq
import os

# Standalone transcription check: makes the same Whisper request that
# transcribe_with_groq wraps, run directly on a freshly uploaded file.
# Note that this cell rebinds the notebook-level names `uploaded`, `client`
# and `transcription`.

# 1. Upload the audio file
uploaded = files.upload()
audio_path = list(uploaded.keys())[0]  # Get uploaded filename

# 2. Initialize Groq client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# 3. Transcribe using Whisper from Groq
with open(audio_path, "rb") as f:
    transcription = client.audio.transcriptions.create(
        model="whisper-large-v3",
        file=f,
        language="en"
    )

print(" Transcription result:", transcription.text)


Saving chatbot_testing.mp3 to chatbot_testing (1).mp3
 Transcription result:  Hello Doctor, what's wrong with my hand?


In [None]:
# NOTE(review): -U upgrades gradio to the latest release, which overrides the
# gradio==4.26.0 pin installed earlier in this notebook — confirm which
# version the interface cell is meant to run against.
!pip install -U gradio