# In [None]:
import pytesseract
import speech_recognition as sr
import pyttsx3
import language_tool_python
from deep_translator import GoogleTranslator
from langdetect import detect
from PIL import Image
import os
import logging
import cv2
import numpy as np
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import cohere
# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize Cohere AI.
# SECURITY: the API key is read from the COHERE_API_KEY environment variable
# instead of being stored in this file. A key that was ever committed in
# source form should be treated as leaked and revoked.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "")
try:
    if not COHERE_API_KEY:
        raise ValueError("the COHERE_API_KEY environment variable is not set")
    co = cohere.Client(COHERE_API_KEY)
    logging.info("Cohere client initialized.")
except Exception as e:
    # Log and degrade rather than crash at import time. With `co` set to None,
    # generate_ai_response() hits its own error handler and returns its
    # fallback message.
    logging.error(f"Error initializing Cohere client: {e}")
    co = None

# Choose where the captioning model runs: the GPU when CUDA is usable, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
logging.info(f"Using device: {device}")

# Load the BLIP processor and captioning model from the same checkpoint.
# If either step fails, both names are set to None so callers can detect
# that captioning is unavailable.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
try:
    blip_processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
    blip_model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
    blip_model = blip_model.to(device)
    logging.info("BLIP model loaded successfully.")
except Exception as load_error:
    logging.error(f"Error loading BLIP model: {load_error}")
    blip_processor = blip_model = None

# Grammar checker (LanguageTool, "en-US"). `tool` is None when it cannot be created.
try:
    tool = language_tool_python.LanguageTool("en-US")
except Exception as init_error:
    logging.error(f"Error initializing LanguageTool: {init_error}")
    tool = None
else:
    logging.info("Grammar checker initialized.")

# Module-level speech state flag; starts out False.
# NOTE(review): none of the functions shown in this file read or write it.
is_speaking = False

# ✅ Multilingual Text-to-Speech (TTS)
def speak(text, lang="en"):
    """Speak ``text`` aloud through a pyttsx3 engine.

    Unless ``lang`` is "en", "ta" or "te", the text is first translated to
    ``lang`` with GoogleTranslator (source language auto-detected). For "ta"
    and "te" the engine's ``voice`` property is set to "tam" / "tel" before
    speaking.

    Args:
        text: The text to speak.
        lang: Language code controlling translation and voice selection.

    Returns:
        None. Any exception is logged and swallowed.
    """
    untranslated_langs = ("en", "ta", "te")
    voice_by_lang = (("ta", "tam"), ("te", "tel"))
    try:
        # Obtain an engine for this call and apply the fixed rate and volume.
        tts = pyttsx3.init()
        for prop, value in (("rate", 150), ("volume", 1.0)):
            tts.setProperty(prop, value)

        if lang not in untranslated_langs:
            translator = GoogleTranslator(source="auto", target=lang)
            text = translator.translate(text)

        # Request a language-specific voice for Tamil / Telugu (if available).
        for code, voice in voice_by_lang:
            if lang == code:
                tts.setProperty("voice", voice)
                break

        tts.say(text)
        tts.runAndWait()
        tts.stop()  # Stop the engine once the utterance has been spoken
    except Exception as err:
        logging.error(f"Speech error: {err}")

# ✅ Multilingual Speech Recognition
def listen():
    """Record one utterance from the microphone and transcribe it.

    The recognizer is calibrated against ambient noise for one second, then
    waits up to five seconds for speech to start. The audio is transcribed
    with ``recognize_google`` using the language code "en-IN", and the
    language of the transcript is guessed with ``detect``.

    Returns:
        ``(text, language)`` with the transcript lower-cased on success, or
        ``(None, "en")`` when recognition fails for any reason.
    """
    no_result = (None, "en")

    rec = sr.Recognizer()
    rec.energy_threshold = 300

    with sr.Microphone() as mic:
        print("Listening...")
        rec.adjust_for_ambient_noise(mic, duration=1)

        try:
            captured = rec.listen(mic, timeout=5)
            transcript = rec.recognize_google(captured, language="en-IN")
            language = detect(transcript)
            print(f"User said ({language}): {transcript}")
            return transcript.lower(), language
        except sr.UnknownValueError:
            print("I couldn't understand that.")
        except sr.RequestError:
            print("Speech recognition service is unavailable.")
        except Exception as err:
            logging.error(f"Error in voice recognition: {err}")

        # Every handled failure ends up here.
        return no_result

# ✅ AI-Powered Response Generation
def generate_ai_response(prompt, lang="en"):
    """Ask the Cohere "command" model to answer ``prompt``.

    The first generation (at most 100 tokens) is stripped of surrounding
    whitespace. When ``lang`` is "ta" or "te" the reply is translated to that
    language with GoogleTranslator before being returned.

    Args:
        prompt: Text sent to the model.
        lang: Language code of the desired reply.

    Returns:
        The reply text, or a fixed apology string if anything raises.
    """
    fallback = "I'm sorry, I couldn't generate a response."
    try:
        result = co.generate(model="command", prompt=prompt, max_tokens=100)
        reply = result.generations[0].text.strip()

        # Only Tamil and Telugu replies are translated here.
        if lang == "ta" or lang == "te":
            translator = GoogleTranslator(source="auto", target=lang)
            reply = translator.translate(reply)

        return reply
    except Exception as err:
        logging.error(f"Error generating AI response: {err}")
        return fallback

# ✅ OCR for Text Extraction
def extract_text(image_path):
    """Extract text from an image using OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``(text, language)`` tuple. ``text`` is the stripped OCR output, or
        one of the messages "Image not found.", "No text detected." or
        "Failed to extract text.". ``language`` is the code reported by
        ``detect`` for the OCR output, or "en" when there is no text, when
        language detection fails, or when OCR itself fails.
    """
    try:
        if not os.path.exists(image_path):
            return "Image not found.", "en"
        # The context manager closes the underlying file once OCR is done.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image).strip()
    except Exception as e:
        logging.error(f"Error extracting text: {e}")
        return "Failed to extract text.", "en"

    # Language detection is best-effort: it can raise on text it cannot
    # classify (e.g. digits or symbols only). That must not discard text
    # that OCR extracted successfully, so fall back to "en" instead.
    detected_language = "en"
    if text:
        try:
            detected_language = detect(text)
        except Exception as e:
            logging.warning(f"Could not detect language of extracted text: {e}")

    logging.info(f"Extracted text ({detected_language}): {text}")
    return (text if text else "No text detected."), detected_language
# ✅ AI Image Captioning
def generate_detailed_caption(image_path):
    """Generate a caption for an image with the BLIP captioning model.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        A ``(caption, language)`` tuple. ``caption`` is the decoded model
        output, or one of the messages "Image captioning model is
        unavailable.", "Image not found." or "Failed to generate detailed
        image caption.". ``language`` is the code reported by ``detect`` for
        the caption, or "en" when detection fails or no caption was produced.
    """
    if not blip_processor or not blip_model:
        return "Image captioning model is unavailable.", "en"

    try:
        if not os.path.exists(image_path):
            return "Image not found.", "en"
        # convert() returns a separate RGB copy, so the file can be closed
        # as soon as the copy has been made.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        inputs = blip_processor(image, return_tensors="pt").to(device)
        caption = blip_model.generate(**inputs)
        caption_text = blip_processor.batch_decode(caption, skip_special_tokens=True)[0]
    except Exception as e:
        logging.error(f"Error generating detailed caption: {e}")
        return "Failed to generate detailed image caption.", "en"

    # Language detection is best-effort; a failure here must not throw away
    # a caption that was generated successfully.
    try:
        detected_language = detect(caption_text)
    except Exception as e:
        logging.warning(f"Could not detect language of caption: {e}")
        detected_language = "en"

    logging.info(f"Generated caption ({detected_language}): {caption_text}")
    return caption_text, detected_language

# ✅ Chatbot Logic
def chatbot(image_path=None):
    """Run a voice conversation loop, optionally grounded in an image.

    When ``image_path`` is given, the image is captioned and OCR'd once up
    front and both results are printed. Each loop iteration then listens for
    one utterance, sends it to the AI model (prefixed with the image caption
    when one is available), and prints and speaks the reply. Saying "exit"
    ends the loop.

    Args:
        image_path: Optional path of an image to analyse before chatting.
    """
    image_context = {
        "caption": None,
        "caption_lang": "en",
        "extracted_text": None,
        "text_lang": "en"
    }

    if image_path:
        print("\n🔍 Processing Image...")
        image_context["caption"], image_context["caption_lang"] = generate_detailed_caption(image_path)
        image_context["extracted_text"], image_context["text_lang"] = extract_text(image_path)
        print(f"\n🖼 Caption: {image_context['caption']} ({image_context['caption_lang']})")
        print(f"\n📜 Extracted Text: {image_context['extracted_text']} ({image_context['text_lang']})")

    while True:
        user_input, user_lang = listen()

        # listen() returns None when nothing usable was recognized (it has
        # already reported why). Skip the turn instead of sending "None" to
        # the model and speaking a meaningless reply.
        if not user_input:
            continue

        if user_input == "exit":
            print("Goodbye!")
            speak("Goodbye!", user_lang)
            break

        # Ground the question in the image caption when one is available.
        if image_context["caption"]:
            prompt = f"Based on the image description: {image_context['caption']}, answer: {user_input}"
        else:
            prompt = user_input

        response = generate_ai_response(prompt, user_lang)
        print(f"\n🤖 AI ({user_lang}): {response}")
        speak(response, user_lang)

if __name__ == "__main__":
    # Ask for an optional image; a blank answer starts the chat without image analysis.
    image_path = input("Enter the image path (or leave empty to skip image analysis): ")
    image_path = image_path.strip()
    if image_path:
        chatbot(image_path)
    else:
        chatbot(None)