# In [1]:
#!/usr/bin/env python3
"""
Enhanced Groq + Gemini Chatbot with Modern UI using Gradio
Supports PDF, Word, Text, CSV, JSON, and Image files (Arabic OCR + scanned PDFs)
Author: AI Assistant
Version: 6.2
"""

import gradio as gr 
import requests
import json
import os
import csv
from pathlib import Path
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import pytesseract
# Tesseract location: an explicit TESSERACT_CMD environment override wins,
# otherwise the original Windows install path is used. The path is applied only
# when the file exists, so on other hosts pytesseract keeps its own default
# command lookup instead of pointing at a missing executable.
_TESSERACT_CMD = os.environ.get("TESSERACT_CMD") or r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
if os.path.isfile(_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CMD
from PIL import Image

# Optional dependencies: each flag records whether the matching import
# succeeded, so readers can degrade gracefully instead of failing at import.
try:
    import PyPDF2
except ImportError:
    PDF_SUPPORT = False
else:
    PDF_SUPPORT = True

try:
    import docx
except ImportError:
    DOCX_SUPPORT = False
else:
    DOCX_SUPPORT = True

try:
    from pdf2image import convert_from_path
except ImportError:
    PDF2IMG_SUPPORT = False
else:
    PDF2IMG_SUPPORT = True

# Model ids offered in the UI dropdown. Despite the name, the list mixes two
# providers: call_groq_api routes any id containing "gemini" to the Google
# endpoint and everything else to the Groq endpoint.
_GROQ_ENDPOINT_MODELS = (
    "llama-3.1-8b-instant",
    "llama-3.1-70b-versatile",
    "llama-3.3-70b-versatile",
    "mixtral-8x7b-32768",
    "gemma2-9b-it",
    "llama3-groq-70b-8192-tool-use-preview",
    "llama3-groq-8b-8192-tool-use-preview",
)
_GEMINI_ENDPOINT_MODELS = ("gemini-pro",)

GROQ_MODELS = [*_GROQ_ENDPOINT_MODELS, *_GEMINI_ENDPOINT_MODELS]

# Module-level state describing the currently loaded document. It starts empty
# and is updated whenever a file is uploaded or cleared in the UI.
uploaded_file_content = uploaded_file_name = file_type = ""
retriever = None  # vector store for the loaded document, or None when absent

# Stylesheet handed to gr.Blocks(css=...).
custom_css = "..."  # Your existing CSS here

class FileProcessor:
    """Stateless collection of file readers and Arabic OCR helpers.

    Every method is a ``@staticmethod``, so the class can be used with or
    without instantiation. Readers return the extracted text as ``str``; the
    PDF, DOCX and image readers report some failures by *returning* a
    human-readable message instead of raising.
    """

    # Poppler location used by the original Windows setup. It is only handed
    # to pdf2image when the directory exists (see ``_poppler_path``).
    _DEFAULT_POPPLER_PATH = r"C:\\Release-24.08.0-0 (1)\\poppler-24.08.0\\Library\\bin"

    @staticmethod
    def _poppler_path():
        """Return the Poppler ``bin`` directory for pdf2image, or ``None``.

        The ``POPPLER_PATH`` environment variable takes precedence; otherwise
        the original hard-coded directory is used when it exists. ``None``
        lets pdf2image look Poppler up on ``PATH``.
        """
        configured = os.environ.get("POPPLER_PATH")
        if configured:
            return configured
        default = FileProcessor._DEFAULT_POPPLER_PATH
        return default if os.path.isdir(default) else None

    @staticmethod
    def extract_ocr_df(img):
        """Run Arabic OCR on ``img`` and return the word-level DataFrame.

        Rows containing NaN and rows whose ``text`` is empty or whitespace-only
        are removed; the index is reset.
        """
        data = pytesseract.image_to_data(img, lang='ara', config='--psm 6', output_type=pytesseract.Output.DATAFRAME)
        data = data.dropna()
        # Compare on a stripped string view so whitespace-only tokens and a
        # numerically-typed ``text`` column are both handled.
        non_blank = data["text"].astype(str).str.strip() != ""
        return data[non_blank].reset_index(drop=True)

    @staticmethod
    def regrouper_lignes_par_y(df, seuil=15):
        """Group OCR words into lines using their vertical position.

        Words are visited in ascending ``top`` order; a word joins the current
        line when its ``top`` is within ``seuil`` of the previous word's
        ``top``, otherwise it starts a new line.

        Args:
            df: DataFrame with at least ``top`` and ``text`` columns.
            seuil: Maximum vertical gap between consecutive words of one line.

        Returns:
            A list of lines, each a list of word strings.
        """
        # A stable sort keeps the incoming order for words with the same
        # ``top`` value, which makes the output deterministic.
        # NOTE(review): no horizontal ordering is applied inside a line, yet
        # lignes_en_json maps words to columns by position - confirm that the
        # resulting word order is what the table extraction expects.
        ordered = df.sort_values(by="top", kind="mergesort")

        lignes = []
        ligne_courante = []
        y_ref = None
        for top, text in zip(ordered["top"], ordered["text"]):
            if y_ref is None or abs(top - y_ref) <= seuil:
                ligne_courante.append(str(text))
            else:
                lignes.append(ligne_courante)
                ligne_courante = [str(text)]
            y_ref = top

        if ligne_courante:
            lignes.append(ligne_courante)

        return lignes

    @staticmethod
    def lignes_en_json(lignes, colonnes):
        """Map each line's leading words onto ``colonnes``.

        Lines with fewer words than columns are skipped; surplus words beyond
        the last column are ignored.

        Returns:
            A list of ``{column: word}`` dictionaries.
        """
        # zip stops at the shorter input, i.e. at len(colonnes) here.
        return [
            dict(zip(colonnes, ligne))
            for ligne in lignes
            if len(ligne) >= len(colonnes)
        ]

    @staticmethod
    def _table_after(lignes, marqueur, colonnes, max_rows=5):
        """Return up to ``max_rows`` rows following the first line containing ``marqueur``."""
        for idx, ligne in enumerate(lignes):
            if marqueur in " ".join(ligne):
                return FileProcessor.lignes_en_json(lignes[idx + 1:idx + 1 + max_rows], colonnes)
        return []

    @staticmethod
    def analyse_pdf(pdf_path):
        """OCR the first two pages of a scanned PDF into text plus two tables.

        Page 1 supplies the general text and the table that follows the first
        line containing "سوية"; page 2 supplies the table that follows the
        first line containing "التحملات".

        Returns:
            A dict with the keys ``generalInformationText``, ``سوية المالكين``
            and ``التحملات``, or ``{"error": ...}`` when pdf2image is missing.
        """
        if not PDF2IMG_SUPPORT:
            return {"error": "pdf2image is not installed"}

        # Only pages 1 and 2 are inspected below, so later pages are neither
        # rasterised nor OCR-ed.
        pages = convert_from_path(
            pdf_path,
            dpi=300,
            last_page=2,
            poppler_path=FileProcessor._poppler_path(),
        )
        textes = []
        tableaux1 = []
        tableaux2 = []

        for i, img in enumerate(pages):
            df = FileProcessor.extract_ocr_df(img)
            lignes = FileProcessor.regrouper_lignes_par_y(df)

            if i == 0:
                textes.append(" ".join(" ".join(l) for l in lignes))
                tableaux1 = FileProcessor._table_after(lignes, "سوية", [
                    "عدد الرتبي للمالك", "موضوع الملكية", "مراجع الترسيم", "عدد السند المسلم"
                ])
            elif i == 1:
                tableaux2 = FileProcessor._table_after(lignes, "التحملات", [
                    "هوية المستفيد من التحملات", "نوع التحمل", "الحق الموظف عليه التحمل",
                    "القيمة بالدينار", "الفائض (%)", "مراجع الترسيم"
                ])

        return {
            "generalInformationText": " ".join(textes),
            "سوية المالكين": tableaux1,
            "التحملات": tableaux2
        }

    @staticmethod
    def read_text_file(path):
        """Return the text of ``path``, choosing an encoding automatically.

        UTF-16 is attempted only when the file starts with a UTF-16 byte-order
        mark: without one, almost any even-length byte sequence decodes as
        UTF-16 and legacy 8-bit files would come back as mojibake. UTF-8 (with
        or without BOM) is tried next, then cp1252, and finally latin1, which
        accepts every byte sequence and therefore always succeeds.
        """
        with open(path, 'rb') as f:
            head = f.read(2)

        candidates = ['utf-8-sig', 'cp1252']
        if head in (b'\xff\xfe', b'\xfe\xff'):
            candidates.insert(0, 'utf-16')

        for enc in candidates:
            try:
                with open(path, 'r', encoding=enc) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue

        # Last resort: latin1 maps every byte to a code point.
        with open(path, 'r', encoding='latin1') as f:
            return f.read()

    @staticmethod
    def read_pdf_file(path):
        """Return the text of a PDF, falling back to Arabic OCR for scans.

        The embedded text layer is read with PyPDF2. When it is blank and
        pdf2image is available, every page is rasterised and OCR-ed.

        Returns:
            The extracted text, or a message string when PDF support is
            missing or reading fails.
        """
        if not PDF_SUPPORT:
            return "PDF support unavailable"
        try:
            with open(path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                text = "\n".join(p.extract_text() or "" for p in reader.pages)

            if not text.strip() and PDF2IMG_SUPPORT:
                pages = convert_from_path(path, poppler_path=FileProcessor._poppler_path())
                text = "".join(
                    pytesseract.image_to_string(img, lang='ara') + "\n"
                    for img in pages
                )
            return text
        except Exception as e:
            # Failures are reported as text; callers expect a string back.
            return f"PDF read error: {str(e)}"

    @staticmethod
    def read_docx_file(path):
        """Return the non-blank paragraphs of a .docx file joined by newlines.

        Only ``doc.paragraphs`` is read. Returns a message string when
        python-docx is not installed.
        """
        if not DOCX_SUPPORT:
            return "DOCX support unavailable"
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

    @staticmethod
    def read_csv_file(path):
        """Return a CSV file as text: cells joined by `` | ``, rows by newlines."""
        # utf-8-sig drops a leading BOM so it does not end up in the first
        # cell; newline='' is what the csv module requires so that line breaks
        # inside quoted fields are parsed correctly.
        with open(path, 'r', encoding='utf-8-sig', newline='') as f:
            return "\n".join(" | ".join(row) for row in csv.reader(f))

    @staticmethod
    def read_json_file(path):
        """Return a JSON file re-serialised with 2-space indent, non-ASCII kept."""
        # json.load rejects text that starts with a BOM; utf-8-sig strips it.
        with open(path, 'r', encoding='utf-8-sig') as f:
            return json.dumps(json.load(f), indent=2, ensure_ascii=False)

    @staticmethod
    def read_image_file(path):
        """Return the Arabic OCR text of an image, or an ``OCR error`` message."""
        try:
            # The context manager releases the file handle once OCR is done.
            with Image.open(path) as img:
                return pytesseract.image_to_string(img, lang='ara', config='--psm 6')
        except Exception as e:
            return f"OCR error: {str(e)}"

def read_file_content(path):
    """Read ``path`` with the reader matching its extension.

    The extension is compared case-insensitively. Unrecognised extensions are
    read as plain text and labelled "Unknown".

    Returns:
        A ``(content, label)`` tuple where ``label`` is one of "Text", "PDF",
        "Word", "CSV", "JSON", "Image" or "Unknown".
    """
    suffix = os.path.splitext(path)[1].lower()

    # extension -> (reader, label shown in the UI)
    readers = {
        ".txt": (FileProcessor.read_text_file, "Text"),
        ".pdf": (FileProcessor.read_pdf_file, "PDF"),
        ".docx": (FileProcessor.read_docx_file, "Word"),
        ".doc": (FileProcessor.read_docx_file, "Word"),
        ".csv": (FileProcessor.read_csv_file, "CSV"),
        ".json": (FileProcessor.read_json_file, "JSON"),
        ".png": (FileProcessor.read_image_file, "Image"),
        ".jpg": (FileProcessor.read_image_file, "Image"),
        ".jpeg": (FileProcessor.read_image_file, "Image"),
    }
    fallback = (FileProcessor.read_text_file, "Unknown")

    read, label = readers.get(suffix, fallback)
    return read(path), label

_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
_embeddings_cache = None


def _get_embeddings():
    """Return the shared embedding model, creating it on first use."""
    global _embeddings_cache
    if _embeddings_cache is None:
        _embeddings_cache = HuggingFaceEmbeddings(model_name=_EMBEDDING_MODEL_NAME)
    return _embeddings_cache


def prepare_rag_context(raw_text):
    """Build a Chroma vector store over ``raw_text`` for retrieval.

    The text is split into 500-character chunks with a 50-character overlap,
    embedded, and stored in a collection dedicated to this call.

    Args:
        raw_text: Full text of the uploaded document.

    Returns:
        The populated Chroma vector store, or ``None`` when ``raw_text`` is
        empty or whitespace-only (there is nothing to index).
    """
    if not raw_text or not raw_text.strip():
        return None

    import uuid

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = splitter.create_documents([raw_text])
    if not docs:
        return None

    # A unique collection per document keeps chunks from earlier uploads out
    # of later searches. NOTE(review): with the default collection name the
    # in-memory Chroma client appears to reuse one collection across calls in
    # the same process - confirm against the installed chromadb version.
    return Chroma.from_documents(
        docs,
        embedding=_get_embeddings(),
        collection_name=f"upload-{uuid.uuid4().hex}",
    )

def call_groq_api(prompt, api_key, model, temperature, rag_db=None):
    """Send ``prompt`` to the Gemini or Groq chat endpoint and return the reply.

    Args:
        prompt: The user's question.
        api_key: Key for the provider selected by ``model``.
        model: Model id; any id containing "gemini" (case-insensitive) is sent
            to the Google endpoint, everything else to Groq.
        temperature: Sampling temperature forwarded to the provider.
        rag_db: Optional vector store. When given, the four most similar
            chunks are prepended to the prompt as context.

    Returns:
        The model's reply text, or a string starting with
        ``"Gemini API Error: "`` / ``"Groq API Error: "`` on failure.
    """
    if rag_db is not None:
        docs = rag_db.similarity_search(prompt, k=4)
        context = "\n\n".join(doc.page_content for doc in docs)
        prompt = f"""You are an AI assistant helping analyze documents.\n\nCONTEXT:\n{context}\n\nQUESTION:\n{prompt}"""

    is_gemini = "gemini" in model.lower()
    provider = "Gemini" if is_gemini else "Groq"

    if is_gemini:
        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
        # The key travels in a header rather than the query string so it
        # cannot show up in exception messages that echo the request URL.
        headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}
        # NOTE(review): topK=1 presumably restricts sampling to the single
        # most likely token, which would make `temperature` ineffective for
        # Gemini - confirm whether that is intended.
        data = {
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": {"temperature": temperature, "topK": 1, "topP": 1, "maxOutputTokens": 2048}
        }
    else:
        url = "https://api.groq.com/openai/v1/chat/completions"
        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
        data = {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            "model": model,
            "temperature": temperature,
            "max_tokens": 2048,
            "stream": False
        }

    try:
        response = requests.post(url, headers=headers, json=data, timeout=30)
    except requests.RequestException as e:
        return f"{provider} API Error: {str(e)}"

    try:
        payload = response.json()
    except ValueError:
        return f"{provider} API Error: HTTP {response.status_code} (response was not JSON)"

    if not response.ok:
        # Surface the provider's own message when the body carries one.
        error = payload.get("error") if isinstance(payload, dict) else None
        detail = error.get("message") if isinstance(error, dict) else payload
        return f"{provider} API Error: HTTP {response.status_code}: {detail}"

    try:
        if is_gemini:
            return payload["candidates"][0]["content"]["parts"][0]["text"]
        return payload["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as e:
        return f"{provider} API Error: unexpected response format ({e!r})"

def _set_document_state(content, name, kind, index):
    """Replace all four document globals in one step."""
    global uploaded_file_content, uploaded_file_name, file_type, retriever
    uploaded_file_content = content
    uploaded_file_name = name
    file_type = kind
    retriever = index


def handle_file_upload(file):
    """Load an uploaded file, index it for retrieval and update module state.

    Args:
        file: The value from the upload widget: a path string, an object with
            a ``.name`` path attribute, or a falsy value when cleared.

    Returns:
        A ``(status_message, preview, file_info)`` tuple of strings. The
        preview holds at most the first 1000 characters of the content.

    The document globals are only replaced once reading and indexing have
    both succeeded. On failure, or when no text could be extracted, they are
    reset so that a stale retriever is never paired with the new file.
    """
    no_document = "📄 No document loaded"
    if not file:
        _set_document_state("", "", "", None)
        return "No file uploaded", "", no_document

    # The widget may hand over a plain path or a file-like wrapper.
    path = file if isinstance(file, str) else file.name
    name = os.path.basename(path)

    try:
        content, detected = read_file_content(path)
        if not content or not content.strip():
            _set_document_state("", "", "", None)
            return f"⚠️ No text could be extracted from: {name}", "", no_document
        index = prepare_rag_context(content)
    except Exception as e:
        # UI callback boundary: report the failure instead of raising.
        _set_document_state("", "", "", None)
        return f"❌ Failed to load {name}: {e}", "", no_document

    _set_document_state(content, name, detected, index)
    preview = content[:1000] + ("\n..." if len(content) > 1000 else "")

    status_message = f"✅ Successfully loaded: {name}"
    file_info = f"📄 Document loaded: {name} ({detected})"

    return status_message, preview, file_info

def show_raw_ocr():
    """Return the loaded document's extracted text, or a placeholder if none."""
    if uploaded_file_content:
        return uploaded_file_content
    return "No OCR content available"

def groq_interface(prompt, api_key, model, temperature, use_rag):
    """Validate the form inputs and forward the question to the model.

    Args:
        prompt: Question typed by the user.
        api_key: Provider API key; surrounding whitespace is removed.
        model: Model id selected in the dropdown.
        temperature: Sampling temperature.
        use_rag: Whether to add context retrieved from the loaded document.

    Returns:
        The model's reply, or a message starting with "❌" when the prompt or
        key is missing, or when retrieval is requested without a document.
    """
    # ``or ""`` guards against None from a cleared field.
    if not (prompt or "").strip():
        return "❌ Please enter a prompt."

    # Forward the stripped key: a pasted key with a trailing space or newline
    # would otherwise pass validation and then break the request header.
    key = (api_key or "").strip()
    if not key:
        return "❌ Please enter your API key."

    if use_rag and retriever is None:
        return "❌ Please upload a document to use RAG retrieval."

    return call_groq_api(prompt, key, model, temperature, retriever if use_rag else None)

if __name__ == "__main__":
    # Extensions the upload widget accepts.
    accepted_types = [".txt", ".pdf", ".docx", ".csv", ".json", ".png", ".jpg", ".jpeg"]

    with gr.Blocks(css=custom_css, title="Groq RAG Chatbot") as app:
        gr.HTML("<h1>🚀 Groq & Gemini RAG Chatbot</h1>")

        with gr.Row():
            # Left column: document upload and model settings.
            with gr.Column(scale=1):
                file_upload = gr.File(label="Upload Document", file_types=accepted_types, type="filepath")
                file_status = gr.Textbox(label="Status", interactive=False)
                preview = gr.Textbox(label="Document Preview", lines=8, interactive=False)
                api_key = gr.Textbox(label="API Key", type="password")
                model = gr.Dropdown(GROQ_MODELS, value="llama-3.1-8b-instant", label="Model")
                temp = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.7, label="Temperature")
                use_rag = gr.Checkbox(label="Use RAG Retrieval", value=True)

            # Right column: question and answer.
            with gr.Column(scale=2):
                document_info = gr.Textbox(label="Document Status", value="📄 No document loaded", interactive=False)
                prompt = gr.Textbox(label="Your Question", lines=3)
                response = gr.Textbox(label="AI Response", lines=12, interactive=False)
                send = gr.Button("Send Message")

        # Created after the row, so it is laid out below both columns.
        show_ocr_btn = gr.Button("👁️ Show Raw OCR Text")

        # The send button and pressing Enter in the question box share inputs.
        chat_inputs = [prompt, api_key, model, temp, use_rag]

        file_upload.change(
            fn=handle_file_upload,
            inputs=[file_upload],
            outputs=[file_status, preview, document_info],
        )
        for register in (send.click, prompt.submit):
            register(fn=groq_interface, inputs=chat_inputs, outputs=[response])
        show_ocr_btn.click(fn=show_raw_ocr, inputs=[], outputs=[response])

    # NOTE(review): "0.0.0.0" presumably binds every network interface, which
    # would expose the API-key form to the local network - confirm intended.
    app.launch(server_name="0.0.0.0", server_port=None, inbrowser=True, share=False)


# --- Notebook output captured when the cell above was run ---
# ERROR:    [Errno 10048] error while attempting to bind on address ('0.0.0.0', 7860): [winerror 10048] only one usage of each socket address (protocol/network address/port) is normally permitted
#
#
# * Running on local URL:  http://0.0.0.0:7861
# * To create a public link, set `share=True` in `launch()`.
#
#
#   embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
