In [None]:
# ✅ Install dependencies
!pip install gradio pymupdf langchain faiss-cpu transformers sentencepiece \
    openai groq langchain-community python-dotenv langchain-groq

# ✅ Imports
import gradio as gr
import fitz  # PyMuPDF
import os
import re
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_groq import ChatGroq

# ✅ Load .env file
# Pull variables from a local .env file (if any) into the process environment,
# then make sure the Groq credential is actually available.
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # Assigning None into os.environ raises an opaque TypeError, so fail fast
    # with a message that tells the user what to fix.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this app."
    )
os.environ["GROQ_API_KEY"] = groq_api_key

# ✅ Extract PDF text
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    The document is opened with PyMuPDF (``fitz``) inside a ``with`` block and
    the per-page text is joined in page order with no separator.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [pg.get_text() for pg in pdf]
    return "".join(page_texts)

# ✅ Search FAISS vector DB
def search_similar_chunks(vector_db, query, top_k=3):
    """Return the chunks most similar to *query*, with query words highlighted.

    Args:
        vector_db: Object exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` string.
        query: Free-text topic to search for.
        top_k: Number of chunks to request from the vector store.

    Returns:
        A single string in which every chunk is followed by a ``---``
        separator and each query word of three or more characters is wrapped
        in ``**``. A warning string is returned when nothing matches, and an
        error string when the lookup raises.
    """
    try:
        results = vector_db.similarity_search(query, k=top_k)
        if not results:
            return "⚠️ No relevant content found for your topic."

        # De-duplicate the keywords (a repeated word used to be wrapped once
        # per repetition, producing "****word****") and try longer keywords
        # first so the longest alternative wins at any given position.
        keywords = sorted(
            dict.fromkeys(kw for kw in query.lower().split() if len(kw) >= 3),
            key=len,
            reverse=True,
        )

        # One compiled pattern applied in a single pass: text that has already
        # been highlighted is never rescanned.
        highlighter = None
        if keywords:
            alternation = "|".join(re.escape(kw) for kw in keywords)
            highlighter = re.compile(rf"\b({alternation})\b", flags=re.IGNORECASE)

        styled_chunks = []
        for res in results:
            chunk = res.page_content.strip()
            if highlighter is not None:
                chunk = highlighter.sub(r"**\1**", chunk)
            styled_chunks.append(f"{chunk}\n\n---\n\n")

        return "".join(styled_chunks).strip()

    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"❌ [Error fetching similar chunks]: {str(e)}"

# ✅ Store embeddings in FAISS
def store_embeddings(text_chunks):
    """Embed *text_chunks* and return a FAISS vector store built from them.

    A ``HuggingFaceEmbeddings`` instance with default settings is created on
    each call and handed to ``FAISS.from_texts`` as the embedding function.
    """
    return FAISS.from_texts(text_chunks, embedding=HuggingFaceEmbeddings())

# ✅ Generate blog with GROQ
def generate_blog(text_chunks, max_chunks=3):
    """Ask the Groq-hosted LLM for a three-section blog summary of a paper.

    Args:
        text_chunks: List of text chunks from the paper, in document order.
        max_chunks: How many leading chunks are sent to the model. Defaults to
            3, the value that used to be hard-coded; ``None`` sends them all.

    Returns:
        The raw text returned by the LLM chain.

    Raises:
        ValueError: If *text_chunks* is empty, since there is nothing to
            summarise and the model would only invent content.
    """
    if not text_chunks:
        raise ValueError("text_chunks is empty; there is nothing to summarise.")

    llm = ChatGroq(
        model_name="llama3-8b-8192",
        temperature=0.3
    )

    # The excerpt is injected exactly once. The template used to repeat
    # {content} under "Problem:", "Methodology:" and "Key Takeaways:" as well,
    # which sent the same text four times and presented the raw excerpt as if
    # it were already the finished sections.
    prompt = PromptTemplate(
        input_variables=["content"],
        template="""
You are a professional AI writer. Write a short, blog-style summary of this research paper with 3 sections:
1. Problem
2. Methodology
3. Key Takeaways

Start each section with its title exactly as written above.
Be concise and engaging and make it look good with Bold Text and Highlights.
Use emojis and write like a proper blog.
Do not use markdown symbols like ## or **.

Research Content:
{content}
"""
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run(content="\n".join(text_chunks[:max_chunks]))

# ✅ Load NLLB model for translation
# Hugging Face model id of the NLLB checkpoint used for translation.
nllb_model_id = "facebook/nllb-200-distilled-600M"
# The tokenizer and seq2seq model are instantiated once at module level and
# handed to the translation pipeline that translate_to() builds on each call.
nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_model_id)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_id)

# Maps the short language codes accepted by translate_to() to the language
# codes passed to the NLLB translation pipeline as ``tgt_lang``.
LANG_CODE_MAP = {
    "en": "eng_Latn",  # English
    "hi": "hin_Deva",  # Hindi
    "mr": "mar_Deva",  # Marathi
    "ta": "tam_Taml",  # Tamil
    "bn": "ben_Beng",  # Bengali
    "gu": "guj_Gujr",  # Gujarati
    "kn": "kan_Knda",  # Kannada
    "ml": "mal_Mlym",  # Malayalam
    "te": "tel_Telu",  # Telugu
    "ur": "urd_Arab",  # Urdu
    "es": "spa_Latn",  # Spanish
    "fr": "fra_Latn",  # French
    "de": "deu_Latn",  # German
}

# ✅ Translate blog
def translate_to(text, target_lang="hi"):
    """Translate English *text* into *target_lang* with the NLLB model.

    Args:
        text: English text; may span several lines.
        target_lang: Short language code, one of the keys of ``LANG_CODE_MAP``.
            Surrounding whitespace and letter case are ignored.

    Returns:
        The translated text with the original line structure preserved. An
        ``[⚠️ Error]`` string is returned for an unsupported language and a
        ``[Translation Error]`` string when translation raises.
    """
    try:
        src_lang = "eng_Latn"
        # The code comes from a free-text box, so accept "HI", " hi " and so on.
        lang_key = (target_lang or "").strip().lower()
        tgt_lang = LANG_CODE_MAP.get(lang_key)

        if not tgt_lang:
            return f"[⚠️ Error] Language '{target_lang}' is not supported."

        # Translate line by line: feeding a whole multi-line document as one
        # sequence loses the line breaks and makes very long inputs likely.
        lines = text.split("\n")
        translatable = [ln for ln in lines if ln.strip()]
        if not translatable:
            return text

        translation_pipeline = pipeline(
            "translation",
            model=nllb_model,
            tokenizer=nllb_tokenizer,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            max_length=512
        )

        results = translation_pipeline(translatable)
        translated = iter(
            # Tolerate a per-input list wrapper around each result dict.
            (item[0] if isinstance(item, list) else item)["translation_text"]
            for item in results
        )
        # Blank lines are passed through untouched to keep paragraph spacing.
        return "\n".join(next(translated) if ln.strip() else ln for ln in lines)

    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"[Translation Error] {str(e)}"

# ✅ Main Gradio function
def _extract_section(raw_blog, heading, next_heading=None):
    """Return the text after the first *heading*, cut at *next_heading* if given.

    Returns ``None`` when *heading* does not occur in *raw_blog*.
    """
    _, found, remainder = raw_blog.partition(heading)
    if not found:
        return None
    if next_heading is not None:
        remainder = remainder.partition(next_heading)[0]
    # Drop the colon that commonly trails a heading ("Problem: ...").
    return remainder.strip().lstrip(":").strip()


def process_pdf(file, topic="", lang=""):
    """Gradio handler: summarise an uploaded PDF as a blog post.

    Args:
        file: The uploaded file, either a filesystem path string or an object
            whose ``name`` attribute holds the path.
        topic: Optional topic used to look up similar chunks in the paper.
        lang: Optional target language code; empty or ``"en"`` skips
            translation.

    Returns:
        A ``(blog, related, path)`` tuple: the blog text, the similar-chunk
        text (or a placeholder), and the path of the written ``blog.md``.
        On failure both texts hold the error message and the path is ``None``.
    """
    try:
        if file is None:
            raise ValueError("No PDF file was uploaded.")
        # Depending on the Gradio version/config the upload arrives either as
        # a path string or as a file wrapper exposing the path via ``.name``.
        pdf_path = file if isinstance(file, str) else file.name
        text = extract_text_from_pdf(pdf_path)

        chunks = [c.strip() for c in text.split(". ") if c.strip()]
        if not chunks:
            # e.g. a scanned PDF without a text layer; fail with a clear
            # message rather than an opaque vector-store error.
            raise ValueError("No extractable text was found in the PDF.")
        db = store_embeddings(chunks)
        raw_blog = generate_blog(chunks)

        parts = []
        problem = _extract_section(raw_blog, "Problem", "Methodology")
        if problem is not None:
            parts.append(" 🧩 Problem Statement\n\n" + f"{problem}\n\n")

        methodology = _extract_section(raw_blog, "Methodology", "Key Takeaways")
        if methodology is not None:
            parts.append(" 🧪 Methodology\n\n" + f"{methodology}\n\n")

        takeaways = _extract_section(raw_blog, "Key Takeaways")
        if takeaways is not None:
            parts.append(" 🌟 Key Takeaways\n\n")
            parts.extend(
                f"• {line.strip()}\n"
                for line in takeaways.split("\n")
                if line.strip()
            )

        # If the model ignored the requested headings, show its raw answer
        # instead of an empty blog.
        blog = "".join(parts) or raw_blog

        lang_code = (lang or "").strip().lower()
        if lang_code and lang_code != "en":
            blog = translate_to(blog, target_lang=lang_code)

        topic = (topic or "").strip()
        related = search_similar_chunks(db, topic) if topic else "No topic entered."

        # NOTE(review): a fixed file name in the working directory is shared
        # by all requests; concurrent users would overwrite each other.
        with open("blog.md", "w", encoding="utf-8") as f:
            f.write(blog)

        return blog, related, "blog.md"

    except Exception as e:
        # UI boundary: show the error in both text outputs, no download file.
        return f"❌ Error: {str(e)}", f"❌ Error: {str(e)}", None

# ✅ Launch UI
# Input widgets, in the order process_pdf() receives them: file, topic, lang.
_ui_inputs = [
    gr.File(label="📄 Upload Research PDF"),
    gr.Textbox(label="🔍 Enter Topic for Similar Search (optional)"),
    gr.Textbox(label="🌐 Translate to (e.g. hi, es, fr)", placeholder="en, hi, es etc."),
]

# Output widgets, matching the (blog, related, path) tuple process_pdf() returns.
_ui_outputs = [
    gr.Textbox(label="📝 Blog Summary", lines=20),
    gr.Textbox(label="💡 Similar Chunks"),
    gr.File(label="⬇️ Download Blog as .md"),
]

demo = gr.Interface(
    fn=process_pdf,
    inputs=_ui_inputs,
    outputs=_ui_outputs,
    title="📚 AI-Powered Research Blog Explainer (GROQ)",
    description="Upload a research PDF → Get a structured blog summary → Translate it → Download as .md",
    theme="default",
)
demo.launch()