In [None]:
pip install -U bitsandbytes

In [None]:
# Run in a Kaggle cell. If Kaggle already has some packages, pip will skip/reinstall harmlessly.

# NOTE: Tesseract + language packs may need system installs. Kaggle usually has tesseract installed.
# If you have sudo on your environment (not usually on Kaggle), run:
# sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-hin tesseract-ocr-mar

!pip install --upgrade pip
!pip install pytesseract pillow python-docx python-pptx pdf2image pymupdf googletrans==4.0.0-rc1 sentence-transformers faiss-cpu transformers

In [None]:
import os
import io
import zipfile
import tempfile
import pickle
from pathlib import Path
from typing import List, Dict, Tuple

# PDF and images
import fitz  # PyMuPDF
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# Office docs
from docx import Document
from pptx import Presentation
!pip install openpyxl xlrd

# Translation
from googletrans import Translator

# Embeddings + FAISS
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Config - change these paths to your environment
# Every cell below reads these module-level names, so edit them here only.
ROOT = Path("/kaggle/input/aaple-sarkar-db")   # <-- keep your same data path here
INDEX_DIR = Path("faiss_index")                   # where index + metadata saved
# Chunk width and overlap, measured in characters of the cleaned text.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
TESSERACT_LANGS = "eng+hin+mar"  # English, Hindi, Marathi

# Ensure index dir exists
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# Shared googletrans client used to translate extracted text to English.
translator = Translator()
# Choose a sentence-transformer model (fast & good)
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Loaded once here and reused for both indexing and querying.
emb_model = SentenceTransformer(EMB_MODEL_NAME)

In [None]:
def clean_text(s: str) -> str:
    """Collapse every run of whitespace in ``s`` into a single space.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    # split() with no arguments drops leading/trailing whitespace and splits on
    # any whitespace run, so re-joining the tokens normalises the text.
    tokens = s.split() if s else []
    return " ".join(tokens).strip()

def chunk_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP) -> List[str]:
    """Split ``text`` into overlapping fixed-width character windows.

    The text is whitespace-normalised with ``clean_text`` first. Each chunk
    holds at most ``chunk_size`` characters, and consecutive chunks share
    ``overlap`` characters.

    Args:
        text: Raw text to split.
        chunk_size: Maximum characters per chunk; must be positive.
        overlap: Characters shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order, or ``[]`` when the cleaned text is empty.

    Raises:
        ValueError: If the parameters would keep the window from advancing.
    """
    # With overlap >= chunk_size (or a non-positive chunk_size) the start offset
    # never moves forward, so the loop would run forever while growing the list.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    text = clean_text(text)
    if not text:
        return []

    total = len(text)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, total, step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a window reaches the end; a further window would only
        # repeat the tail that is already covered.
        if start + chunk_size >= total:
            break
    return chunks

In [None]:
def ocr_pil_image(img: Image.Image, langs: str = TESSERACT_LANGS) -> str:
    """Run Tesseract OCR on a PIL image and return whitespace-normalised text.

    Args:
        img: Image to recognise; converted to RGB first when in another mode.
        langs: Language string handed to Tesseract as ``lang``.

    Returns:
        The recognised text after ``clean_text``.
    """
    if img.mode != "RGB":
        img = img.convert("RGB")
    try:
        text = pytesseract.image_to_string(img, lang=langs)
    except Exception as e:
        # Still retry with Tesseract's default language, but report it: a silent
        # fallback hides the fact that the requested languages were not used.
        print(f"[WARN] OCR with lang={langs!r} failed ({e}); retrying with default language")
        text = pytesseract.image_to_string(img)
    return clean_text(text)

def ocr_image_file(path: Path) -> str:
    """OCR the image file at ``path`` and return the cleaned text."""
    # Image.open keeps the underlying file open; the context manager closes it
    # once OCR is done, so indexing many images does not accumulate open handles.
    with Image.open(str(path)) as img:
        return ocr_pil_image(img)

In [None]:
def extract_text_from_pdf(path: Path) -> str:
    """Extract text from a PDF, OCR-ing pages that have no text layer.

    For each page the embedded text is used when present; otherwise the page
    is rendered at 200 dpi and passed to ``ocr_pil_image``.

    Args:
        path: Location of the PDF file.

    Returns:
        All page texts joined and normalised by ``clean_text``.
    """
    page_texts = []
    # The document is closed on exit, including when a page raises midway.
    with fitz.open(str(path)) as doc:
        for page in doc:
            text = page.get_text("text")
            if text and text.strip():
                page_texts.append(text)
                continue
            # No extractable text (e.g. a scanned page): rasterise it and OCR
            # the PNG bytes in memory instead of going through a temp file.
            pix = page.get_pixmap(dpi=200)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as page_img:
                page_texts.append(ocr_pil_image(page_img))
    return clean_text("\n".join(page_texts))

In [None]:
import pandas as pd
# Spreadsheet extensions routed to extract_text_from_excel.
EXCEL_EXTS = {".xlsx", ".xls"}

def extract_text_from_excel(path: Path) -> str:
    """Flatten every sheet of an Excel workbook into plain text.

    Each non-empty sheet contributes a ``[Sheet: <name>]`` marker followed by
    one line per row, with the row's cells joined by ``" | "``. The first row
    of each sheet is kept as ordinary data, so column titles are part of the
    returned text.

    Args:
        path: Location of the workbook.

    Returns:
        The combined text after ``clean_text``, or ``""`` if the workbook
        could not be read (a warning is printed in that case).
    """
    try:
        all_texts = []
        # ExcelFile keeps the workbook open; the context manager closes it.
        with pd.ExcelFile(path) as xls:
            for sheet in xls.sheet_names:
                # header=None stops pandas from consuming the first row as
                # column labels, which would drop it from the extracted text.
                df = pd.read_excel(xls, sheet_name=sheet, dtype=str, header=None)
                rows = df.fillna("").astype(str).values.tolist()
                text = "\n".join(" | ".join(row) for row in rows)
                if text.strip():
                    all_texts.append(f"[Sheet: {sheet}]\n{text}")
        return clean_text("\n".join(all_texts))
    except Exception as e:
        print(f"[WARN] Excel read failed {path}: {e}")
        return ""

In [None]:
def extract_text_from_docx(path: Path) -> str:
    """Extract text from a .docx file, falling back to OCR of embedded images.

    Paragraph text and table cell text are collected first. If that yields
    nothing, or the document cannot be parsed, the file is opened as a zip
    archive and every entry under ``word/media/`` is OCR'd.

    Args:
        path: Location of the document.

    Returns:
        The extracted text after ``clean_text``; ``""`` when nothing could be
        extracted. This function does not raise.
    """
    # Try normal text extraction first
    try:
        doc = Document(str(path))
        parts = [p.text for p in doc.paragraphs if p.text and p.text.strip()]
        # Table cells are not included in doc.paragraphs, so read them row by
        # row; cells are joined with " | " like the Excel extractor does.
        for table in doc.tables:
            for row in table.rows:
                cells = [c.text.strip() for c in row.cells if c.text and c.text.strip()]
                if cells:
                    parts.append(" | ".join(cells))
        if parts:
            return clean_text("\n".join(parts))
    except Exception as e:
        print(f"[WARN] DOCX text extraction failed {path}: {e}; trying OCR of embedded images")

    # fallback: extract images from docx zip and OCR them
    texts = []
    try:
        with zipfile.ZipFile(str(path), 'r') as z:
            for mf in z.namelist():
                if not mf.startswith("word/media/"):
                    continue
                try:
                    with Image.open(io.BytesIO(z.read(mf))) as img:
                        texts.append(ocr_pil_image(img))
                except Exception:
                    # Best effort: skip media entries that cannot be decoded or OCR'd.
                    continue
    except Exception as e:
        print(f"[WARN] DOCX media OCR failed {path}: {e}")
    return clean_text("\n".join(texts))

def extract_text_from_pptx(path: Path) -> str:
    """Extract text from a .pptx file, using OCR only when no text is found.

    Text from shapes that expose a non-blank ``text`` attribute is returned
    when any exists. Otherwise picture shapes are OCR'd. If the presentation
    cannot be processed at all, the file is opened as a zip archive and every
    entry under ``ppt/media/`` is OCR'd.

    Args:
        path: Location of the presentation.

    Returns:
        The extracted text after ``clean_text``; ``""`` when nothing could be
        extracted. This function does not raise.
    """
    picture_shape_type = 13  # shape_type value of picture shapes (MSO_SHAPE_TYPE.PICTURE)
    try:
        prs = Presentation(str(path))
        slides = list(prs.slides)

        slide_texts = [
            shape.text
            for slide in slides
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text and shape.text.strip()
        ]
        if slide_texts:
            return clean_text("\n".join(slide_texts))

        # OCR is only needed when the deck has no text: its result is not used
        # otherwise, so running it up front on every picture was wasted work.
        slide_img_texts = []
        for slide in slides:
            for shape in slide.shapes:
                try:
                    if shape.shape_type == picture_shape_type:
                        with Image.open(io.BytesIO(shape.image.blob)) as pil:
                            slide_img_texts.append(ocr_pil_image(pil))
                except Exception:
                    # Best effort: skip pictures that cannot be decoded or OCR'd.
                    continue
        return clean_text("\n".join(slide_img_texts))
    except Exception as e:
        print(f"[WARN] PPTX parsing failed {path}: {e}; trying OCR of embedded media")
        # final fallback: attempt to unzip media
        texts = []
        try:
            with zipfile.ZipFile(str(path), 'r') as z:
                for mf in z.namelist():
                    if not mf.startswith("ppt/media/"):
                        continue
                    try:
                        with Image.open(io.BytesIO(z.read(mf))) as img:
                            texts.append(ocr_pil_image(img))
                    except Exception:
                        continue
        except Exception as zip_err:
            print(f"[WARN] PPTX media OCR failed {path}: {zip_err}")
        return clean_text("\n".join(texts))

In [None]:
def translate_to_english(text: str, max_chars: int = 4500) -> str:
    """Translate ``text`` to English with the shared ``translator``.

    The cleaned text is cut at word boundaries into pieces of at most
    ``max_chars`` characters and each piece is translated separately. A piece
    whose translation fails is kept untranslated, so one failed request no
    longer leaves the whole document untranslated.

    Args:
        text: Text in any language.
        max_chars: Upper bound on the size of a single translation request.
            A single word longer than this is still sent as one piece.
            NOTE(review): chosen conservatively; confirm against the request
            size limit of the googletrans version in use.

    Returns:
        The translated text after ``clean_text``, or ``""`` for empty input.
    """
    text = clean_text(text)
    if not text:
        return ""

    # clean_text leaves single spaces only, so splitting on " " yields words.
    pieces = []
    current = ""
    for word in text.split(" "):
        candidate = f"{current} {word}" if current else word
        if current and len(candidate) > max_chars:
            pieces.append(current)
            current = word
        else:
            current = candidate
    if current:
        pieces.append(current)

    translated = []
    for piece in pieces:
        try:
            res = translator.translate(piece, dest="en")
            translated.append(res.text or piece)
        except Exception:
            # Best effort: keep the original piece rather than lose content.
            translated.append(piece)
    return clean_text(" ".join(translated))

# Extensions handled by OCR and by the document extractors respectively.
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"}
DOC_EXTS = {".pdf", ".docx", ".pptx", ".txt"} | EXCEL_EXTS

def load_file_to_text(path: Path) -> str:
    """Extract the text of one file and translate it to English.

    The extractor is chosen from the lower-cased file extension. For an
    unrecognised extension the docx, pptx and image extractors are tried in
    that order and the first non-empty result is used.

    Args:
        path: File to read.

    Returns:
        The translated text, or ``""`` when extraction failed or produced
        nothing (a warning is printed on failure).
    """
    ext = path.suffix.lower()
    extracted = ""
    try:
        if ext == ".pdf":
            extracted = extract_text_from_pdf(path)
        elif ext in IMAGE_EXTS:
            extracted = ocr_image_file(path)
        elif ext == ".docx":
            extracted = extract_text_from_docx(path)
        elif ext == ".pptx":
            extracted = extract_text_from_pptx(path)
        elif ext in EXCEL_EXTS:
            extracted = extract_text_from_excel(path)
        elif ext == ".txt":
            extracted = path.read_text(encoding="utf-8", errors="ignore")
        else:
            # The docx/pptx extractors swallow their own errors and return "",
            # so the next candidate must be chosen on an empty result; waiting
            # for an exception meant the later fallbacks could never run.
            for extractor in (extract_text_from_docx, extract_text_from_pptx, ocr_image_file):
                try:
                    extracted = extractor(path)
                except Exception:
                    extracted = ""
                if extracted:
                    break
    except Exception as e:
        print(f"[WARN] Failed to extract {path}: {e}")
        extracted = ""

    extracted = clean_text(extracted)
    if not extracted:
        return ""
    return translate_to_english(extracted)

In [None]:
# On-disk locations of the FAISS index and of the pickled texts + metadata
INDEX_FILE = INDEX_DIR / "index.faiss"
META_FILE = INDEX_DIR / "meta.pkl"

def build_index(root: Path):
    """Extract, chunk, embed and index every supported file under ``root``.

    Files are found recursively; only those whose extension is in ``DOC_EXTS``
    or ``IMAGE_EXTS`` are processed. Each file is turned into text by
    ``load_file_to_text`` and split by ``chunk_text``. Chunks are embedded
    with ``emb_model``, L2-normalised and added to a ``faiss.IndexFlatIP``.
    The index is written to ``INDEX_FILE``; texts and metadata are pickled to
    ``META_FILE``.

    Returns:
        ``(faiss_index, texts, metadata)`` where ``metadata[i]`` is
        ``{"source": <file path>, "chunk_id": <chunk position in that file>}``
        for ``texts[i]``.

    Raises:
        RuntimeError: If no file produced any text.
    """
    print("Scanning files and extracting text (this may take time)...")
    texts, metadata = [], []
    for p in root.rglob("*"):
        if not p.is_file():
            continue
        suffix = p.suffix.lower()
        if not (suffix in DOC_EXTS or suffix in IMAGE_EXTS):
            continue
        text = load_file_to_text(p)
        if not text:
            continue
        file_chunks = chunk_text(text)
        texts.extend(file_chunks)
        metadata.extend(
            {"source": str(p), "chunk_id": chunk_id}
            for chunk_id in range(len(file_chunks))
        )

    if not texts:
        raise RuntimeError("No text found. Check ROOT path or extraction process.")

    print(f"Encoding {len(texts)} chunks using {EMB_MODEL_NAME} ...")
    embeddings = emb_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

    # Scale every vector to unit length so the inner-product index ranks by
    # cosine similarity; zero norms are replaced to avoid dividing by zero.
    lengths = np.linalg.norm(embeddings, axis=1, keepdims=True)
    lengths[lengths == 0] = 1e-10
    embeddings = embeddings / lengths

    faiss_index = faiss.IndexFlatIP(embeddings.shape[1])
    faiss_index.add(embeddings.astype('float32'))

    faiss.write_index(faiss_index, str(INDEX_FILE))
    with open(META_FILE, "wb") as f:
        pickle.dump({"texts": texts, "metadata": metadata}, f)
    print("Index built and saved:", INDEX_FILE, META_FILE)
    return faiss_index, texts, metadata

def load_index():
    """Load the saved FAISS index together with its texts and metadata.

    Returns:
        ``(faiss_index, texts, metadata)`` read from ``INDEX_FILE`` and
        ``META_FILE``, or ``(None, None, None)`` when either file is missing.
    """
    have_both = INDEX_FILE.exists() and META_FILE.exists()
    if not have_both:
        return None, None, None
    index = faiss.read_index(str(INDEX_FILE))
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # load a META_FILE that was written by build_index in this environment.
    with open(META_FILE, "rb") as f:
        saved = pickle.load(f)
    return index, saved["texts"], saved["metadata"]

# Reuse a previously saved index when present; otherwise build it from ROOT.
faiss_index, index_texts, index_metadata = load_index()
if faiss_index is None:
    faiss_index, index_texts, index_metadata = build_index(ROOT)

In [None]:
def retrieve(query: str, k: int = 4) -> List[Dict]:
    """Return the indexed chunks most similar to ``query``.

    The query is embedded with ``emb_model``, scaled to unit length and
    searched against the module-level ``faiss_index``.

    Args:
        query: Search text.
        k: Maximum number of results; values <= 0 yield no results.

    Returns:
        Up to ``k`` dicts with keys ``"score"`` (float), ``"text"`` (chunk
        text) and ``"metadata"`` (the chunk's metadata entry), in the order
        FAISS returned them.
    """
    # Nothing to search for or nothing to search in: skip the FAISS call,
    # which does not accept a non-positive k.
    if k <= 0 or not index_texts:
        return []
    q_emb = emb_model.encode([query], convert_to_numpy=True)
    q_emb = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-10)
    q_emb = q_emb.astype('float32')
    # Asking for more neighbours than stored chunks only yields padding slots.
    k = min(k, len(index_texts))
    D, I = faiss_index.search(q_emb, k)
    results = []
    for score, idx in zip(D[0], I[0]):
        # Guard against padding (-1) and against an index/metadata mismatch.
        if idx < 0 or idx >= len(index_texts):
            continue
        results.append({
            "score": float(score),
            "text": index_texts[idx],
            "metadata": index_metadata[idx]
        })
    return results

# Example:
q = "What is country?"
res = retrieve(q, k=3)
for r in res:
    print("SCORE:", r["score"], "SOURCE:", r["metadata"]["source"])
    print(r["text"][:400], "...\n")

In [None]:
pip install bitsandbytes accelerate

In [None]:
pip install -U bitsandbytes

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

# 1. Load Zephyr in 4-bit quantized mode (saves a LOT of memory)
def init_zephyr_lowmem(model_name="HuggingFaceH4/zephyr-7b-alpha"):
    """Create a text-generation pipeline around a 4-bit quantised model.

    Args:
        model_name: Hugging Face model id passed to the tokenizer and model
            loaders.

    Returns:
        The ``transformers`` ``"text-generation"`` pipeline built from the
        loaded tokenizer and model.
    """
    compute_dtype = torch.bfloat16

    # 4-bit NF4 weights with double quantisation; compute runs in bfloat16.
    bnb_settings = dict(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
    )
    quant_config = BitsAndBytesConfig(**bnb_settings)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
    )
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=compute_dtype,
        device_map="auto",
    )

# Loaded once at cell execution; call_zephyr reads this module-level pipeline.
zephyr_pipe = init_zephyr_lowmem()

# 2. Build prompt
def assemble_context_and_prompt(query: str, retrieved: list) -> str:
    """Build the LLM prompt from the question and the retrieved chunks.

    Args:
        query: The user's question.
        retrieved: Dicts with a ``"text"`` string and a ``"metadata"`` dict;
            a missing ``"source"`` in the metadata is shown as ``unknown``.

    Returns:
        Instructions, a numbered context section (each chunk cut to its first
        500 characters), the question and a trailing ``Answer:`` cue.
    """
    def _format_hit(rank, hit):
        # One context entry: numbered source line followed by the capped text.
        src = hit["metadata"].get("source", "unknown")
        snippet = hit["text"][:500]  # cap per chunk to keep the prompt small
        return f"Source {rank} ({src}):\n{snippet}"

    context = "\n\n".join(
        _format_hit(rank, hit) for rank, hit in enumerate(retrieved, start=1)
    )
    instructions = (
        "You are Zephyr, a helpful assistant.\n"
        "Use the following context to answer the question as accurately as possible.\n"
        'If the answer is not present, say "I don\'t know from the provided documents."\n'
    )
    return f"{instructions}\nContext:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"

# 3. Call Zephyr
def call_zephyr(prompt_text: str, max_new_tokens=256) -> str:
    """Generate a completion for ``prompt_text`` with the shared pipeline.

    Args:
        prompt_text: Full prompt to send to ``zephyr_pipe``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    outputs = zephyr_pipe(
        prompt_text,
        max_new_tokens=max_new_tokens,
        # Greedy decoding. No temperature is passed: it only applies when
        # sampling, and a value of 0 is not a valid sampling temperature.
        do_sample=False,
        # Let the pipeline drop the prompt instead of slicing the output by
        # the prompt's character length.
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()

In [None]:
def answer_query(query: str, k: int = 4) -> str:
    """Answer ``query`` from the indexed documents.

    Looks up the ``k`` best-matching chunks with ``retrieve``, turns them into
    a prompt with ``assemble_context_and_prompt`` and returns the text
    produced by ``call_zephyr``.
    """
    return call_zephyr(assemble_context_and_prompt(query, retrieve(query, k=k)))

In [None]:
# Smoke test: send one question through retrieval, prompt assembly and generation.
question = "What is country?"
answer = answer_query(question, k=3)
print("Q:", question)
print("A:", answer)

In [None]:
!pip install flask flask-cors pyngrok

In [None]:
from pyngrok import ngrok

# Read the ngrok auth token from the NGROK_AUTH_TOKEN environment variable
# instead of pasting it here: a token typed into a cell is saved with the
# notebook and leaks as soon as the notebook is shared.
import os

NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "")

# Only register a real token; storing an empty one has no use.
if NGROK_AUTH_TOKEN:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
else:
    print("[WARN] NGROK_AUTH_TOKEN is not set; set it in the environment before opening the tunnel.")

In [None]:
from flask import Flask, request, jsonify, render_template_string
from pyngrok import ngrok   # make sure you install with: pip install flask pyngrok


app = Flask(__name__)

# Professional Chat UI
chat_ui = """
<!DOCTYPE html>
<html>
<head>
    <title>Zephyr Chatbot</title>
    <style>
        body {
            margin: 0;
            height: 100vh;
            display: flex;
            justify-content: center;
            align-items: center;
            background: #f3f4f6;
            font-family: 'Segoe UI', Tahoma, sans-serif;
        }
        .chat-wrapper {
            width: 420px;
            height: 600px;
            background: white;
            border-radius: 16px;
            box-shadow: 0 8px 25px rgba(0,0,0,0.15);
            display: flex;
            flex-direction: column;
            overflow: hidden;
        }
        .chat-header {
            background: linear-gradient(135deg, #2563eb, #1e40af);
            color: white;
            padding: 14px 20px;
            font-size: 18px;
            font-weight: 600;
            display: flex;
            align-items: center;
            gap: 10px;
        }
        .chat-header img {
            width: 28px; height: 28px;
        }
        #chat-box {
            flex: 1;
            padding: 20px;
            overflow-y: auto;
            background: #f9fafb;
        }
        .message {
            max-width: 75%;
            padding: 12px 16px;
            margin: 8px 0;
            border-radius: 18px;
            line-height: 1.4;
            word-wrap: break-word;
            animation: fadeIn 0.2s ease-in-out;
        }
        .user {
            margin-left: auto;
            background: #2563eb;
            color: white;
            border-bottom-right-radius: 4px;
        }
        .bot {
            margin-right: auto;
            background: #e5e7eb;
            color: #111827;
            border-bottom-left-radius: 4px;
        }
        .chat-input {
            display: flex;
            padding: 12px;
            border-top: 1px solid #ddd;
            background: white;
        }
        .chat-input input {
            flex: 1;
            border: none;
            padding: 12px;
            font-size: 15px;
            border-radius: 25px;
            outline: none;
            background: #f3f4f6;
        }
        .chat-input button {
            margin-left: 10px;
            padding: 0 18px;
            border: none;
            border-radius: 25px;
            background: #2563eb;
            color: white;
            font-weight: 500;
            cursor: pointer;
            transition: background 0.2s;
        }
        .chat-input button:hover {
            background: #1e40af;
        }
        @keyframes fadeIn {
            from {opacity: 0; transform: translateY(5px);}
            to {opacity: 1; transform: translateY(0);}
        }
        .typing {
            margin: 8px 0;
            padding: 12px 16px;
            border-radius: 18px;
            background: #e5e7eb;
            color: #6b7280;
            font-style: italic;
            font-size: 14px;
            width: fit-content;
        }
    </style>
</head>
<body>
    <div class="chat-wrapper">
        <div class="chat-header">
            <img src="https://img.icons8.com/color/48/robot-2.png"/>
            Zephyr Chatbot
        </div>
        <div id="chat-box"></div>
        <div class="chat-input">
            <input id="user-input" type="text" placeholder="Type a message..." onkeydown="if(event.key==='Enter') sendMessage()" />
            <button onclick="sendMessage()">Send</button>
        </div>
    </div>

    <script>
        async function sendMessage() {
            const input = document.getElementById("user-input");
            const chatBox = document.getElementById("chat-box");
            const userMsg = input.value.trim();
            if (!userMsg) return;

            // Build every bubble as a DOM node and set textContent, so neither
            // the user's text nor the model's answer is ever parsed as HTML.
            // Appending nodes (rather than rewriting innerHTML) also keeps
            // earlier nodes, such as a pending typing indicator, attached.
            const appendBubble = (className, text) => {
                const bubble = document.createElement("div");
                bubble.className = className;
                bubble.textContent = text;
                chatBox.appendChild(bubble);
                chatBox.scrollTop = chatBox.scrollHeight;
                return bubble;
            };

            appendBubble("message user", userMsg);
            input.value = "";

            // Add typing indicator
            const typingDiv = appendBubble("typing", "Bot is typing...");

            let answer;
            try {
                const response = await fetch("/ask", {
                    method: "POST",
                    headers: { "Content-Type": "application/json" },
                    body: JSON.stringify({query: userMsg})
                });
                const data = await response.json();
                answer = typeof data.answer === "string"
                    ? data.answer
                    : "Error: unexpected response (HTTP " + response.status + ")";
            } catch (err) {
                // Network failure or a non-JSON reply: report it in the chat.
                answer = "Error: " + err.message;
            } finally {
                // Always remove the indicator, even when the request failed.
                typingDiv.remove();
            }

            // Add bot response
            appendBubble("message bot", answer);
        }
    </script>
</body>
</html>

"""

@app.route("/")
def index():
    """Serve the chat page by rendering the ``chat_ui`` template string."""
    rendered_page = render_template_string(chat_ui)
    return rendered_page

@app.route("/ask", methods=["POST"])
def ask():
    """Answer a chat question posted as JSON ``{"query": "..."}``.

    Returns:
        JSON ``{"answer": "..."}``. A missing, malformed or blank query gets
        HTTP 400 with a short hint. If answering raises, the error text is
        returned in ``answer`` so the chat UI can display it.
    """
    # silent=True gives None for a missing/invalid JSON body instead of
    # raising, and the isinstance check covers JSON that is not an object.
    payload = request.get_json(silent=True)
    user_query = payload.get("query", "") if isinstance(payload, dict) else ""
    if not isinstance(user_query, str) or not user_query.strip():
        return jsonify({"answer": "Please enter a question."}), 400
    try:
        answer = answer_query(user_query.strip())
    except Exception as e:
        answer = f"Error: {str(e)}"
    return jsonify({"answer": answer})

# Start ngrok tunnel + Flask
# The tunnel is opened first because app.run() below blocks this cell until
# the server is stopped.
public_url = ngrok.connect(5000)
print("🚀 Chatbot UI is live at:", public_url)

# Serve on the same port the tunnel above forwards to.
app.run(port=5000)