In [None]:
# Install the app's core dependencies. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel, not whichever `pip` is on PATH.
%pip install streamlit pymupdf langchain faiss-cpu sentence-transformers transformers langdetect



In [None]:
# Full dependency set for app.py, including torch and sentencepiece (needed by
# the M2M100 / Llama tokenizers). `%pip` targets the running kernel's environment.
%pip install streamlit PyMuPDF  langdetect langchain sentence-transformers faiss-cpu transformers torch sentencepiece



In [None]:
# Explicit `%pip` magic: a bare `pip install ...` line only works while IPython
# automagic is enabled and no variable named `pip` exists in the namespace.
%pip install --upgrade langchain-community




In [None]:
# Install the Hugging Face Hub client into the running kernel's environment.
%pip install huggingface-hub

from huggingface_hub import login

# Interactive login: prompts for an access token (rendered as a widget in
# notebooks). The token is required to download the gated Llama 2 weights.
login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
%%writefile app.py
import os
import re
import fitz
import streamlit as st
import logging
import torch

from langdetect import detect
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from transformers import (
    pipeline,
    M2M100Tokenizer,
    M2M100ForConditionalGeneration,
    AutoTokenizer,
    AutoModelForCausalLM
)

# ─── Logging Setup ─────────────────────────────────────────────────────────────
# Root logger writes INFO-and-above records to chat_logs.txt, each line
# prefixed with a timestamp.
_LOG_OPTIONS = {
    "filename": "chat_logs.txt",
    "level": logging.INFO,
    "format": "%(asctime)s %(message)s",
}
logging.basicConfig(**_LOG_OPTIONS)

# ─── Translation & Chunking Setup ──────────────────────────────────────────────
TRANSLATOR_MODEL_NAME = "facebook/m2m100_418M"


# Streamlit re-executes this script on every user interaction; without caching
# the translation model would be re-instantiated each time.
# show_spinner=False keeps this call from emitting any UI element, because it
# runs before st.set_page_config() further down the script.
@st.cache_resource(show_spinner=False)
def _load_translator_model():
    """Load the M2M100 translation model once per server process."""
    return M2M100ForConditionalGeneration.from_pretrained(TRANSLATOR_MODEL_NAME)


# The tokenizer is deliberately NOT cached: translate() mutates its `src_lang`
# attribute, so each script run keeps its own instance instead of sharing one
# mutable object across sessions.
translator_tok   = M2M100Tokenizer.from_pretrained(TRANSLATOR_MODEL_NAME)
translator_model = _load_translator_model()

# Splits page text into 300-character chunks that overlap by 100 characters.
splitter         = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100)

def translate(text, src, tgt):
    """Translate ``text`` from language ``src`` into language ``tgt`` with M2M100.

    Args:
        text: The string to translate.
        src: Source language code. Region-qualified codes such as the
            ``"zh-cn"`` / ``"zh-tw"`` values returned by langdetect are reduced
            to their base language (``"zh"``), which is the form the M2M100
            tokenizer expects.
        tgt: Target language code, normalised the same way as ``src``.

    Returns:
        The decoded translation of ``text`` (first sequence of the batch).
    """
    # "zh-cn" -> "zh"; plain two-letter codes pass through unchanged.
    src_code = src.split("-")[0].lower()
    tgt_code = tgt.split("-")[0].lower()

    # The tokenizer prepends the source-language token based on this attribute.
    translator_tok.src_lang = src_code
    # Truncate rather than fail when the input exceeds the model's max length.
    toks = translator_tok(text, return_tensors="pt", truncation=True)
    gen = translator_model.generate(
        **toks,
        # Forces the decoder to start with the target-language token.
        forced_bos_token_id=translator_tok.get_lang_id(tgt_code)
    )
    return translator_tok.batch_decode(gen, skip_special_tokens=True)[0]

def load_pages(files):
    """Extract the text of every page from the given PDF files.

    Args:
        files: Iterable of binary file-like objects (anything with ``read()``)
            containing PDF data.

    Returns:
        A list of ``{"page": int, "text": str}`` dicts in reading order. Page
        numbers are 1-based and restart at 1 for each file. Lines consisting
        solely of ``Page <number>`` are removed from the text.
    """
    # Matches a whole line that is just a "Page N" header/footer marker.
    page_marker = re.compile(r'^\s*Page\s+\d+\s*$', flags=re.MULTILINE)

    pages = []
    for f in files:
        # Context manager closes the document (and frees its buffers) even if
        # text extraction raises part-way through.
        with fitz.open(stream=f.read(), filetype="pdf") as doc:
            for i, p in enumerate(doc, 1):
                txt = page_marker.sub('', p.get_text())
                pages.append({"page": i, "text": txt})
    return pages

def make_chunks(pages):
    """Split page records into chunked documents via the module-level splitter.

    Args:
        pages: Sequence of dicts, each with a ``"text"`` and a ``"page"`` entry.

    Returns:
        Whatever ``splitter.create_documents`` returns for the page texts, with
        ``{"page": <page number>}`` supplied as the metadata of each text.
    """
    page_texts = []
    for record in pages:
        page_texts.append(record["text"])

    page_metas = []
    for record in pages:
        page_metas.append({"page": record["page"]})

    return splitter.create_documents(page_texts, metadatas=page_metas)

# ─── Prompt for RetrievalQA ─────────────────────────────────────────────────────
# Template text: instructs the model to answer strictly from the retrieved
# context and to apologise when the answer is not there. `{context}` and
# `{question}` are the placeholders filled in at query time.
_QA_TEMPLATE = (
    "Answer the question using only the context below.\n"
    "If you don't know, just say \"Sorry, I don't know.\"\n\n"
    "CONTEXT:\n{context}\n\n"
    "QUESTION: {question}\n\n"
    "ANSWER:"
)

PROMPT = PromptTemplate(
    template=_QA_TEMPLATE,
    input_variables=["context", "question"],
)

# ─── Streamlit App UI ────────────────────────────────────────────────────────────
# Page configuration, heading and welcome banner.
st.set_page_config(layout="wide", page_title="PDF Assistant 🤖")
st.title("📄 PDF Assistant with MMR RetrievalQA & Llama 2 7B Chat")
st.info("👋 **Welcome!** Upload PDFs and ask questions in any language.")

# Feature overview, rendered as a Markdown bullet list.
_FEATURES_MD = """
- **Upload** PDF files
- **Extract & chunk** text
- **Embed** with all‑mpnet-base-v2 + FAISS
- **MMR** for diverse context
- **Llama 2 7B Chat** for generation
- **Apology** if no match
- **Logs** kept in `chat_logs.txt`
"""
st.markdown(_FEATURES_MD)

# Hugging Face token for gated repo access; None when the variable is not set.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

# ─── Retrieval & generation ──────────────────────────────────────────────────────
from langdetect.lang_detect_exception import LangDetectException

EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
LLM_MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
# Minimum cosine similarity between the question and its best-matching chunk
# for the question to be considered answerable from the uploaded PDFs.
MIN_COSINE_SIMILARITY = 0.2


# Streamlit re-runs this whole script on every interaction (each submitted
# question included). The cached loaders below keep the heavyweight models in
# memory across reruns instead of re-instantiating them each time.
@st.cache_resource(show_spinner=False)
def _load_embedder():
    """Return the sentence-embedding model, loaded once per server process."""
    return HuggingFaceEmbeddings(
        model_name=EMBED_MODEL_NAME,
        # Unit-length vectors make FAISS's squared L2 distance convertible to
        # cosine similarity (d = 2 - 2*cos), which the relevance check relies on.
        encode_kwargs={"normalize_embeddings": True},
    )


@st.cache_resource(show_spinner=False)
def _load_llm(token):
    """Return the Llama 2 chat model wrapped for LangChain, loaded once.

    Args:
        token: Hugging Face access token for the gated repository, or None to
            fall back to the library's default credential lookup.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        LLM_MODEL_NAME,
        use_fast=False,
        token=token,  # `use_auth_token` is deprecated in favour of `token`
    )
    model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        token=token,
    )
    llm_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7
    )
    return HuggingFacePipeline(pipeline=llm_pipe)


files = st.file_uploader("Upload PDFs", type="pdf", accept_multiple_files=True)
if files:
    # Rebuild the vector index only when the set of uploaded files changes,
    # not on every rerun triggered by typing a question.
    upload_key = tuple((f.name, f.size) for f in files)
    if st.session_state.get("pdf_index_key") != upload_key:
        with st.spinner("Processing PDFs…"):
            chunks = make_chunks(load_pages(files))
            index = FAISS.from_documents(chunks, _load_embedder()) if chunks else None
        if index is None:
            # Image-only / scanned PDFs yield no text and cannot be indexed.
            st.error("No extractable text was found in the uploaded PDFs.")
            st.stop()
        st.session_state["pdf_index"] = index
        st.session_state["pdf_index_key"] = upload_key

    db        = st.session_state["pdf_index"]
    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k":3, "fetch_k":10})

    # Load Llama‑2‑7b‑chat‑hf (served from the cache after the first run)
    with st.spinner("Processing PDFs…"):
        llm = _load_llm(hf_token)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        # Return the chunks actually fed to the model so the page list shown
        # to the user matches the context the answer was generated from.
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
    st.success("✅ Index built—ready for questions!")

    q = st.text_input("Your question:")
    if q:
        try:
            lang = detect(q)
        except LangDetectException:
            # Raised for input without language features (digits, symbols…);
            # treat such input as English rather than crashing the app.
            lang = "en"
        q_en = translate(q, lang, "en") if lang != "en" else q

        # 1) Quick relevance check. FAISS returns squared L2 *distances*
        #    (smaller = closer), so convert the best one to cosine similarity
        #    before comparing it against the minimum-similarity threshold.
        docs_and_scores = db.similarity_search_with_score(q_en, k=5)
        if docs_and_scores:
            best_similarity = 1.0 - float(docs_and_scores[0][1]) / 2.0
        else:
            best_similarity = None
        if best_similarity is None or best_similarity < MIN_COSINE_SIMILARITY:
            st.warning("Sorry, I couldn't find anything related to your query.")
            logging.info(f"Q: {q} | A:<no match>")
            st.stop()

        # 2) Generate answer
        result = qa_chain.invoke({"query": q_en})
        answer_en = result["result"]
        answer = translate(answer_en, "en", lang) if lang != "en" else answer_en

        # 3) List the pages of the chunks the answer was generated from
        source_pages = sorted({doc.metadata["page"] for doc in result["source_documents"]})
        st.markdown(f"**Answer:** {answer}")
        st.markdown("**Pages:**")
        for p in source_pages:
            st.write(f"- Page {p}")

        logging.info(f"Q: {q} | A:{answer} | Pages:{source_pages}")

Overwriting app.py


In [None]:
# Install the localtunnel npm package (run later through `npx localtunnel`).
!npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K
up to date, audited 23 packages in 1s
[1G[0K⠙[1G[0K
[1G[0K⠙[1G[0K3 packages are looking for funding
[1G[0K⠙[1G[0K  run `npm fund` for details
[1G[0K⠙[1G[0K
2 [31m[1mhigh[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[1G[0K⠙[1G[0K

In [1]:
# Fetch and print the response from loca.lt's "mytunnelpassword" endpoint —
# presumably the password the tunnel's landing page asks for (TODO confirm).
! curl https://loca.lt/mytunnelpassword

In [None]:
# Start the Streamlit app in the background (output redirected to
# /content/logs.txt), then run localtunnel in the foreground against port 8501;
# the tunnel prints the public URL. The port must match the one Streamlit serves on.
!streamlit run app.py &>/content/logs.txt & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹[1G[0Kyour url is: https://light-parks-pull.loca.lt
