In [None]:
# --- STEP 1: ENVIRONMENT SETUP ---
# This cell installs all required libraries for the project.
# - langchain & components: For managing the RAG pipeline.
# - transformers, accelerate, peft, bitsandbytes: For loading and running the quantized LLM.
# - faiss-cpu: For the vector database.
# - arxiv & pypdf: For fetching and processing research papers


!pip install -q \
langchain==0.1.20 \
langchain-community==0.0.38 \
langchain-core==0.1.52 \
transformers accelerate peft bitsandbytes \
sentence-transformers faiss-cpu pypdf arxiv


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m114.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.6/329.6 kB[0m [31m32.7 MB/s[0m eta [36m0:00

In [None]:
# --- STEP 2: IMPORT DEPENDENCIES ---
# Importing necessary modules from LangChain to handle:
# - RetrievalQA: The chain that connects the LLM to the database.
# - Embeddings & Vectorstores: To turn text into math and search it.

from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader

print("LangChain imports OK ✅")


LangChain imports OK ✅


In [None]:
# --- STEP 3: EXTRACT FINE-TUNED ADAPTERS ---
# We extract the 'my_pirate_adapters.zip' file. 
# This contains the specific "LoRA" weights we trained to give the model its unique personality.

import zipfile

with zipfile.ZipFile("my_pirate_adapters.zip") as z:
    z.extractall("my_pirate_adapters")

print("Adapters extracted ✅")


Adapters extracted ✅


In [None]:
# --- STEP 4: LOAD LLM WITH ADAPTERS ---
# 1. Load the base Mistral-7B model using 4-bit quantization (to fit in memory).
# 2. Load our custom "Pirate" adapters using PeftModel.
# 3. Create a text-generation pipeline with specific parameters (temperature=0.2 for accuracy).

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

base_model = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_4bit=True,
    device_map="auto"
)

model = PeftModel.from_pretrained(model, "my_pirate_adapters")
model.eval()

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.2
)

print("Fine-tuned LLM loaded ✅")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


Fine-tuned LLM loaded ✅


In [None]:
# --- STEP 5: DYNAMIC DATA COLLECTION ---
# We use the arXiv API to search for the 20 most recent papers 
# in the 'Computation and Language' (cs.CL) category and download them.

import arxiv

search = arxiv.Search(
    query="cat:cs.CL",
    max_results=20,
    sort_by=arxiv.SortCriterion.SubmittedDate
)

pdfs = [r.download_pdf() for r in search.results()]
print("Downloaded", len(pdfs), "papers ✅")


  pdfs = [r.download_pdf() for r in search.results()]


Downloaded 20 papers ✅


In [None]:
# --- STEP 6: BUILD VECTOR DATABASE ---
# 1. Initialize the embedding model (MiniLM).
# 2. Safely encode each text chunk into a vector (handling potential errors).
# 3. Create a FAISS vector store to enable fast semantic search.


from sentence_transformers import SentenceTransformer
import numpy as np
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import FakeEmbeddings

# 1️⃣ Load embedding model
embedder = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device="cuda"  # agar GPU na ho to "cpu"
)

good_texts = []
good_embeddings = []
good_metadatas = []

print("Encoding texts safely...")

# 2️⃣ SAFE per-text encoding (bulletproof)
for i, (text, meta) in enumerate(zip(texts, metadatas)):
    try:
        emb = embedder.encode(
            text,
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        good_texts.append(text)
        good_embeddings.append(emb)
        good_metadatas.append(meta)
    except Exception as e:
        print(f"⚠️ Skipped text {i}: {type(e).__name__}")

good_embeddings = np.vstack(good_embeddings)

print("Original texts:", len(texts))
print("Valid texts:", len(good_texts))
print("Embeddings shape:", good_embeddings.shape)

# 3️⃣ FAISS (LangChain-compatible way)
fake_embeddings = FakeEmbeddings(size=good_embeddings.shape[1])

db = FAISS.from_texts(
    texts=good_texts,
    embedding=fake_embeddings,
    metadatas=good_metadatas
)

# 4️⃣ REAL embeddings inject karo
db.index.reset()
db.index.add(good_embeddings)

print("✅ Vector DB ready (FINAL & CORRECT)")


Encoding texts safely...
⚠️ Skipped text 1598: TypeError
Original texts: 1852
Valid texts: 1851
Embeddings shape: (1851, 384)
✅ Vector DB ready (FINAL & CORRECT)


In [None]:
# --- STEP 7: RUN STANDARD RAG QUERY ---
# We set up the RetrievalQA chain to:
# 1. Retrieve the top 4 relevant chunks from FAISS.
# 2. Pass them to our fine-tuned LLM to generate an answer.

from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

llm = HuggingFacePipeline(pipeline=pipe)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 4}),
    chain_type="stuff"
)

query = "What recent transformer-based approaches are used in low-resource NLP?"
answer = qa.run(query)

print("ANSWER:\n")
print(answer)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


ANSWER:

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

PREPRINT
• Case λ= 0 :The weights become uniform with πm(0) = 1/M . The output distribution is
consistent with the original model. Given the Entropy Drift phenomenon, we have µcPr > µPr
whenλ= 0.
• Case λ→ ∞:The retaining probability of resampling steps concentrates on the particle with the
minimum entropy, that is,
lim
λ→∞
πm(λ) =
1,ifH m = mini Hi ,
0,otherwise.
This corresponds to Model Collapse, wherelim λ→∞ µcPr (λ)< µPr.
The function µcPr(λ) is continuous and strictly monotonically decreasing on the domain [0,∞) . The
reference entropy µPr lies strictly between the boundary values limλ→∞ µcPr(λ) and µcPr(0). By the
Intermediate Value Theorem, there exists aλ∗ such that µcPr(λ∗) =µ Pr. Furthermore, due to the
strict monotonicity of the function, thisλ ∗ is unique.
Remark.While Theorem 2 establishes λ as a prim

In [None]:
# --- STEP 8: DEFINE CITATION PROMPT ---
# We create a custom PromptTemplate that strictly instructs the model 
# to act as a research assistant and cite sources for every claim.


from langchain.prompts import PromptTemplate

CITATION_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a research assistant.

Answer the question ONLY using the provided context.
For every important claim, cite the source paper
using title or author from metadata in brackets.

Context:
{context}

Question:
{question}

Answer (with citations):
"""
)


In [None]:
# --- STEP 9: INITIALIZE CITATION CHAIN ---
# We re-initialize the RetrievalQA chain, this time passing our 
# custom CITATION_PROMPT to enforce academic rigor.



from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

llm = HuggingFacePipeline(pipeline=pipe)

qa_citation = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 5}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": CITATION_PROMPT},
    return_source_documents=True
)


In [None]:
# --- STEP 10: GENERATE CITED ANSWER ---
# We run the query through the citation chain and print the result.

query = "What recent transformer-based approaches are used in low-resource NLP?"

result = qa_citation(query)

print("ANSWER (WITH CITATIONS):\n")
print(result["result"])


  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


ANSWER (WITH CITATIONS):


You are a research assistant.

Answer the question ONLY using the provided context.
For every important claim, cite the source paper
using title or author from metadata in brackets.

Context:
2. Analyze the resulting dimensions: verify orthogonality, identify potential merges,
and justify the final structure.
3. Provide the final taxonomy as a list of dimension labels only.
Prompt Template 2: Product Attribute Taxonomy Construction
Task:You are designing a product category taxonomy for an e-commerce platform.
Given the category name and a collection of merchant-provided product attributes
(key-value specifications), construct a structured attribute taxonomy by clustering se-
mantically related attributes into orthogonal dimensions from a product categorization
perspective.Procedure:
1. Iterate through product attributes sequentially. For each attribute, analyze whether
it describes the same underlying product aspect as any existing dimension based
on semantic

In [None]:
# --- STEP 11: VERIFY RETRIEVED SOURCES ---
# We iterate through the source documents retrieved by the system 
# to verify exactly which papers were used to generate the answer.


print("\nSOURCES USED:\n")

for i, doc in enumerate(result["source_documents"]):
    meta = doc.metadata
    print(
        f"[{i+1}]",
        meta.get("title", "No title"),
        "|",
        meta.get("source", "")
    )



SOURCES USED:

[1] No title | ./2512.21257v1.ReaSeq__Unleashing_World_Knowledge_via_Reasoning_for_Sequential_Modeling.pdf
[2] No title | ./2512.21257v1.ReaSeq__Unleashing_World_Knowledge_via_Reasoning_for_Sequential_Modeling.pdf
[3] No title | ./2512.20949v1.Neural_Probe_Based_Hallucination_Detection_for_Large_Language_Models.pdf
[4] No title | ./2512.20950v1.MultiMind_at_SemEval_2025_Task_7__Crosslingual_Fact_Checked_Claim_Retrieval_via_Multi_Source_Alignment.pdf
[5] No title | ./2512.21107v1.Semi_Supervised_Learning_for_Large_Language_Models_Safety_and_Content_Moderation.pdf
