# Quickstart: download the prebuilt Wiki RAG FAISS index from Hugging Face, load it, and run a similarity search

In [30]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Download RAG
from huggingface_hub import snapshot_download
from pathlib import Path
# Current working directory of the kernel (not referenced in this cell).
CWD = Path.cwd()

# Hugging Face repo that hosts the prebuilt index, and the local folder
# (relative to the working directory) that receives the download.
repo_id = "royrin/wiki-rag"
wiki_rag_data_dir = Path("wiki_rag_data")
wiki_rag_data_dir.mkdir(parents=True, exist_ok=True)

# Name of the index sub-folder inside the repo.
faiss_name = "wiki_index__top_100000__2025-04-11"

# Download only the files under the index sub-folder.
# `local_dir_use_symlinks` is intentionally not passed: huggingface_hub has
# deprecated it and ignores it (passing it only triggers a warning), because
# downloads into `local_dir` no longer rely on symlinks.
local_dir = snapshot_download(
    repo_id=repo_id,
    repo_type="model",
    allow_patterns=[f"{faiss_name}/**"],
    local_dir=wiki_rag_data_dir,
)

print(f"Downloaded to {local_dir}")

# Folder containing the index files; this is what gets handed to FAISS.
faiss_path = Path(local_dir) / faiss_name

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 2567.68it/s]

Downloaded to /Users/roy/code/research/unlearning/data_to_concept_unlearning/wiki-rag/notebooks/wiki_rag_data
local_dir: /Users/roy/code/research/unlearning/data_to_concept_unlearning/wiki-rag/notebooks/wiki_rag_data





In [26]:
class PromptedBGE(HuggingFaceEmbeddings):
    """HuggingFaceEmbeddings variant that prefixes every input with a retrieval prompt.

    Documents and queries receive different instruction prefixes before being
    passed to the parent class for embedding.
    """

    def embed_documents(self, texts):
        """Embed each item of `texts` after prepending the document prompt."""
        prompted_docs = []
        for doc in texts:
            prompted_docs.append(f"Represent this document for retrieval: {doc}")
        return super().embed_documents(prompted_docs)

    def embed_query(self, text):
        """Embed `text` after prepending the query prompt."""
        prompted_query = f"Represent this query for retrieval: {text}"
        return super().embed_query(prompted_query)
# Alternative without the prompt prefixes, using the larger model:
# BAAI_embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")

# Embedding object used to load and query the FAISS index.
# NOTE(review): presumably the model and prompts must match the ones the
# index was built with -- confirm before switching to bge-large-en.
BAAI_embedding = PromptedBGE(model_name="BAAI/bge-base-en")  # or bge-large-en

In [34]:
# Load the FAISS index from `faiss_path`, using `BAAI_embedding` to embed queries.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the
# stored index data, and unpickling can execute arbitrary code. Only load
# index files from a source you trust.
vectorstore = FAISS.load_local(faiss_path, BAAI_embedding,
                                   allow_dangerous_deserialization=True)

# example RAG query

In [39]:
# Query text and the number of nearest documents to retrieve.
user_query = "Biology"
top_k = 10

# similarity_search returns at most `top_k` documents, so the result limit is
# defined in one place and no extra slicing of the response is needed.
response = vectorstore.similarity_search(user_query, k=top_k)

# Print a ranked list: the title plus the first 50 characters of the content.
for rank, doc in enumerate(response, start=1):
    title = doc.metadata["title"]
    snippet = doc.page_content[:50]
    print(f"{rank}. {title}\n\t{snippet}...\n")
 

1. Biology
	Biology is the scientific study of life. It is a n...

2. DAVID
	DAVID (the database for annotation, visualization ...

3. Protein
	Proteins are large biomolecules and macromolecules...

4. PANTHER
	In bioinformatics, the PANTHER (protein analysis t...

5. Metabolism
	Metabolism (, from "metabolē", "change") is the se...

6. Carbohydrate
	A carbohydrate () is a biomolecule composed of car...

7. Taxonomy (biology)
	In biology, taxonomy () is the scientific study of...

8. DNA and RNA codon tables
	A codon table can be used to translate a genetic c...

9. Anatomy
	Anatomy () is the branch of morphology concerned w...

10. Collagen
	Collagen () is the main structural protein in the ...

