<a href="https://colab.research.google.com/github/Saladdine-MW/tp-rag-student-version/blob/main/TP_RAG_Saladdine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/Saladdine-MW/tp-rag-student-version.git

Cloning into 'tp-rag-student-version'...
remote: Enumerating objects: 78, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 78 (delta 0), reused 0 (delta 0), pack-reused 75 (from 1)[K
Receiving objects: 100% (78/78), 93.16 MiB | 15.19 MiB/s, done.


In [2]:
pip install langchain chromadb sentence-transformers pypdf

Collecting chromadb
  Downloading chromadb-0.6.2-py3-none-any.whl.metadata (6.8 kB)
Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.5-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (f

# Indexing: loading, splitting and embedding

In [4]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.25.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-no

In [10]:
import os
from langchain.document_loaders import PyPDFLoader, TextLoader, DirectoryLoader
from langchain.document_loaders.base import BaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Folder holding the source documents; it sits inside the repository cloned
# into 'tp-rag-student-version' by the first cell.
data_dir = "tp-rag-student-version/data/"
# Directory handed to Chroma as persist_directory, both when the index is built
# and when the store is reopened for querying.
vector_store_dir = "chroma_db"

def select_loader(file_path):
    """Build the document loader that matches a file's extension.

    The extension is compared case-insensitively; the path handed to the
    loader keeps its original casing.

    Args:
        file_path: Path of the file to load.

    Returns:
        A ``PyPDFLoader`` for ``.pdf`` files, or a ``TextLoader`` for
        ``.txt``, ``.tex`` and ``.bib`` files.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    extension = os.path.splitext(file_path.lower())[1]
    if extension == '.pdf':
        return PyPDFLoader(file_path)
    if extension in ('.txt', '.tex', '.bib'):
        return TextLoader(file_path)
    raise ValueError(f"Unsupported file type: {extension}")


def load_documents(directory):
    """Load the files found under ``directory`` through a ``DirectoryLoader``.

    Files are selected with the glob pattern ``**/*.*`` and ``select_loader``
    is supplied as the loader factory, so the loader used for each file
    depends on its extension.

    Args:
        directory: Root folder to scan.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that folder.
    """
    directory_loader = DirectoryLoader(directory, glob="**/*.*", loader_cls=select_loader)
    documents = directory_loader.load()
    return documents

# Splitting
def split_documents(documents, chunk_size=500, chunk_overlap=50):
    """Split documents into overlapping chunks.

    Args:
        documents: Documents to split, e.g. the result of ``load_documents``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            50, the value that used to be hard-coded.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

# Create embeddings and store in ChromaDB
def index_documents(paragraphs, vector_store_path):
    """Embed the given chunks and store them in a Chroma vector store.

    Args:
        paragraphs: Document chunks to index.
        vector_store_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The Chroma vector store built from ``paragraphs``.
    """
    embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
    store = Chroma.from_documents(
        paragraphs,
        embedding=embedding_model,
        persist_directory=vector_store_path,
    )
    # NOTE(review): recent Chroma releases are said to persist automatically,
    # which would make this call redundant - confirm for the installed version.
    store.persist()
    return store

# Indexing
def main_indexing():
    """Run the indexing pipeline: load, split, then index the documents.

    Reads the documents under ``data_dir``, splits them, and indexes the
    resulting chunks into the store at ``vector_store_dir``, printing a
    progress line for each stage.

    Returns:
        The vector store returned by ``index_documents``.
    """
    print("Loading documents...")
    loaded_docs = load_documents(data_dir)
    print(f"Loaded {len(loaded_docs)} documents.")

    print("Splitting documents into paragraphs...")
    chunks = split_documents(loaded_docs)
    print(f"Split into {len(chunks)} paragraphs.")

    print("Indexing documents in ChromaDB...")
    store = index_documents(chunks, vector_store_dir)
    print("Indexing complete.")
    return store

# Run the indexing
# The querying and RAG functions below reopen the store from vector_store_dir
# themselves rather than reading this variable.
vector_store = main_indexing()


Loading documents...
Loaded 600 documents.
Splitting documents into paragraphs...
Split into 16889 paragraphs.
Indexing documents in ChromaDB...
Indexing complete.


## Interrogation

In [14]:
# Query
def query_vector_store(query, vector_store, k=5):
    """Return the ``k`` best matches for ``query`` together with their scores.

    Args:
        query: Text to search for.
        vector_store: Store exposing ``similarity_search_with_score``.
        k: Number of results requested (default 5).

    Returns:
        The value returned by ``vector_store.similarity_search_with_score``.
    """
    return vector_store.similarity_search_with_score(query, k=k)

# Test
def main_querying():
    """Reopen the persisted store and print the top matches for a sample query.

    The store is reloaded from ``vector_store_dir`` with the same embedding
    model name used at indexing time. Each result is printed with its rank,
    its text and its score.
    """
    print("Loading vector store...")
    embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
    vector_store = Chroma(persist_directory=vector_store_dir, embedding_function=embedding_model)

    query = "Explain the main contributions of quantum mechanics to physics."
    print(f"Query: {query}")

    print("Searching for relevant documents...")
    hits = query_vector_store(query, vector_store)
    # NOTE(review): in the recorded output the scores grow with the rank, which
    # suggests a distance (lower = closer) rather than a similarity - confirm.
    for rank, (doc, score) in enumerate(hits, start=1):
        print(f"\nResult {rank}:")
        print(f"Document: {doc.page_content}")
        print(f"Score: {score}")

# Run the retrieval check against the persisted vector store.
main_querying()


Loading vector store...
Query: Explain the main contributions of quantum mechanics to physics.
Searching for relevant documents...

Result 1:
Document: \subsection{Equivalence between quantum learning models}
Score: 0.3211580216884613

Result 2:
Document: has interesting consequences in quantum computing.
Score: 0.32325515151023865

Result 3:
Document: \subsection{Gibbs states of local Hamiltonians}
Score: 0.3354974091053009

Result 4:
Document: \textbf{Acknowledgements.} We thank Matthias Caro and  the anonymous reviews of Nature Reviews Physics for several comments improving the presentation of this work and Abhinav Deshpande for useful comments. We thank Iulia Georgescu for commissioning this survey for the Nature Reviews~Physics. AA acknowledges support through the NSF CAREER Award No. 2238836 and NSF award QCIS-FF: Quantum Computing \& Information Science Faculty Fellow at Harvard University (NSF 2013303).
Score: 0.342695415019989

Result 5:
Document: \subsection{Matrix product st

# RAG

In [None]:
from langchain.prompts import PromptTemplate

# Prompt handed to the RAG chain. {context} and {question} are the two
# placeholders declared in input_variables; the template text ends on
# "Answer:".
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are a knowledgeable assistant. Use the context below to answer the question.\n\n"
        "Context:\n{context}\n\n"
        "Question:\n{question}\n\n"
        "Answer:"
    )
)


In [None]:
%xterm
curl https://ollama.ai/install.sh | sh
ollama serve &
ollama run qwen2.5:14b

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOllama

# Load the LLM
# create_rag_chain reads this module-level `llm` directly.
# NOTE(review): presumably requires the Ollama server from the setup cell to be
# running with qwen2.5:14b available - confirm before running.
llm = ChatOllama(model="qwen2.5:14b", temperature=0)

# Create the RAG chain
def create_rag_chain(vector_store, prompt_template):
    """Assemble a ``RetrievalQA`` chain over ``vector_store``.

    The chain uses the module-level ``llm``, the "stuff" chain type, the
    store's default retriever, and ``prompt_template`` as its prompt.

    Args:
        vector_store: Store providing ``as_retriever()``.
        prompt_template: Prompt forwarded through ``chain_type_kwargs``.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type``.
    """
    document_retriever = vector_store.as_retriever()
    prompt_options = {"prompt": prompt_template}
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=document_retriever,
        chain_type_kwargs=prompt_options,
    )

# Use the RAG chain to answer questions
def main_rag():
    """Answer a sample question with the RAG chain and print the answer.

    Reopens the persisted store from ``vector_store_dir``, builds the chain
    with ``create_rag_chain`` and the module-level ``prompt_template``, then
    asks one hard-coded question.
    """
    print("Loading vector store...")
    vector_store = Chroma(persist_directory=vector_store_dir, embedding_function=HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base"))

    print("Creating RAG chain...")
    chain = create_rag_chain(vector_store, prompt_template)

    question = "What are the benefits of renewable energy?"
    print(f"Question: {question}")

    print("Getting answer from RAG...")
    # Chain.run is deprecated; invoke takes the chain's input mapping
    # (RetrievalQA reads "query") and returns a mapping whose "result" entry
    # holds the generated answer.
    answer = chain.invoke({"query": question})["result"]
    print(f"Answer: {answer}")

# Run the end-to-end RAG example.
main_rag()
