<a href="https://colab.research.google.com/github/SadeghMahmoudAbadi/Open-Source-LLM-on-Colab/blob/main/6-RAG/ingest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install litellm
!pip install chromadb
!pip install ollama
!pip install sentence-transformers

In [44]:
import os
from google.colab import userdata
from google.colab import drive
from pathlib import Path
from pydantic import BaseModel, Field
from chromadb import PersistentClient
from tqdm import tqdm
from litellm import completion
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from ollama import Client
import pickle
from sentence_transformers import SentenceTransformer


In [45]:
# Copy the OpenRouter API key from Colab's secret store into the process environment
# (presumably where litellm looks it up for "openrouter/..." models — confirm against litellm docs).
os.environ["OPENROUTER_API_KEY"] = userdata.get("OPENROUTER_API_KEY")

# Minimal single-turn conversation used by the connectivity check in the next cell.
messages = [{"role": "user", "content": "Tell a light-hearted joke."}]

In [47]:
# Smoke test: send one prompt through litellm to confirm the API key and model route work.
# NOTE(review): this model id is repeated in the MODEL constant of the configuration cell
# further down; keep the two in sync.
response = completion(
    model="openrouter/x-ai/grok-4.1-fast",
    messages=messages
)

# Show only the text of the first returned choice.
print(response.choices[0].message.content)

Why did the scarecrow win an award? Because he was outstanding in his field! 😄


In [9]:
# Mount Google Drive; every data path configured below lives under /content/drive/MyDrive.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [48]:
# --- Notebook configuration -------------------------------------------------
# Directory of the persistent Chroma store (passed to PersistentClient(path=...)).
DB_NAME = "/content/drive/MyDrive/datasets/preprocessed_db"
# Name of the Chroma collection that create_embeddings() rebuilds.
collection_name = "docs"
# sentence-transformers model id used to embed each chunk.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# Root of the knowledge base; each entry directly below it is treated as a document type.
KNOWLEDGE_BASE_PATH = Path("/content/drive/MyDrive/datasets/knowledge-base")
# Target chunk length in characters; make_prompt() divides the document length by this
# to suggest a chunk count to the LLM.
AVERAGE_CHUNK_SIZE = 500
# litellm model id used by process_document() to split documents into chunks.
MODEL="openrouter/x-ai/grok-4.1-fast"

In [49]:
class Result(BaseModel):
    # Retrieval-ready record: the text that gets embedded and stored, plus its provenance.
    page_content: str  # headline, summary and original text joined by blank lines (built in Chunk.as_result)
    metadata: dict  # Chunk.as_result fills this with the "source" and "type" of the originating document

In [50]:
class Chunk(BaseModel):
    # One LLM-produced slice of a source document: a short headline, a summary and the
    # verbatim text it covers.
    # NOTE: documented with comments rather than a class docstring on purpose. This model is
    # nested in Chunks, which is sent to the LLM as the response schema, and pydantic copies
    # a model's docstring into the generated JSON schema.
    headline: str = Field(description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query")
    summary: str = Field(description="A few sentences summarizing the content of this chunk to answer common questions")
    original_text: str = Field(description="The original text of this chunk from the provided document, exactly as is, not changed in any way")

    def as_result(self, document):
        """Convert this chunk into a Result tagged with its document's provenance.

        Args:
            document: mapping with at least the keys "source" and "type".

        Returns:
            Result whose page_content is headline, summary and original text separated
            by blank lines, and whose metadata carries the document's source and type.
        """
        provenance = {"source": document["source"], "type": document["type"]}
        sections = (self.headline, self.summary, self.original_text)
        return Result(page_content="\n\n".join(sections), metadata=provenance)


class Chunks(BaseModel):
    # Top-level shape of the LLM reply: process_document() passes this class to completion()
    # as response_format and validates the returned JSON against it.
    # (Comment instead of a docstring: pydantic would add a docstring to the schema the LLM sees.)
    chunks: list[Chunk]

In [51]:
def fetch_documents():
    """Load every Markdown file under KNOWLEDGE_BASE_PATH as a document dict.

    A homemade version of the LangChain DirectoryLoader. Each directory directly
    below KNOWLEDGE_BASE_PATH is a document type; all ``*.md`` files found
    anywhere inside it are read as UTF-8 text.

    Returns:
        list[dict]: one dict per file with the keys
            "type"   - name of the top-level folder the file was found under,
            "source" - POSIX-style path of the file,
            "text"   - full file contents.
        The list is ordered by folder path, then file path.
    """
    documents = []

    # Sort both levels: iterdir()/rglob() yield entries in filesystem order, which can
    # differ between runs. Downstream ids are assigned by list position, so a stable
    # order keeps them reproducible.
    for folder in sorted(KNOWLEDGE_BASE_PATH.iterdir()):
        # Only directories define a document type; ignore stray files at the root.
        if not folder.is_dir():
            continue
        doc_type = folder.name
        for file in sorted(folder.rglob("*.md")):
            # The glob can also match a directory whose name ends in ".md".
            if not file.is_file():
                continue
            documents.append({"type": doc_type, "source": file.as_posix(), "text": file.read_text(encoding="utf-8")})

    print(f"Loaded {len(documents)} documents")
    return documents

In [52]:
# Load the whole knowledge base into memory; each entry is a dict with "type", "source" and "text".
documents = fetch_documents()

Loaded 76 documents


In [53]:
def make_prompt(document):
    """Build the instruction prompt asking the LLM to chunk one document.

    Args:
        document: mapping with the keys "text", "type" and "source".

    Returns:
        str: the prompt, with the document's type, source and full text embedded and a
        suggested chunk count of one per AVERAGE_CHUNK_SIZE characters, plus one.
    """
    body = document["text"]
    # Integer division plus one, so even an empty document suggests a single chunk.
    suggested_count = len(body) // AVERAGE_CHUNK_SIZE + 1
    doc_type = document["type"]
    origin = document["source"]
    return f"""
You take a document and you split the document into overlapping chunks for a KnowledgeBase.

The document is from the shared drive of a company called Insurellm.
The document is of type: {doc_type}
The document has been retrieved from: {origin}

A chatbot will use these chunks to answer questions about the company.
You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
This document should probably be split into {suggested_count} chunks, but you can have more or less as appropriate.
There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.

For each chunk, you should provide a headline, a summary, and the original text of the chunk.
Together your chunks should represent the entire document with overlap.

Here is the document:

{body}

Respond with the chunks.
"""

In [54]:
# print(make_prompt(documents[0]))

In [55]:
def make_messages(document):
    """Wrap the chunking prompt for `document` in a one-element chat message list.

    Returns:
        list[dict]: a single message with role "user" whose content is make_prompt(document).
    """
    user_turn = {"role": "user", "content": make_prompt(document)}
    return [user_turn]

In [56]:
# make_messages(documents[0])

In [57]:
def process_document(document):
    """Ask the LLM to split one document into chunks and return them as results.

    Sends the chunking prompt for `document` to MODEL, requesting a reply shaped like
    Chunks, validates the returned JSON against that model and converts every chunk
    with its as_result() method.

    Args:
        document: mapping describing the document (forwarded to make_messages and as_result).

    Returns:
        list: one converted result per chunk, in the order the LLM returned them.
    """
    chat_messages = make_messages(document)
    llm_response = completion(
        model=MODEL,
        messages=chat_messages,
        response_format=Chunks,
    )
    # The structured reply arrives as a JSON string in the first choice.
    raw_json = llm_response.choices[0].message.content
    parsed = Chunks.model_validate_json(raw_json)

    results = []
    for piece in parsed.chunks:
        results.append(piece.as_result(document))
    return results

In [58]:
# process_document(documents[0])

In [59]:
def create_chunks(documents):
    """Chunk every document and return all results as one flat list.

    Documents are processed one after another (with a tqdm progress bar) through
    process_document(); the per-document lists are concatenated in input order.
    """
    return [
        result
        for doc in tqdm(documents)
        for result in process_document(doc)
    ]

In [60]:
# Pickle file on Drive holding the list of chunks, so the LLM chunking step need not be repeated.
file_path = '/content/drive/MyDrive/datasets/chunk_list.pkl'

In [61]:
# chunks = create_chunks(documents)

# with open(file_path, 'wb') as f:
#         pickle.dump(chunks, f)

In [25]:
# Reuse the cached chunk list when it exists; otherwise build it with the LLM (slow, and it
# makes one billed API call per document) and cache it, so the notebook still runs
# top-to-bottom when the pickle file is not there yet.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a cache you
# created yourself. Unpickling also needs the classes of the stored objects to be defined
# in this session first.
if os.path.exists(file_path):
    with open(file_path, 'rb') as f:
        chunks = pickle.load(f)
else:
    print(f"No chunk cache at {file_path}; generating chunks with the LLM")
    chunks = create_chunks(documents)
    with open(file_path, 'wb') as f:
        pickle.dump(chunks, f)

len(chunks)

653

In [62]:
def create_embeddings(chunks):
    """Embed `chunks` and rebuild the persistent Chroma collection from them.

    Each chunk's page_content is embedded with the sentence-transformers model named by
    `embedding_model`. The collection `collection_name` in the store at `DB_NAME` is then
    dropped (if present) and recreated with the new vectors, texts and metadata. Ids are
    the chunks' list positions as strings ("0", "1", ...).

    Args:
        chunks: sequence of objects exposing `page_content` and `metadata`.

    Raises:
        ValueError: if `chunks` is empty. The existing collection is left untouched.
    """
    if len(chunks) == 0:
        raise ValueError("create_embeddings() needs at least one chunk; the existing collection was left untouched")

    texts = [chunk.page_content for chunk in chunks]
    metas = [chunk.metadata for chunk in chunks]
    ids = [str(position) for position in range(len(chunks))]

    # Embed BEFORE touching the store: if loading the model or encoding fails, the
    # previous collection is still intact instead of already having been deleted.
    vectors = list(SentenceTransformer(embedding_model).encode(texts))

    chroma = PersistentClient(path=DB_NAME)
    # NOTE(review): list_collections() appears to return Collection objects in most chromadb
    # releases but plain names in some (0.6.x, to be confirmed); accept both.
    existing_names = [c if isinstance(c, str) else c.name for c in chroma.list_collections()]
    if collection_name in existing_names:
        chroma.delete_collection(collection_name)

    collection = chroma.get_or_create_collection(collection_name)
    collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
    print(f"Vectorstore created with {collection.count()} documents")

In [43]:
# Embed every chunk and (re)build the persistent Chroma collection.
create_embeddings(chunks)

Vectorstore created with 653 documents
