# Simple RAG Example with Weaviate and LangChain

## WSL and Shell Command Helpers

In [1]:
import platform
import subprocess
import os

# --- WSL Detection ---
system = platform.system()
USE_WSL = system == "Windows"
print(f"Operating System: {system}. Using WSL for Docker commands: {USE_WSL}")

# --- Shell Command Helpers ---
def run_wsl_command(command):
    """Executes a command inside WSL and returns the result."""
    result = subprocess.run(
        ["wsl", "-e", "bash", "-l", "-c", command],
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace"
    )
    return {
        "returncode": result.returncode,
        "stdout": result.stdout.strip(),
        "stderr": result.stderr.strip(),
        "success": result.returncode == 0
    }

def run_linux_command(command):
    """Executes a command in a standard Linux/macOS shell."""
    result = subprocess.run(
        command,
        shell=True,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace"
    )
    return {
        "returncode": result.returncode,
        "stdout": result.stdout.strip(),
        "stderr": result.stderr.strip(),
        "success": result.returncode == 0
    }

def run_shell_command(command):
    """Universal function to run a shell command, abstracting WSL usage."""
    if USE_WSL:
        return run_wsl_command(command)
    else:
        return run_linux_command(command)

print("✅ Shell command helpers are defined.")

Operating System: Windows. Using WSL for Docker commands: True
✅ Shell command helpers are defined.


## Configuration

In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
HF_API_TOKEN = os.environ["HUGGINGFACE_API_TOKEN"]

# Embeddingd model for local run.
# If you have access to Gemma (you logged in via huggingface-cli), use: "google/embeddinggemma-300m" (768 dimensions)
# If you don't have access or encounter errors, use the standard one: "all-MiniLM-L6-v2" (384 dimensions)
LOCAL_EMBEDDING_MODEL_NAME = "google/embeddinggemma-300m"
# LOCAL_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" # Uncomment if Gemma doesn't work


# Text generation model for local run.
LOCAL_LLM_MODEL_NAME = "google/gemma-3-1b-it"


# --- VECTOR DATABASE CONFIGURATION ---
WEAVIATE_CONTAINER_NAME = "simple-rag-weaviate"
WEAVIATE_IMAGE = "semitechnologies/weaviate:1.33.7"
WEAVIATE_HTTP_PORT = 8080
WEAVIATE_GRPC_PORT = 50051

print("✅ Configuration loaded.")

✅ Configuration loaded.


In [2]:
from huggingface_hub import login

login(token=HF_API_TOKEN)
print("Successfully logged in to Hugging Face!")

  from .autonotebook import tqdm as notebook_tqdm


Successfully logged in to Hugging Face!


## Data Generation

In [3]:
from llama_index.readers.file import PDFReader

pdf_paths = [
    "C:\\Users\\tomir\\Desktop\\EPAM\\epam_train\\Module 4\\data\\toddler_AAP.pdf",
    "C:\\Users\\tomir\\Desktop\\EPAM\\epam_train\\Module 4\\data\\TheultimatelistofMontessoriactivitiesforbabiestoddlersandpreschoolers.pdf"
]

reader = PDFReader()
documents = []

for path in pdf_paths:
    docs = reader.load_data(file=path)
    for d in docs:
        d.metadata["source_file"] = os.path.basename(path)
    documents.extend(docs)

print(f"Loaded {len(documents)} pages")

Loaded 155 pages


In [7]:
documents

[Document(id_='ae58c5a7-a4d9-404e-9e76-023f7a254d0c', embedding=None, metadata={'page_label': '1', 'file_name': 'TheultimatelistofMontessoriactivitiesforbabiestoddlersandpreschoolers.pdf', 'source_file': 'TheultimatelistofMontessoriactivitiesforbabiestoddlersandpreschoolers.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='The ultimate list \nof Montessori activities \nfor babies, toddlers and \npreschoolers \nby \nThe Montessori  \nNotebook', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'),
 Document(id_='0aabf162-a05c-43c4-a902-08685eeabce8', embedding=None, metadata={'page_label': '2', 'file_name': 'TheultimatelistofMontessoriactivitiesforbabiestoddlersandpreschoolers.pdf', 'source_file': 'TheultimatelistofMontessoriactivitiesfo

## Embeddings and Data Ingestion

In [4]:
from langchain_core.messages import AIMessage
from langchain_core.runnables import Runnable, RunnableConfig
import weaviate
import weaviate.classes as wvc
from weaviate.util import generate_uuid5
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch
import numpy as np

# --- Wrapper class for the local Embeddings model ---
class LocalHuggingFaceEmbeddings:
    """
    This class adapts a local SentenceTransformer model
    to the LangChain interface, which expects the methods embed_documents and embed_query.
    """
    def __init__(self, model_name):
        print(f"📥 Loading local embedding model: {model_name}...")
        try:
            self.model = SentenceTransformer(model_name)
            print("✅ Local embedding model loaded successfully.")
        except Exception as e:
            print(f"❌ Error loading {model_name}. Falling back to 'all-MiniLM-L6-v2'.")
            print(f"Error details: {e}")
            self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def embed_documents(self, texts):
        # Returns a list of lists
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        return embeddings.tolist()

    def embed_query(self, text):
        # Returns a single list
        embedding = self.model.encode(text, convert_to_numpy=True)
        return embedding.tolist()

In [5]:
# --- 1. Setup LangChain Clients ---
print("--- 1. Setting up AI clients ---")
try:
    # Embedding Model Setup
    embeddings_model = LocalHuggingFaceEmbeddings(LOCAL_EMBEDDING_MODEL_NAME)

    # # Chat Model Setup
    # chat_model = LocalHuggingFaceChatModel(LOCAL_LLM_MODEL_NAME)
    # print("✅ AI clients initialized.")

except Exception as e:
    print(f"❌ Failed to initialize AI clients. Please check your .env file or model names. Error: {e}")
    # Stop execution if clients fail to initialize
    raise

2025-12-21 18:52:44,242 - INFO - Use pytorch device_name: cpu
2025-12-21 18:52:44,242 - INFO - Load pretrained SentenceTransformer: google/embeddinggemma-300m


--- 1. Setting up AI clients ---
📥 Loading local embedding model: google/embeddinggemma-300m...


2025-12-21 18:52:53,965 - INFO - 14 prompts are loaded, with the keys: ['query', 'document', 'BitextMining', 'Clustering', 'Classification', 'InstructionRetrieval', 'MultilabelClassification', 'PairClassification', 'Reranking', 'Retrieval', 'Retrieval-query', 'Retrieval-document', 'STS', 'Summarization']


✅ Local embedding model loaded successfully.


In [6]:
# paragraph chunking 
def paragraph_chunking(text, min_length=200):
    paragraphs = [p.strip() for p in text.split("\n\n") if len(p.strip()) >= min_length]
    return paragraphs

chunked_documents = []
chunk_id = 0

for doc in documents:
    paragraphs = paragraph_chunking(doc.text)

    for section_id, paragraph in enumerate(paragraphs):
        chunked_documents.append({
            "title": doc.metadata["source_file"],
            "content": paragraph,
            "chunk_id": chunk_id,
            "chunk_type": "paragraph",
            "section_id": section_id,
            "page_number": doc.metadata.get("page_label", None),
            "parent_doc": doc.metadata["source_file"]
        })
        chunk_id += 1

print(f"Created {len(chunked_documents)} paragraph chunks")


Created 152 paragraph chunks


In [7]:
# loading tables
import json

with open("C:\\Users\\tomir\\Desktop\\EPAM\\epam_train\\Module 4\\outputs\\tables.json", "r", encoding="utf-8") as f:
    tables = json.load(f)

print(type(tables))      
print(len(tables))       
print(tables[0].keys())  

<class 'list'>
50
dict_keys(['table_id', 'source', 'content'])


In [20]:
tables[0]['content']

[{'0': 'GRowtH A nd deveLoPMent\n345\nCognitive Milestones for Your Two- Year- Old\n■\n  Makes mechanical toys work\n■\n  Matches an object in her hand or room to a picture in a book\n■\n  Plays make- believe with dolls, animals, and people\n■\n  Sorts objects by shape and color\n■\n  Completes puzzles with three or four pieces\n■\n  Understands concept of “two”',
  '1': '',
  '2': '',
  '3': ''},
 {'0': '', '1': '', '2': '', '3': ''}]

In [8]:
def table_dicts_to_text(rows: list[dict]) -> str:
    if not rows:
        return ""

    headers = rows[0].keys()
    lines = [" | ".join(headers)]

    for row in rows:
        lines.append(" | ".join(str(row[h]) for h in headers))

    return "\n".join(lines)


In [9]:
# adding table chunks to chunked_documents

for table_text in tables:
    table_content = table_text['content']

    if isinstance(table_content, list) and isinstance(table_content[0], dict):
        table_content = table_dicts_to_text(table_content)

    chunked_documents.append({
        "title": table_text['source'],
        "content": "TABLE DATA:\n" + table_content,
        "chunk_id": chunk_id,
        "chunk_type": "table",
        "section_id": None,
        "page_number": None,
        "parent_doc": table_text['source']
    })
    chunk_id += 1

print("Tables extracted and added as chunks")


Tables extracted and added as chunks


In [10]:
# embed chunks

contents_to_embed = [doc["content"] for doc in chunked_documents]

vector_embeddings = embeddings_model.embed_documents(contents_to_embed)

for i, doc in enumerate(chunked_documents):
    doc["content_vector"] = vector_embeddings[i]

print("Embeddings generated")


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches: 100%|██████████| 7/7 [01:53<00:00, 16.21s/it]

Embeddings generated





In [11]:
# --- 3. Connect to Weaviate ---
print("\n--- 3. Connecting to Weaviate ---")
weaviate_client = weaviate.connect_to_local(
    host="localhost",
    port=WEAVIATE_HTTP_PORT,
    grpc_port=WEAVIATE_GRPC_PORT
)
if weaviate_client.is_ready():
    print("✅ Successfully connected to Weaviate.")
else:
    print("❌ Failed to connect to Weaviate.")
    weaviate_client.close()
    raise ConnectionError("Could not connect to Weaviate instance.")


--- 3. Connecting to Weaviate ---


2025-12-21 18:55:41,080 - INFO - HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
2025-12-21 18:55:41,134 - INFO - HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
2025-12-21 18:55:41,691 - INFO - HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
2025-12-21 18:55:41,956 - INFO - HTTP Request: GET http://localhost:8080/v1/.well-known/ready "HTTP/1.1 200 OK"


✅ Successfully connected to Weaviate.


In [12]:
# --- 4. Define and Create Weaviate Collection ---
COLLECTION_NAME = "SimpleRAG_v2"
print(f"\n--- 4. Creating Weaviate collection: '{COLLECTION_NAME}' ---")

# Delete collection if it already exists for a clean run
if weaviate_client.collections.exists(COLLECTION_NAME):
    weaviate_client.collections.delete(COLLECTION_NAME)
    print(f"Deleted existing collection '{COLLECTION_NAME}'.")

# Create new DB schema for our documents
rag_collection = weaviate_client.collections.create(
    name=COLLECTION_NAME,
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="chunk_type", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="section_id", data_type=wvc.config.DataType.INT),
        wvc.config.Property(name="page_number", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="parent_doc", data_type=wvc.config.DataType.TEXT),
    ],
    vector_config=wvc.config.Configure.Vectors.self_provided(
        vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
            distance_metric=wvc.config.VectorDistances.COSINE
        )
    )
)

print(f"✅ Collection '{COLLECTION_NAME}' created successfully.")



2025-12-21 18:55:54,225 - INFO - HTTP Request: GET http://localhost:8080/v1/schema/SimpleRAG_v2 "HTTP/1.1 200 OK"



--- 4. Creating Weaviate collection: 'SimpleRAG_v2' ---


2025-12-21 18:55:54,392 - INFO - HTTP Request: DELETE http://localhost:8080/v1/schema/SimpleRAG_v2 "HTTP/1.1 200 OK"


Deleted existing collection 'SimpleRAG_v2'.


2025-12-21 18:55:54,604 - INFO - HTTP Request: POST http://localhost:8080/v1/schema "HTTP/1.1 200 OK"


✅ Collection 'SimpleRAG_v2' created successfully.


In [13]:
# --- 5. Batch-Insert Chunked Data ---
with rag_collection.batch.dynamic() as batch:
    for doc in chunked_documents:
        full_content = f"""Document: {doc['parent_doc']}{doc['content']}""".strip()
        batch.add_object(
            properties={
                "title": doc["title"],
                "content": doc["content"],
                "chunk_type": doc["chunk_type"],
                "section_id": doc["section_id"],
                "page_number": doc["page_number"],
                "parent_doc": doc["parent_doc"]
            },
            vector=embeddings_model.embed_query(full_content),
            uuid=generate_uuid5(f"{doc['parent_doc']}_{doc['chunk_id']}")
        )


2025-12-21 18:56:09,002 - INFO - HTTP Request: GET http://localhost:8080/v1/schema/SimpleRAG_v2 "HTTP/1.1 200 OK"
2025-12-21 18:56:09,014 - INFO - HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"
Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
Batches:   0%|          | 0/1 [00:00<?, ?it/s]2025-12-21 18:56:10,036 - INFO - HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.72it/s]2025-12-21 18:56:11,051 - INFO - HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.75it/s]
Batches:   0%|          | 0/1 [00:00<?, ?it/s]2025-12-21 18:56:12,065 - INFO - HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s]
Batches:   0%|          | 0/1 [00:00<?, ?it/s]2025-12-21 18:56:13,086 - INFO - HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.42it/s]
Batches: 100%|████████

In [14]:
len(rag_collection)

202

In [28]:
# Close the client connection
weaviate_client.close()