## 1. <font color = red> Install and Import the Required Libraries

Step 1: import the required packages

In [31]:
import os
import subprocess
#import sentence_transformers
import chromadb
from sentence_transformers import SentenceTransformer


# CONFIGURATION

TEXT_FOLDER = r"C:\Rag\rag files"  # folder with your KB text files
OLLAMA_PATH = r"C:\Users\DLP-I516-156\AppData\Local\Programs\Ollama\Ollama.exe"
LLAMA_MODEL = "llama3.2:1b"

TOP_K = 3
CONFIDENCE_THRESHOLD = 0.3  # cosine similarity threshold


In [None]:
all_documents = []
text_files = [f for f in os.listdir(TEXT_FOLDER) if f.lower().endswith(".txt")]

for file_name in text_files:
    full_path = os.path.join(TEXT_FOLDER, file_name)
    with open(full_path, "r", encoding="utf-8") as f:
        text = f.read()
        all_documents.append({
            "file_name": file_name,
            "text": text
        })

print("Loaded files:")
for d in all_documents:
    print(" -", d["file_name"])


Loaded files:
 - File 1.txt
 - File 2.txt
 - File 3.txt
 - File 4.txt
 - File 5.txt


In [46]:
# Print the first 5 loaded files
for doc in all_documents[:5]:
    print(f"File Name: {doc['file_name']}")
    print(f"Text Preview: {doc['text'][:500]}")  # first 500 characters
    print("-" * 80)


File Name: File 1.txt
Text Preview: Machine Learning: An Introduction

Machine Learning (ML) is a branch of artificial intelligence that enables systems to learn from data, identify patterns, and make decisions with minimal human intervention. Unlike traditional programming, where explicit instructions are provided to perform a task, machine learning allows computers to develop their own logic based on experience. This capability has transformed numerous industries, from healthcare to finance, and continues to influence how techno
--------------------------------------------------------------------------------
File Name: File 2.txt
Text Preview: Retrieval-Augmented Generation (RAG): An Overview

Retrieval-Augmented Generation, commonly referred to as RAG, is an innovative approach in the field of natural language processing that combines the strengths of information retrieval systems with the generative capabilities of large language models. Traditional generative models, such as GPT o

Step 2: Chunk the text

In [47]:
import re

CHUNK_SIZE = 500

def chunk_text(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current = ""
    para_no = 1

    for s in sentences:
        if len(current) + len(s) <= CHUNK_SIZE:
            current += " " + s
        else:
            chunks.append((current.strip(), para_no))
            para_no += current.count("\n") + 1
            current = s

    if current.strip():
        chunks.append((current.strip(), para_no))

    return chunks


chunked_documents = []

for doc in all_documents:
    chunks = chunk_text(doc["text"])
    for idx, (chunk, para_no) in enumerate(chunks):
        chunked_documents.append({
            "file_name": doc["file_name"],
            "chunk_id": idx,
            "para_no": para_no,
            "text": chunk
        })

print("Total chunks:", len(chunked_documents))


Total chunks: 69


In [48]:
for chunk in chunked_documents[:5]:
    print(f"File: {chunk['file_name']}, Chunk ID: {chunk['chunk_id']}, Para: {chunk['para_no']}")
    print(f"Text Preview: {chunk['text']}")
    print("-" * 80)


File: File 1.txt, Chunk ID: 0, Para: 1
Text Preview: Machine Learning: An Introduction

Machine Learning (ML) is a branch of artificial intelligence that enables systems to learn from data, identify patterns, and make decisions with minimal human intervention. Unlike traditional programming, where explicit instructions are provided to perform a task, machine learning allows computers to develop their own logic based on experience.
--------------------------------------------------------------------------------
File: File 1.txt, Chunk ID: 1, Para: 4
Text Preview: This capability has transformed numerous industries, from healthcare to finance, and continues to influence how technology interacts with our daily lives. At its core, machine learning relies on data. The quality, volume, and diversity of the data directly impact how well a model performs. Data is used to train algorithms, which then make predictions or identify trends when presented with new information.
----------------------

Step 3: Create embeddings & Chroma DB

In [49]:
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

chroma_client = chromadb.Client(
    chromadb.config.Settings(
        persist_directory="chroma_store"
    )
)

collection = chroma_client.get_or_create_collection(
    name="knowledge_base"
)

documents = [d["text"] for d in chunked_documents]
metadatas = [
    {
        "file_name": d["file_name"],
        "para_no": d["para_no"]
    }
    for d in chunked_documents
]

ids = [f"doc_{i}" for i in range(len(documents))]

embeddings = embedding_model.encode(
    documents,
    normalize_embeddings=True
).tolist()

collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
    embeddings=embeddings
)

print("Vectors stored:", collection.count())


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 598.16it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: BAAI/bge-base-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Vectors stored: 117


In [50]:
def retrieve_context(query):
    q_emb = embedding_model.encode(query, normalize_embeddings=True).tolist()

    results = collection.query(
        query_embeddings=[q_emb],
        n_results=TOP_K
    )

    docs, metas, scores = [], [], []

    for d, m, dist in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0]
    ):
        sim = 1 - dist
        if sim >= CONFIDENCE_THRESHOLD:
            docs.append(d)
            metas.append(m)
            scores.append(sim)

    print("Similarity scores:", scores)
    return docs, metas


In [51]:
def build_prompt(query, docs, metas):
    context = ""

    for d, m in zip(docs, metas):
        context += (
            f"File: {m['file_name']}, Paragraph: {m['para_no']}\n"
            f"{d}\n\n---\n\n"
        )

    if not context.strip():
        return None

    return f"""
You are a knowledge base assistant.

Answer strictly using the context below.
- Cite file name and paragraph number
- Do NOT add new information
- If missing, say "Information not found."

Context:
{context}

Question:
{query}

Answer:
"""


In [52]:
def call_ollama(prompt):
    if not prompt:
        return "Information not found."

    result = subprocess.run(
        [OLLAMA_PATH, "run", LLAMA_MODEL],
        input=prompt,
        text=True,
        capture_output=True,
        timeout=60
    )

    return result.stdout.strip()


In [53]:
query = "What is the stock market?"

docs, metas = retrieve_context(query)

prompt = build_prompt(query, docs, metas)

print("\n----- PROMPT SENT TO OLLAMA -----\n")
print(prompt)

final_answer = call_ollama(prompt)

print("\n✅ Final Answer:\n")
print(final_answer)


Similarity scores: [0.7282564043998718, 0.5078592598438263, 0.4550701975822449]

----- PROMPT SENT TO OLLAMA -----


You are a knowledge base assistant.

Answer strictly using the context below.
- Cite file name and paragraph number
- Do NOT add new information
- If missing, say "Information not found."

Context:
File: File 4.txt, Paragraph: 1
Understanding the Stock Market

The stock market is a complex and dynamic financial system where investors buy and sell shares of publicly traded companies. It plays a crucial role in the global economy by allowing companies to raise capital and providing individuals and institutions with opportunit

---

File: File 4.txt, Paragraph: 3
ested in finance or investing.

At its core, the stock market operates as a marketplace for buying and selling shares, which represent partial ownership in a company. Companies issue shares to raise funds for expansion, research, debt repayment, or other corporate activities. Investors purchase thes

---

File: Fil

Exception in thread Thread-19 (_readerthread):
Traceback (most recent call last):
  File "C:\Users\DLP-I516-156\AppData\Local\Programs\Python\Python313\Lib\threading.py", line 1041, in _bootstrap_inner
    self.run()
    ~~~~~~~~^^
  File "C:\Users\DLP-I516-156\AppData\Local\Programs\Python\Python313\Lib\threading.py", line 992, in run
    self._target(*self._args, **self._kwargs)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DLP-I516-156\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1609, in _readerthread
    buffer.append(fh.read())
                  ~~~~~~~^^
  File "C:\Users\DLP-I516-156\AppData\Local\Programs\Python\Python313\Lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 332: character maps to <undefined>



✅ Final Answer:

File: File 4.txt, Paragraph: 1
The stock market is a complex and dynamic financial system where investors buy and sell shares of publicly traded companies. It plays a crucial role in the global economy by allowing companies to raise capital and providing individuals and institutions with opportunities for investment.


In [None]:
query = "What is the solar system?"

docs, metas = retrieve_context(query)

prompt = build_prompt(query, docs, metas)

print("\n----- PROMPT SENT TO OLLAMA -----\n")
print(prompt)

final_answer = call_ollama(prompt)

print("\n✅ Final Answer:\n")
print(final_answer)

Similarity scores: []

----- PROMPT SENT TO OLLAMA -----

None

✅ Final Answer:

Information not found.


In [None]:
query = input('Enter: ').strip()

if query:
    retrieval = retrieve_context(query)
    docs = retrieval["documents"][0]
    metas = retrieval["metadatas"][0]

    if not docs:
        print("Information not found.")
    else:
        prompt = build_prompt(query, docs, metas)
        print(call_ollama(prompt).strip())
