In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage

In [2]:
# Read environment variables from the .env file in the current working directory.
env_path = Path.cwd() / ".env"
load_dotenv(env_path)

# Fail fast when the key is missing: the embedding client below is built with it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY is not set")

In [3]:
# Location of the persisted Chroma store, relative to the working directory.
PERSIST_DIR = os.path.join("db", "chroma_db")

# Create the directory when it is missing and report which case applied.
if not os.path.isdir(PERSIST_DIR):
    os.makedirs(PERSIST_DIR, exist_ok=True)
    print(f"Directory created successfully: {PERSIST_DIR}")
else:
    print(f"Directory already available: {PERSIST_DIR}")


Directory already available: db\chroma_db


In [4]:
# Embedding client used by the vector store; authenticated with the key loaded earlier.
embedding_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model="text-embedding-3-small")

In [5]:
# Open the Chroma store kept under PERSIST_DIR; the collection metadata
# requests the cosine space for the HNSW index.
db = Chroma(
    embedding_function=embedding_model,
    persist_directory=PERSIST_DIR,
    collection_metadata={"hnsw:space": "cosine"},
)

In [6]:
# user query: the question passed to the retriever and embedded in the prompt
# by the cells that follow
query = "How much did Microsoft pay to acquire GitHub?"

In [7]:
# Retriever over the Chroma store: at most 3 chunks per query, and only
# chunks whose similarity score reaches the 0.3 threshold.
retriever = db.as_retriever(
    search_kwargs={"k": 3, "score_threshold": 0.3},
    search_type="similarity_score_threshold",
)

In [8]:
# Fetch the chunks that match the query.
relevant_docs = retriever.invoke(query)

# Echo the question, then list every retrieved chunk with a 1-based position.
print(f"User Query: {query}")
print("--- Context ---")
for position, document in enumerate(relevant_docs, start=1):
    print(f"Document {position}:\n{document.page_content}\n")

User Query: How much did Microsoft pay to acquire GitHub?
--- Context ---
Document 1:
119. "Microsoft completes GitHub acquisition" (https://web.archive.org/web/20190112212059/http
s://www.msn.com/en-us/news/technology/microsoft-completes-github-acquisition/ar-BBOVV
OT). www.msn.com. Archived from the original (https://www.msn.com/en-us/news/technolog
y/microsoft-completes-github-acquisition/ar-BBOVVOT) on January 12, 2019. Retrieved
April 10, 2019.

Document 2:
117. "Microsoft's 2018, part 1: Open source, wobbly Windows and everyone's going to the cloud"
(https://www.theregister.co.uk/2018/12/25/microsoft_year_in_review_2018/). The Register.
Archived (https://web.archive.org/web/20190103060059/https://www.theregister.co.uk/2018/
12/25/microsoft_year_in_review_2018/) from the original on January 3, 2019. Retrieved
January 3, 2019.

118. "Microsoft to acquire GitHub for $7.5 billion" (https://news.microsoft.com/2018/06/04/microso
ft-to-acquire-github-for-7-5-billion/). Microsoft. June 4

In [9]:
# Assemble the prompt: the question, one "-"-prefixed entry per retrieved
# document, then the answering instructions.
context_block = "\n".join(f"-{doc.page_content}" for doc in relevant_docs)

combined_input = f"""
Based on the following documents please answer this question: {query}

Documents:
{context_block}

Please provide a clear, helpful answer using only the information from these documents. if you can't find the answer in the documents, say "I don't have enough information to answer that question based on the provided documents." 
"""

In [10]:
# Chat model used for the generation step.
model = ChatOpenAI(model="gpt-5-nano")

# Conversation sent to the model: a generic system instruction followed by
# the assembled question-plus-documents prompt.
message = [SystemMessage(content="You are a helpful assistant."), HumanMessage(content=combined_input)]

In [11]:
# invoke the model with combined input; `result` holds the response object
# that the following cells print in full and via `result.content`
result = model.invoke(message)

In [12]:
# Show the whole response object (content plus metadata) under a heading line.
print("FUll Result: ", result, sep="\n")

FUll Result: 
content='Microsoft paid $7.5 billion (USD) to acquire GitHub. The deal was announced on June 4, 2018 and closed on October 26, 2018.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 431, 'prompt_tokens': 651, 'total_tokens': 1082, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 384, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-D1dELHIB5QzYyCkBXtaa8Gc1LZD2i', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='lc_run--019bf165-2e84-71e2-85a5-e8df85d93512-0' tool_calls=[] invalid_tool_calls=[] usage_metadata={'input_tokens': 651, 'output_tokens': 431, 'total_tokens': 1082, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 384}}


In [14]:
# Show only the answer text of the response under a heading line.
print("Content Only: ", result.content, sep="\n")

Content Only: 
Microsoft paid $7.5 billion (USD) to acquire GitHub. The deal was announced on June 4, 2018 and closed on October 26, 2018.


In [33]:
from textwrap import dedent

def build_rag_prompt(query, max_chars_per_doc=1500, relevant_docs=None):
    """
    Build a RAG prompt from a question and its supporting documents.

    Args:
        query: The user question. Must be a non-empty, non-whitespace string.
        max_chars_per_doc: Maximum number of characters kept from each
            document after it is stripped and flattened to one line.
        relevant_docs: Optional pre-fetched documents (objects exposing a
            ``page_content`` attribute). When omitted, the documents are
            fetched with the notebook-level ``retriever``.

    Returns:
        The prompt text: the question, the numbered documents (or a
        placeholder line when there are none), and the answering instructions.

    Raises:
        ValueError: If ``query`` is not a non-empty string.
    """
    # Validate first so an invalid query never reaches the retriever.
    if not isinstance(query, str) or not query.strip():
        raise ValueError("Query must be a non-empty string")

    if relevant_docs is None:
        relevant_docs = retriever.invoke(query)

    if not relevant_docs:
        documents_block = "No documents were retrieved."
    else:
        cleaned_docs = []
        for i, doc in enumerate(relevant_docs, start=1):
            # Tolerate objects whose page_content is missing or None.
            content = getattr(doc, "page_content", "") or ""
            # One line per document, capped at max_chars_per_doc characters.
            content = content.strip().replace("\n", " ")[:max_chars_per_doc]
            cleaned_docs.append(f"{i}. {content}")

        documents_block = "\n".join(cleaned_docs)

    # Dedent the static template BEFORE inserting dynamic text. Interpolating
    # first puts unindented lines (second and later documents) into the text,
    # which leaves dedent no common indentation to remove, so the prompt keeps
    # the source-code indentation.
    template = dedent("""
        You are given a question and a set of documents.

        Question:
        {query}

        Documents:
        {documents_block}

        Instructions:
        - Answer using ONLY the information in the documents.
        - Do NOT use external knowledge.
        - If the answer cannot be found, respond exactly with:
          "I don't have enough information to answer that question based on the provided documents."
    """).strip()

    return template.format(query=query, documents_block=documents_block)

In [34]:
def run_llm(
    combined_input: str,
    model_name: str = "gpt-5-nano",
    temperature: float = 0.0,
    max_tokens: int = 512,
):
    """
    Send a prompt to a ChatOpenAI model and return its response.

    Args:
        combined_input: The full prompt text, sent as the human message.
            Must be a non-empty, non-whitespace string.
        model_name: Model identifier passed to ChatOpenAI.
        temperature: Sampling temperature passed to ChatOpenAI.
        max_tokens: Token cap passed to ChatOpenAI.
            NOTE(review): the recorded gpt-5-nano run in this notebook spent
            384 of its 431 completion tokens on reasoning; if this cap also
            covers reasoning tokens, 512 may truncate answers - confirm.

    Returns:
        The response object returned by ``llm.invoke``; the answer text is
        read from its ``content`` attribute by the calling cells.

    Raises:
        ValueError: If ``combined_input`` is not a non-empty string.
        EnvironmentError: If OPENAI_API_KEY is not set in the environment.
        RuntimeError: If the model call fails; the original exception is
            attached as ``__cause__``.
    """
    # isinstance guard: a non-string input should fail with the documented
    # ValueError rather than an AttributeError from .strip().
    if not isinstance(combined_input, str) or not combined_input.strip():
        raise ValueError("combined_input must be a non-empty string")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise EnvironmentError("OPENAI_API_KEY is not set")

    llm = ChatOpenAI(
        model=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        api_key=api_key,
    )

    messages = [
        SystemMessage(content="You are a helpful assistant."),
        HumanMessage(content=combined_input),
    ]

    try:
        response = llm.invoke(messages)
    except Exception as e:
        # Chain explicitly so the underlying error stays visible as the cause.
        raise RuntimeError(f"LLM invocation failed: {e}") from e

    return response

In [37]:
# Build the RAG prompt for a new question and show it.
user_query = "What was NVIDIA's first graphics accelerator called?"
combined_input = build_rag_prompt(user_query)
print(combined_input)

You are given a question and a set of documents.

        Question:
        What was NVIDIA's first graphics accelerator called?

        Documents:
        1. First graphics accelerator Nvidia's first graphics accelerator, the NV1, was designed to process quadrilateral primitives (forward texture mapping), a feature that set it apart from competitors, who preferred triangle primitives.[26]  However, when Microsoft introduced the DirectX platform, it chose not to support any other graphics software and announced that its Direct3D API would exclusively support triangles.[26][35] As a result, the NV1 failed to gain traction in the market.[36]
2. In late 1999, Nvidia released the GeForce 256 (NV10), its first product expressly marketed as a GPU, which was most notable for introducing onboard transformation and lighting (T&L) to consumer-level 3D hardware. Running at 120 MHz and featuring four-pixel pipelines, it implemented advanced video acceleration, motion compensation, and hardware su

In [38]:
# Send the prompt built above to the model and show the answer text under a heading.
result = run_llm(combined_input)
print("--- Generated Response ---", result.content, sep="\n")

--- Generated Response ---
The NV1.


In [39]:
# Full pipeline for the GitHub question: build the prompt, query the model, show the answer text.
user_query = "How much did Microsoft pay to acquire GitHub?"
combined_input = build_rag_prompt(user_query)
result = run_llm(combined_input=combined_input)
print(result.content)

$7.5 billion.
