In [8]:
import os
import requests
import json
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any

# -------------------------------------------------
# CONFIG
# -------------------------------------------------
DATA_DIR = "data"
VECTOR_DB_DIR = "embeddings"
OLLAMA_URL = "http://host.docker.internal:11434"   # Allow Docker to talk to host Ollama
MODEL_NAME = "mistral"  # must match your installed Ollama model

# -------------------------------------------------
# STEP 1: LOAD DOCUMENTS
# -------------------------------------------------
docs = []
pdf_loader = PyMuPDFLoader("userguide.pdf")
y1731loader = PyMuPDFLoader("y1731.pdf")
cfm_pdf = PyMuPDFLoader("8021ag-2007.pdf")
docs.extend(y1731loader.load())
docs.extend(cfm_pdf.load())
docs.extend(pdf_loader.load())

md_loader = TextLoader("CFM_OAM.md", encoding="utf-8")
docs.extend(md_loader.load())

print(f"‚úÖ Loaded {len(docs)} documents")

# -------------------------------------------------
# STEP 2: SPLIT INTO CHUNKS
# -------------------------------------------------
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
print(f"‚úÖ Split into {len(chunks)} chunks")

# -------------------------------------------------
# STEP 3: CREATE EMBEDDINGS (CPU for reliability)
# -------------------------------------------------
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2", model_kwargs={"device": "cpu"}
)

vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    persist_directory=VECTOR_DB_DIR
)
vectordb.persist()
print("‚úÖ Vector database created and saved!")

# -------------------------------------------------
# STEP 4: CUSTOM OLLAMA LLM WRAPPER (Direct API)
# -------------------------------------------------
class OllamaLLM(LLM):
    model: str = MODEL_NAME
    api_url: str = OLLAMA_URL

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call the Ollama API directly"""
        response = requests.post(
            f"{self.api_url}/api/generate",
            json={"model": self.model, "prompt": prompt, "stream": False}
        )

        try:
            data = response.json()
            return data.get("response", "")
        except json.JSONDecodeError:
            print("‚ö†Ô∏è Invalid JSON response from Ollama:")
            print(response.text)
            return ""

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"model": self.model}

    @property
    def _llm_type(self) -> str:
        return "ollama_api"


llm = OllamaLLM()

# -------------------------------------------------
# STEP 5: BUILD RETRIEVAL CHAIN
# -------------------------------------------------
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

# -------------------------------------------------
# STEP 6: INTERACTIVE LOOP
# -------------------------------------------------
while True:
    query = input("\nAsk a question (or type 'exit'): ")
    if query.lower() == "exit":
        break

    result = qa_chain.invoke({"query": query})
    print("\nüß† Answer:")
    print(result["result"])

    print("\nüìö Sources:")
    for doc in result["source_documents"]:
        print("-", doc.metadata.get("source", "Unknown file"))


‚úÖ Loaded 1431 documents
‚úÖ Split into 3231 chunks
‚úÖ Vector database created and saved!



Ask a question (or type 'exit'):  Hi



üß† Answer:
 The provided context appears to be a list of acronyms and PDUs (Protocol Data Units) related to telecommunications, specifically in the context of Rec. ITU-T G.8013/Y.1731 (06/2023). Here's a list of some of them:

1. Multicast destination addresses
2. CCM - Congestion Control and Avoidance Mechanism
3. LBM - Link Bandwidth Management
4. LBR - Link Resource Manager
5. LTM - Link Traffic Management
6. LTR - Link Traffic Routing
7. AIS - Alarm Indication Signal
8. DMM - Downstream Multicast Management
9. DMR - Downstream Multicast Reporting
10. EXM - Explicit Multicast
11. EXR - Explicit Unicast with RSVP
12. VSM - Virtual Session Manager
13. VSR - Virtual Service Registration
14. CSF - Congestion Avoidance Signal
15. SLM - Service Level Management

In addition, there are several PDU types listed:

1. LCK PDU - Link Control Protocol Data Unit
2. TST PDU - Test Protocol Data Unit
3. APS PDU - Advanced Peer-to-Peer Signaling Protocol Data Unit
4. MCC PDU - Multicast Control 


Ask a question (or type 'exit'):  What is CFM



üß† Answer:
 CFM, based on the provided context, stands for Connection-Oriented Control Function Mechanism. It's a concept used in network communications, specifically in Multi-Protocol Label Switching (MPLS). The CFM aims to provide a mechanism to support fast rerouting and restoration of MPLS connections in case of failures or congestion. It operates within the Management Plane of an MPLS network, using Protocol Data Units (PDUs) for communication between different management points (MPs). However, for achieving its full potential, hardware modifications to existing Provider Bridges might be required, as mentioned in the text.

üìö Sources:
- 8021ag-2007.pdf
- 8021ag-2007.pdf
- 8021ag-2007.pdf



Ask a question (or type 'exit'):  what are the Y1731 features



üß† Answer:
 Based on the provided context, the number 617 is repeated three times as the "statically defined alarm type identifier." However, the context doesn't provide information about Y1731 features. The Y1731 is a standard for optical transport network alarms and faults, but without additional information, it's not possible to determine specific features related to this number 617.

üìö Sources:
- userguide.pdf
- userguide.pdf
- userguide.pdf



Ask a question (or type 'exit'):  exit


In [1]:
!ollama

/usr/bin/sh: 1: ollama: not found


In [6]:
import requests
import json

OLLAMA_URL = "http://host.docker.internal:11434"

# Send the request
response = requests.post(f"{OLLAMA_URL}/api/generate", json={
    "model": "mistral",
    "prompt": "Explain the importance of data normalization in machine learning."
}, stream=True)

# Ollama streams multiple JSON chunks
full_output = ""
for line in response.iter_lines():
    if line:
        data = json.loads(line.decode("utf-8"))
        if "response" in data:
            full_output += data["response"]
        elif data.get("done"):
            break

print("\nüß† Model Output:\n")
print(full_output)



üß† Model Output:

 Data normalization is a crucial pre-processing step in machine learning that aims to ensure all features or variables in a dataset are on a similar scale, reducing the impact of one feature dominating others and improving the performance and convergence speed of algorithms. Here's why it's important:

1. Algorithm Fairness: Machine learning algorithms tend to perform better with data that is evenly distributed across different ranges. If one attribute has a larger range of values, the algorithm may focus more on this feature, neglecting others that might be equally important. Normalization eliminates this bias by scaling all features to a common range.

2. Improved Learning: In some machine learning algorithms, especially those using Euclidean distance (like k-Nearest Neighbors or Support Vector Machines), the performance directly depends on the scale of features. Normalization ensures that the distances between data points are accurate and meaningful, improving t

In [13]:
# rag_with_cot_prompt.py
import os
import json
import requests
from typing import List

from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings

# ---------------- CONFIG ----------------
DATA_DIR = "data"
VECTOR_DB_DIR = "embeddings"
PDF_FILES = ["userguide.pdf", "y1731.pdf", "8021ag-2007.pdf"]
MD_FILES = ["CFM_OAM.md", "cfm-debugging.md"]

# Ollama host reachable from inside Docker:
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://host.docker.internal:11434")
OLLAMA_MODEL = "mistral"  # ensure this model exists in your local ollama

# retrieval
TOP_K = 3

# ---------------- SYSTEM PROMPT (your COT policy) ----------------
SYSTEM_PROMPT = """You are a helpful, precise assistant specialized in using provided documents (CONTEXT) plus your internal knowledge when necessary. 

Rules:
1. ALWAYS consult the CONTEXT block first for factual answers. If the CONTEXT contains explicit text that answers the question, answer only from that information.
2. If the CONTEXT is insufficient or the user asks for theoretical/explanatory content, you may use your internal knowledge to answer ‚Äî but mark which parts come from CONTEXT and which parts are from your internal knowledge.
3. Do NOT reveal raw internal chain-of-thought. Instead provide a brief "Reasoning summary" (2‚Äì4 lines) that explains the key steps or assumptions you used to reach the conclusion.
4. For any command / RPC / exact-value request: prefer exact context matches. If the exact value or command is not present in CONTEXT, reply with "NOT FOUND IN CONTEXT" and then, only if the user asked to, provide a best-effort answer using internal knowledge labeled as such.
5. When you cite CONTEXT, include the document id or filename and a short quote or line reference.
6. When you produce code, commands, or RPC responses, return them in fenced code blocks and mark them clearly as `EXACT FROM CONTEXT` if pulled verbatim; otherwise label as `DERIVED` or `INTERNAL_KNOWLEDGE`.
7. Also use multiple context when producing RPC and LightSpan RPC to give correct RPC, Check it multiple times majorly when it is asked to fetch RPC
8. Do not disclose the internal documents in response.
"""

# ---------------- Helper: build prompt ----------------
def build_prompt(retrieved_docs: List, question: str) -> str:
    """
    Compose the full prompt with CONTEXT (verbatim retrieved passages),
    followed by the user's question and instructions for the model.
    """
    ctx_parts = []
    for i, d in enumerate(retrieved_docs, start=1):
        src = d.metadata.get("source", f"doc_{i}")
        snippet = d.page_content.strip()
        # keep each snippet verbatim and include small excerpt label
        ctx_parts.append(f"[Document {i}: {src}]\n{snippet}\n")

    ctx_block = "\n\n".join(ctx_parts)
    prompt = (
        f"{SYSTEM_PROMPT}\n\n"
        "CONTEXT:\n"
        "========\n"
        f"{ctx_block}\n"
        "END_CONTEXT\n"
        "========\n\n"
        "INSTRUCTIONS:\n"
        "1) Use the CONTEXT above as primary source for factual answers. If CONTEXT contains the direct answer, use it verbatim and label EXACT FROM CONTEXT.\n"
        "2) If the CONTEXT does not contain an answer, you may answer using internal knowledge, but label that content as INTERNAL_KNOWLEDGE.\n"
        "3) Provide:\n"
        "   a) A short direct answer (1-3 sentences).\n"
        "   b) If an exact command or RPC is requested and is found verbatim in the CONTEXT, show it in a fenced code block labeled EXACT FROM CONTEXT. If not present, print NOT FOUND IN CONTEXT.\n"
        "   c) A short 'Reasoning summary' (2-4 lines). Do NOT reveal chain-of-thought.\n"
        "   d) A 'Sources' list referencing the document names and quoted snippets.\n\n"
        f"QUESTION:\n{question}\n"
    )
    return prompt

# ---------------- Ollama calling util ----------------
def call_ollama(prompt: str, model: str = OLLAMA_MODEL, stream: bool = False) -> str:
    """
    Call the local Ollama generate API.
    If stream=False we expect a single JSON response; if stream=True we process JSONL.
    """
    url = f"{OLLAMA_HOST}/api/generate"
    payload = {"model": model, "prompt": prompt, "stream": stream}
    # Use a small timeout but can be increased for long responses
    resp = requests.post(url, json=payload, stream=stream, timeout=300)

    if stream:
        # stream mode: iterate JSONL lines and accumulate 'response' fields
        out = ""
        for line in resp.iter_lines():
            if not line:
                continue
            data = json.loads(line.decode("utf-8"))
            # Many Ollama responses include {"response": "...", ...}
            if "response" in data and data["response"]:
                out += data["response"]
            if data.get("done"):
                break
        return out
    else:
        # non-streaming: single JSON object
        try:
            data = resp.json()
        except Exception as e:
            # show raw text when JSON parse fails
            print("Ollama response (raw):", resp.text[:1000])
            raise
        return data.get("response", "")

# ---------------- Main flow ----------------
def main():
    # Load docs
    docs = []
    for PDF_FILE in PDF_FILES:
        if os.path.exists(PDF_FILE):
            pdf_loader = PyMuPDFLoader(PDF_FILE)
            docs.extend(pdf_loader.load())
    for MD_FILE in MD_FILES:
        if os.path.exists(MD_FILE):
            md_loader = TextLoader(MD_FILE, encoding="utf-8")
            docs.extend(md_loader.load())

    print(f"Loaded {len(docs)} documents")

    # Split
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)
    print(f"Split into {len(chunks)} chunks")

    # Embeddings (CPU) and Chroma vector DB
    embedding_function = SentenceTransformerEmbeddings(
        model_name="all-MiniLM-L6-v2", model_kwargs={"device": "cpu"}
    )

    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=VECTOR_DB_DIR
    )
    vectordb.persist()
    print("‚úÖ Vector DB created/persisted at", VECTOR_DB_DIR)

    # create a retriever
    retriever = vectordb.as_retriever(search_kwargs={"k": TOP_K})

    # interactive loop
    print("\nReady. Ask a question (type 'exit' to quit).")
    while True:
        q = input("\nQuestion: ").strip()
        if q.lower() in ("exit", "quit"):
            break

        # retrieve top-k passages
        retrieved = retriever.get_relevant_documents(q)
        # build prompt that enforces your chain-of-thought policy
        prompt = build_prompt(retrieved, q)

        # call ollama (non-streaming for simplicity; set stream=True to stream)
        try:
            answer = call_ollama(prompt, model=OLLAMA_MODEL, stream=False)
        except Exception as e:
            print("Error calling Ollama:", e)
            continue

        # print the model output
        print("\n=== MODEL ANSWER ===\n")
        print(answer)
        print("\n====================\n")

if __name__ == "__main__":
    main()


Loaded 1432 documents
Split into 3251 chunks
‚úÖ Vector DB created/persisted at embeddings

Ready. Ask a question (type 'exit' to quit).



Question:  Hello



=== MODEL ANSWER ===

 The command to display currently logged on users is "who". This command will mark the current session, which is running the show status command, with an asterisk.

The information provided is from Document 1, 2, and 3 of userguide.pdf. Command Description: Display currently logged on users.The current session, i.e. the session running the show status command, is marked with an asterisk (1007)

EXACT FROM CONTEXT:
```
who
```





Question:  What is CFM



=== MODEL ANSWER ===

 Short direct answer: CFM, as mentioned in the provided documents, appears to be a technology or protocol related to Communications Framework Management (CFM). It involves generating and absorbing Control and Management (CCM) packets.

Reasoning summary: The term "CFM" is repeatedly used throughout the context without an explicit definition. However, it can be inferred from the surrounding text that CFM refers to Communications Framework Management.

Sources:
- Document 1, line 16: generate and/or absorb these CCMs could be overwhelmed. To achieve its full potential, CFM could require hardware modifications to existing Provider Bridges.
- Document 2, line 16: generate and/or absorb these CCMs could be overwhelmed. To achieve its full potential, CFM could require hardware modifications to existing Provider Bridges.
- Document 3, line 16: generate and/or absorb these CCMs could be overwhelmed. To achieve its full potential, CFM could require hardware modifications 


Question:  exit


In [14]:
!pip install fastapi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting fastapi
  Downloading fastapi-0.120.3-py3-none-any.whl.metadata (28 kB)
Collecting starlette<0.50.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.49.1-py3-none-any.whl.metadata (6.4 kB)
Collecting annotated-doc>=0.0.2 (from fastapi)
  Downloading annotated_doc-0.0.3-py3-none-any.whl.metadata (6.6 kB)
Downloading fastapi-0.120.3-py3-none-any.whl (108 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m108.3/108.3 kB[0m [31m861.3 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0mm
[?25hDownloading annotated_doc-0.0.3-py3-none-any.whl (5.5 kB)
Downloading starlette-0.49.1-py3-none-any.whl (74 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m74.2/74.2 kB[0m [31m92.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: annotated-doc, starlette, fastapi
Succes

In [33]:
!python COT_RAG.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[32mINFO[0m:     Started server process [[36m16268[0m]
[32mINFO[0m:     Waiting for application startup.
  embedder = SentenceTransformerEmbeddings(
Creating new Chroma DB at embeddings
  vectordb.persist()
Vector DB ready ‚Äì 3251 chunks, top_k=3
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:8000[0m (Press CTRL+C to quit)
^C
[32mINFO[0m:     Shutting down
[32mINFO[0m:     Waiting for application shutdown.
Shutting down...


In [34]:
!curl -X POST http://localhost:8000/query -H "Content-Type: application/json" -d '{"question":"What is the command to enable CFM?"}'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/usr/bin/sh: 1: curl: not found
