In [1]:
# Cell 1: Imports
import os
import glob
from pathlib import Path
from langchain_ollama import OllamaLLM

In [2]:
# Cell 2: Initialize LLM
# Shared Ollama client; call_llm() below sends its prompts through this object.
llm = OllamaLLM(
    model="llama3.2:latest",
    temperature=0.3,
)

def call_llm(prompt, model="llama3.2:latest"):
    """
    Send a prompt to an Ollama model and return the generated text.

    Args:
        prompt: The fully formatted prompt string to send.
        model: Ollama model tag to run the prompt against. When it matches the
            model of the shared module-level ``llm`` client, that client is
            reused; otherwise a client for the requested model is created
            with the same temperature as ``llm``.

    Returns:
        The model's response as a string.
    """
    if model == llm.model:
        # Common case: reuse the already-configured shared client.
        client = llm
    else:
        # Honour the `model` argument. It used to be accepted but ignored, so
        # callers asking for a different model silently got the default one.
        client = OllamaLLM(model=model, temperature=llm.temperature)

    return client.invoke(prompt)

In [3]:
# Cell 3: Configuration
# Where the raw endpoint documentation (*.txt) is read from
input_dir = "/workspaces/RAG_BOT/PIVIOT/sample"

# Where the enriched per-endpoint files are written; created up front so the
# processing cell can write into it directly.
output_dir = "enriched_endpoints"
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Prompt sent to the LLM for every endpoint section. The {endpoint_text}
# placeholder is filled in with str.format() by the processing cell.
enrichment_template = """
You are an expert API documentation assistant. 
Your task is to take the following raw endpoint documentation and rewrite it to make it more RAG-friendly.

For this endpoint, produce:
1. A clear, human-friendly summary
2. List of key search terms
3. 5–10 example questions a user might ask
4. Important details or notes for developers
5. The original formatted documentation (nicely preserved)

---

RAW DOCUMENTATION:

{endpoint_text}

---

Output in a structured markdown format.
"""

In [4]:
# Cell 4: Get input files
# Collect every .txt file in the input directory. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort them to make the processing
# order and the progress log reproducible across runs.
input_files = sorted(glob.glob(os.path.join(input_dir, "*.txt")))
print(f"Found {len(input_files)} files to process.")

Found 1 files to process.


In [5]:
# Cell 5: Process each file
# Line that separates endpoint sections inside an input file
# (adjust if it's different in your files).
ENDPOINT_SEPARATOR = '--------------------------------------------------------------------------------'

for file_idx, file_path in enumerate(input_files, 1):
    print(f"Processing file {file_idx}/{len(input_files)}: {file_path}")

    source_path = Path(file_path)
    # Output names are derived from the source file name; this only depends on
    # the file, so compute it once here rather than once per endpoint.
    base_filename = source_path.stem

    # Read the whole file and cut it into endpoint sections, dropping pieces
    # that are empty after trimming whitespace.
    full_text = source_path.read_text(encoding="utf-8")
    endpoint_chunks = [
        chunk.strip()
        for chunk in full_text.split(ENDPOINT_SEPARATOR)
        if chunk.strip()
    ]

    print(f"  - Found {len(endpoint_chunks)} endpoint sections in {source_path.name}")

    # Enrich each endpoint section and write it to its own output file
    for chunk_idx, endpoint_text in enumerate(endpoint_chunks, 1):
        print(f"  - Processing endpoint {chunk_idx}/{len(endpoint_chunks)}...")

        # Fill the prompt template with this endpoint's raw documentation
        prompt = enrichment_template.format(endpoint_text=endpoint_text)

        # Get the enriched version from the LLM
        enriched_text = call_llm(prompt)

        # <source stem>_endpoint_<3-digit, 1-based chunk index>.txt
        filename = f"{base_filename}_endpoint_{chunk_idx:03}.txt"

        # Save enriched text
        (Path(output_dir) / filename).write_text(enriched_text, encoding="utf-8")

        print(f"    Saved: {filename}")

Processing file 1/1: /workspaces/RAG_BOT/PIVIOT/sample/PolicyMangement.txt
  - Found 9 endpoint sections in PolicyMangement.txt
  - Processing endpoint 1/9...
    Saved: PolicyMangement_endpoint_001.txt
  - Processing endpoint 2/9...
    Saved: PolicyMangement_endpoint_002.txt
  - Processing endpoint 3/9...
    Saved: PolicyMangement_endpoint_003.txt
  - Processing endpoint 4/9...
    Saved: PolicyMangement_endpoint_004.txt
  - Processing endpoint 5/9...
    Saved: PolicyMangement_endpoint_005.txt
  - Processing endpoint 6/9...
    Saved: PolicyMangement_endpoint_006.txt
  - Processing endpoint 7/9...
    Saved: PolicyMangement_endpoint_007.txt
  - Processing endpoint 8/9...
    Saved: PolicyMangement_endpoint_008.txt
  - Processing endpoint 9/9...
    Saved: PolicyMangement_endpoint_009.txt


In [None]:
# Cell 6: Summary
# Final status message. This print is unconditional: it does not inspect how
# many files or endpoints were actually handled by the earlier cells.
print("All files processed and saved.")