In [1]:
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests


In [None]:
# Source documents for the knowledge base: data/doc1.md .. data/doc3.md.
doc_paths = [Path("data") / f"doc{n}.md" for n in range(1, 4)]

# Read every file fully into memory as UTF-8 text, keeping the path order.
docs = [doc_path.read_text(encoding="utf-8") for doc_path in doc_paths]
print(f"Loaded {len(docs)} documents")


Loaded 3 documents


In [3]:
def chunk_text(text, max_words=200):
    """Split ``text`` into chunks of whole paragraphs, at most ``max_words`` words each.

    Paragraphs are the pieces of ``text`` separated by a blank line ("\\n\\n").
    Consecutive paragraphs are packed into the same chunk (joined with "\\n")
    for as long as the combined word count stays within ``max_words``.

    A single paragraph that is longer than ``max_words`` is never split; it
    becomes a chunk of its own.

    Args:
        text: The document text to split.
        max_words: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings in document order.
    """
    chunks = []
    current, current_words = "", 0

    for para in text.split("\n\n"):
        # Count the paragraph's words on their own: counting on the
        # concatenation `current + para` would fuse the last word of the buffer
        # with the first word of the paragraph and undercount by one.
        para_words = len(para.split())

        if current_words + para_words <= max_words:
            current += "\n" + para
            current_words += para_words
        else:
            # Flush only real content; the buffer is still empty when the very
            # first paragraph is already over the limit.
            if current.strip():
                chunks.append(current.strip())
            current, current_words = para, para_words

    if current.strip():
        chunks.append(current.strip())
    return chunks

# Flatten the per-document chunk lists into one corpus-wide list,
# keeping document order and, within a document, chunk order.
chunks = [chunk for d in docs for chunk in chunk_text(d)]

print(f"Created {len(chunks)} chunks")


Created 11 chunks


In [4]:
# Build the retrieval index: a TF-IDF vectorizer that drops scikit-learn's
# built-in English stop words, fitted on the chunk corpus.
vectorizer = TfidfVectorizer(stop_words="english")
# Learn the vocabulary from `chunks` and vectorize them in one pass; queries are
# later projected into the same space with `vectorizer.transform`.
chunk_vectors = vectorizer.fit_transform(chunks)

print("TF-IDF embeddings created")

TF-IDF embeddings created


In [32]:
def retrieve(query, top_k=5, score_threshold=0.2):
    """Return the chunks most similar to ``query`` under TF-IDF cosine similarity.

    The query is vectorized with the module-level ``vectorizer`` and compared
    against ``chunk_vectors``. Only the ``top_k`` highest-scoring chunks are
    considered, and of those only the ones scoring at least
    ``score_threshold`` are returned.

    Args:
        query: The question text to search for.
        top_k: Number of best-ranked chunks to consider.
        score_threshold: Minimum cosine similarity for a chunk to be kept.

    Returns:
        A list of ``(chunk_text, score)`` tuples in descending score order,
        with the score rounded to three decimals. May be empty.
    """
    scores = cosine_similarity(vectorizer.transform([query]), chunk_vectors)[0]

    # Chunk indices from best to worst match; the sort is stable, so chunks
    # with equal scores keep their original corpus order.
    best_first = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    return [
        (chunks[i], round(scores[i], 3))
        for i in best_first[:top_k]
        if scores[i] >= score_threshold
    ]


In [33]:
def call_ollama(prompt, model="qwen2.5:3b", timeout=300):
    """Send ``prompt`` to the local Ollama server and return the generated text.

    Posts a non-streaming request to ``/api/generate`` on ``localhost:11434``.

    Args:
        prompt: The full prompt text to send to the model.
        model: Name of the Ollama model to run.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so an unresponsive server
            would hang the notebook cell.

    Returns:
        The value of the ``"response"`` field of the JSON reply.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx status.
        requests.exceptions.ConnectionError: If the server cannot be reached.
    """
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "stream": False,
        },
        timeout=timeout,
    )
    # Fail with the real HTTP error (e.g. unknown model) instead of a
    # misleading KeyError on the missing "response" field below.
    response.raise_for_status()
    return response.json()["response"]


In [47]:
def generate_answer(query, contexts):
    """Answer ``query`` with the LLM, restricted to the retrieved ``contexts``.

    Args:
        query: The user's question.
        contexts: ``(chunk_text, score)`` pairs, best match first. Only the
            first three are placed in the prompt.

    Returns:
        The model's reply, or a fixed "not available" sentence when
        ``contexts`` is empty (the model is not called in that case).
    """
    if not contexts:
        # Nothing was retrieved: answer with the fallback without calling the model.
        return "The information is not available in the provided documents."

    # Keep the prompt small: at most three chunks, each labelled with its score.
    labelled_chunks = [f"(score={score}) {chunk}" for chunk, score in contexts[:3]]
    context_text = "\n\n".join(labelled_chunks)

    prompt = f"""
You are an AI assistant for a construction marketplace.

Rules:
- Answer ONLY by restating or summarizing information explicitly stated in the context.
- When a question asks "what" or "which", list the explicitly enumerated items verbatim.
- Do NOT infer causes, impacts, missing conditions, or inverse scenarios.
- Do NOT reframe listed mechanisms as factors or effects.
- Do NOT use external knowledge.
- Do NOT use causal verbs such as "affects", "helps", "leads to", or "results in".
- Use neutral verbs such as "describes" or "lists" when summarizing content.
- If a section heading includes a brief description, expand it using the explicitly stated details in the context.
- If the answer is not present, say:
  "The information is not available in the provided documents."

Context:
{context_text}

Question:
{query}

Answer:
"""
    answer = call_ollama(prompt)
    return answer


In [48]:
# End-to-end demo: retrieve supporting chunks for one question, show them,
# then ask the model for an answer grounded in those chunks.
query = "What factors affect construction project delays?"
contexts = retrieve(query)

print("Retrieved Context:\n")
for chunk, score in contexts:
    # One record per retrieved chunk: score line, chunk text, separator line.
    print(f"[score={score}]", chunk, "----", sep="\n")

answer = generate_answer(query, contexts)

print("\nFinal Answer:\n")
print(answer)


Retrieved Context:

[score=0.252]
# INDECIMAL — Customer Protection Policies, Quality System, and Guarantees (Internal Reference)
Version: 1.0  
Audience: Support, Ops, Project Management, AI Assistant Knowledge Base  
Last Updated: 2025-12-21  
## 1) Payment Safety & Stage Controls
### Escrow-Based Payment Model (Concept)
- Customer payments are made to an escrow account.
- A project manager verifies stage completion.
- Funds are disbursed to the construction partner after verification.
Purpose: reduce financial risk for customers and improve transparency and trust.
## 2) Delay Management & Accountability
### Zero-Tolerance Policy on Construction Delays (Operational Mechanisms)
Indecimal positions a system-driven approach to on-time delivery using:
- Integrated project management system
- Daily tracking of projects
- Instant flagging of deviations
- Automated task assignment
- Penalisation to reinforce accountability
## 3) Quality Assurance System
### 445+ Critical Checkpoints
- The q