### **Install Required Libraries**

In [30]:
!pip install -q sentence-transformers scikit-learn tiktoken numpy

### **Import Dependencies**



In [31]:
import json
import time
import numpy as np
from google.colab import files
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import tiktoken

## **Uploading input JSON FILES**

In [32]:
print("Upload EXACTLY 4 JSON files:")
print("• 2 chat JSON files (name must contain 'chat')")
print("• 2 context JSON files (name must contain 'context')")

uploaded_files = files.upload()
uploaded_filenames = list(uploaded_files.keys())

print("\nUploaded files:")
for f in uploaded_filenames:
    print(" -", f)

Upload EXACTLY 4 JSON files:
• 2 chat JSON files (name must contain 'chat')
• 2 context JSON files (name must contain 'context')


Saving sample_context_vectors-02.json to sample_context_vectors-02 (2).json
Saving sample_context_vectors-01.json to sample_context_vectors-01 (2).json
Saving sample-chat-conversation-01.json to sample-chat-conversation-01 (3).json
Saving sample-chat-conversation-02.json to sample-chat-conversation-02 (3).json

Uploaded files:
 - sample_context_vectors-02 (2).json
 - sample_context_vectors-01 (2).json
 - sample-chat-conversation-01 (3).json
 - sample-chat-conversation-02 (3).json


# **identify, filter, and validate the uploaded files**

In [33]:
import re

def extract_index(filename):
    """
    Extract the first number from the filename.
    Returns -1 if no number is found.
    """
    match = re.search(r'(\d+)', filename)
    return int(match.group(1)) if match else -1


In [34]:
# Separate chat and context files
chat_files = [f for f in uploaded_filenames if "chat" in f.lower()]
context_files = [f for f in uploaded_filenames if "context" in f.lower()]

# Build dictionaries: index -> filename
chat_dict = {extract_index(f): f for f in chat_files}
context_dict = {extract_index(f): f for f in context_files}

# Find common indices
common_indices = sorted(set(chat_dict.keys()) & set(context_dict.keys()))

if not common_indices:
    raise ValueError("❌ No matching chat-context file pairs found!")

# Final ordered lists
chat_files_ordered = [chat_dict[i] for i in common_indices]
context_files_ordered = [context_dict[i] for i in common_indices]

print("Ordered chat files:", chat_files_ordered)
print("Ordered context files:", context_files_ordered)


Ordered chat files: ['sample-chat-conversation-01 (3).json', 'sample-chat-conversation-02 (3).json']
Ordered context files: ['sample_context_vectors-01 (2).json', 'sample_context_vectors-02 (2).json']


In [35]:
'''chat_files = [f for f in uploaded_filenames if "chat" in f.lower()]
context_files = [f for f in uploaded_filenames if "context" in f.lower()]

if len(chat_files) != 2 or len(context_files) != 2:
    raise ValueError(
        "❌ Error: Upload exactly 2 chat JSON files and 2 context JSON files"
    )

print("\nChat JSONs:", chat_files)
print("Context JSONs:", context_files)'''

'chat_files = [f for f in uploaded_filenames if "chat" in f.lower()]\ncontext_files = [f for f in uploaded_filenames if "context" in f.lower()]\n\nif len(chat_files) != 2 or len(context_files) != 2:\n    raise ValueError(\n        "❌ Error: Upload exactly 2 chat JSON files and 2 context JSON files"\n    )\n\nprint("\nChat JSONs:", chat_files)\nprint("Context JSONs:", context_files)'

# **Safely load JSON data from  files**

In [36]:
def load_json(file_name):
    try:
        with open(file_name, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"❌ Error: Failed to decode JSON from file '{file_name}'. Please check the file for syntax errors or truncation.")
        print(f"  Details: {e}")
        raise  # Re-raise the exception after printing the informative message

chat_jsons = [load_json(f) for f in chat_files]
context_jsons = [load_json(f) for f in context_files]

# To Validate and debug an uploaded JSON file (uncomment it)

In [37]:
'''file_content = uploaded_files.get('sample_context_vectors-01 (2).json')

if file_content is None:
    print("❌ Error: 'sample-chat-conversation-01.json' not found in uploaded files.")
else:
    try:
        # Decode bytes to string
        file_string = file_content.decode('utf-8')
        json.loads(file_string)
        print("✅ 'sample-chat-conversation-01.json' is a valid JSON file.")
    except json.JSONDecodeError as e:
        print(f"❌ Error: Invalid JSON in 'sample-chat-conversation-01.json'.")
        print(f"  Details: {e}")

        # Extract the line number from the error message
        import re
        match = re.search(r'line (\d+) column (\d+)', str(e))
        if match:
            line_num = int(match.group(1))
            char_pos = int(match.group(2))
            lines = file_string.splitlines()

            # Print lines around the error
            start_line = max(0, line_num - 3)
            end_line = min(len(lines), line_num + 2)

            print(f"\nProblematic section around line {line_num} (character {char_pos}):")
            for i in range(start_line, end_line):
                line_content = lines[i]
                print(f"{i + 1:4d}: {line_content}")
                if i == line_num - 1:
                    print(f"      {' ' * (char_pos - 1)}^")
        else:
            print("Could not extract line and column number from error details.")'''

  match = re.search(r'line (\d+) column (\d+)', str(e))


'file_content = uploaded_files.get(\'sample_context_vectors-01 (2).json\')\n\nif file_content is None:\n    print("❌ Error: \'sample-chat-conversation-01.json\' not found in uploaded files.")\nelse:\n    try:\n        # Decode bytes to string\n        file_string = file_content.decode(\'utf-8\')\n        json.loads(file_string)\n        print("✅ \'sample-chat-conversation-01.json\' is a valid JSON file.")\n    except json.JSONDecodeError as e:\n        print(f"❌ Error: Invalid JSON in \'sample-chat-conversation-01.json\'.")\n        print(f"  Details: {e}")\n\n        # Extract the line number from the error message\n        import re\n        match = re.search(r\'line (\\d+) column (\\d+)\', str(e))\n        if match:\n            line_num = int(match.group(1))\n            char_pos = int(match.group(2))\n            lines = file_string.splitlines()\n\n            # Print lines around the error\n            start_line = max(0, line_num - 3)\n            end_line = min(len(lines), line

# **To Check the Data format inside the Json file**

In [38]:
from pprint import pprint
pprint(context_jsons[0])


{'data': {'sources': {'final_response': ['For Gopal Mansion, an '
                                         'air-conditioned room with TV and '
                                         'bath is Rs 800 per night.',
                                         'We also offer specially subsidized '
                                         'air-conditioned rooms at our clinic '
                                         'for Rs 2000 (US $50) per night, and '
                                         'non-AC rooms for Rs 1500 (US$ 40) '
                                         'per night, including free breakfast.',
                                         'Happy Home Hotel, which is a '
                                         '5-minute walk from our clinic, '
                                         'offers single rooms for Rs 1400 and '
                                         'double rooms for Rs 2000.',
                                         'To discuss your specific needs and '
            

## **Loading Embedding MODEL(Lighweight and Fast)**

In [39]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

## **Calculate Relevance & Completeness Scores**

In [40]:
def relevance_completeness_score(query, response, contexts):
    # Clean contexts again (defensive programming)
    contexts = [c.strip() for c in contexts if isinstance(c, str) and c.strip()]

    # 🚨 HARD GUARD — prevents (0, 384) error
    if len(contexts) == 0:
        return {
            "relevance_score": round(
                float(cosine_similarity(
                    embedding_model.encode([query]),
                    embedding_model.encode([response])
                )[0][0]), 3
            ),
            "completeness_score": None,
            "note": "Completeness skipped — no valid contexts"
        }

    texts = [query, response] + contexts
    embeddings = embedding_model.encode(texts)

    query_emb = embeddings[0].reshape(1, -1)
    response_emb = embeddings[1].reshape(1, -1)
    context_embs = embeddings[2:]

    # 🚨 SECOND GUARD — embeddings dropped internally
    if context_embs.shape[0] == 0:
        return {
            "relevance_score": round(
                float(cosine_similarity(query_emb, response_emb)[0][0]), 3
            ),
            "completeness_score": None,
            "note": "Completeness skipped — empty context embeddings"
        }

    relevance = cosine_similarity(query_emb, response_emb)[0][0]
    completeness = float(
        np.mean(cosine_similarity(response_emb, context_embs))
    )

    return {
        "relevance_score": round(float(relevance), 3),
        "completeness_score": round(completeness, 3)
    }


## **Hallucination/Factual Accuracy Detection**

In [41]:
def hallucination_detection(response, contexts, threshold=0.65):
    # Defensive cleaning
    contexts = [c.strip() for c in contexts if isinstance(c, str) and c.strip()]

    # 🚨 HARD GUARD — no contexts
    if len(contexts) == 0:
        return {
            "max_context_similarity": None,
            "hallucination_detected": None,
            "note": "Hallucination check skipped — no valid contexts"
        }

    response_emb = embedding_model.encode([response])
    context_embs = embedding_model.encode(contexts)

    # 🚨 SECOND GUARD — embeddings dropped internally
    if context_embs.shape[0] == 0:
        return {
            "max_context_similarity": None,
            "hallucination_detected": None,
            "note": "Hallucination check skipped — empty context embeddings"
        }

    similarities = cosine_similarity(response_emb, context_embs)[0]
    max_similarity = float(np.max(similarities))

    return {
        "max_context_similarity": round(max_similarity, 3),
        "hallucination_detected": max_similarity < threshold
    }


## **Latency Function**

In [42]:
import time

def track_latency(start_time):
    """
    Returns latency in seconds (float, rounded to 3 decimals).
    """
    latency = time.time() - start_time
    return round(latency, 3)


## **Cost Estimate Function**

In [43]:
def estimate_cost(text, cost_per_1k_tokens=0.002):
    """
    Estimates LLM cost in USD based on text length.
    """
    if not text or not isinstance(text, str):
        return 0.0

    # Approx token estimation
    tokens = len(text) / 4

    cost = (tokens / 1000) * cost_per_1k_tokens
    return round(cost, 5)


## **To extract and prepare textual context from your context JSON**

In [44]:
def clean_contexts(context_json):
    contexts = []

    # 1️⃣ Primary: retrieved passages
    final_responses = (
        context_json
        .get("data", {})
        .get("sources", {})
        .get("final_response", [])
    )

    for text in final_responses:
        if isinstance(text, str) and text.strip():
            contexts.append(text.strip())

    # 2️⃣ Fallback: full vector documents
    if not contexts:
        for v in context_json.get("data", {}).get("vector_data", []):
            text = v.get("text", "")
            if text and text.strip():
                contexts.append(text.strip())

    return contexts


## **Main Evaluation Pipeline**

In [45]:
def evaluate_llm_response(chat_json, context_json):
    start_time = time.time()

    turns = chat_json.get("conversation_turns", [])

    user_query = None
    ai_response = None

    for turn in reversed(turns):
        if not ai_response and turn.get("role") == "AI/Chatbot":
            ai_response = turn.get("message")
        elif not user_query and turn.get("role") == "User":
            user_query = turn.get("message")

        if user_query and ai_response:
            break

    if not user_query or not ai_response:
        return None

    contexts = clean_contexts(context_json)

    # Hard stop if no raw contexts
    if not contexts:
        return {
            "relevance_and_completeness": "SKIPPED — no contexts found",
            "hallucination_check": "SKIPPED — no contexts found",
            "latency_seconds": track_latency(start_time),
            "estimated_cost_usd": estimate_cost(user_query + ai_response)
        }

    relevance_results = relevance_completeness_score (
        user_query, ai_response, contexts
    )

    hallucination_results = hallucination_detection(
        ai_response, contexts
    )

    return {
        "relevance_and_completeness": relevance_results,
        "hallucination_check": hallucination_results,
        "latency_seconds": track_latency(start_time),
        "estimated_cost_usd": estimate_cost(user_query + ai_response)
    }


## **Run the Evaluation**

In [46]:
final_results = []

for i in range(len(common_indices)):
    chat_json = load_json(chat_files_ordered[i])
    context_json = load_json(context_files_ordered[i])

    evaluation = evaluate_llm_response(chat_json, context_json)

    final_results.append({
        "chat_file": chat_files_ordered[i],
        "context_file": context_files_ordered[i],
        "evaluation_result": evaluation
    })


## **FINAL OUTPUT**

In [47]:
print(json.dumps(final_results, indent=2))

[
  {
    "chat_file": "sample-chat-conversation-01 (3).json",
    "context_file": "sample_context_vectors-01 (2).json",
    "evaluation_result": {
      "relevance_and_completeness": {
        "relevance_score": 0.718,
        "completeness_score": 0.264
      },
      "hallucination_check": {
        "max_context_similarity": 0.575,
        "hallucination_detected": true
      },
      "latency_seconds": 0.706,
      "estimated_cost_usd": 0.0003
    }
  },
  {
    "chat_file": "sample-chat-conversation-02 (3).json",
    "context_file": "sample_context_vectors-02 (2).json",
    "evaluation_result": {
      "relevance_and_completeness": {
        "relevance_score": 0.343,
        "completeness_score": 0.046
      },
      "hallucination_check": {
        "max_context_similarity": 0.237,
        "hallucination_detected": true
      },
      "latency_seconds": 0.491,
      "estimated_cost_usd": 0.00036
    }
  }
]


In [48]:
import json
with open("evaluation_results.json", "w") as f:
    json.dump(final_results, f, indent=2)
