In [5]:
import os
import json
import re

def load_json_safely(file_path):
    """Load JSON from file_path, returning the parsed value or None on failure.

    Every failure mode is reported with a printed message and turned into a
    ``None`` return instead of an exception, so callers only need a single
    ``is None`` check.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever type the document's top level has),
        or None if the path is not a file, cannot be read, is not valid
        UTF-8, or is not valid JSON.
    """
    if not os.path.isfile(file_path):
        print(f"❌ File not found: {file_path}")
        return None
    try:
        # "utf-8-sig" reads plain UTF-8 and also skips a leading BOM, which
        # json.load would otherwise reject as invalid JSON.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"❌ Could not decode JSON from {file_path}: {e}")
    except (OSError, UnicodeDecodeError) as e:
        # The file can vanish or be unreadable after the isfile() check, or
        # contain bytes that are not UTF-8; neither should crash the caller.
        print(f"❌ Could not read {file_path}: {e}")
    return None

def add_emergency_flag(qa_pairs, emergency_keywords):
    """Flag QA pairs whose leading answer text mentions an emergency keyword.

    Each QA pair is modified in place:
      - A plain-string "answer" is stripped and wrapped in a dictionary with
        the keys "main_points", "examples", "tips" and "related_topics".
        Answers of any other type are left exactly as they are.
      - The first entry of the answer's "main_points" is lowercased and
        searched for each keyword (substring match). A "main_points" value
        that is itself a string is searched as a whole.
      - When a keyword is found, "is_emergency" is set to True. The key is
        never set to False or removed.

    Pairs with no usable text (missing or None answer, no "main_points",
    empty list, non-string first entry) are simply not flagged.

    Args:
        qa_pairs: Iterable of QA dictionaries; non-dict items are skipped.
        emergency_keywords: Iterable of keyword strings. Matching is
            case-insensitive; empty keywords are ignored because an empty
            string is a substring of every text.

    Returns:
        None. The dictionaries in qa_pairs are updated in place.
    """
    # Normalise once: the text is lowercased below, so the keywords must be
    # too, and materialising here also makes a one-shot iterator reusable.
    keywords = [str(keyword).lower() for keyword in emergency_keywords if keyword]

    for qa in qa_pairs:
        if not isinstance(qa, dict):
            continue

        answer = qa.get("answer")
        # If the answer is a plain string, wrap it in a dictionary.
        if isinstance(answer, str):
            answer = {
                "main_points": [answer.strip()],
                "examples": [],
                "tips": [],
                "related_topics": []
            }
            qa["answer"] = answer

        main_points = answer.get("main_points") if isinstance(answer, dict) else None

        # Pick the text to scan without assuming the container's shape.
        if isinstance(main_points, str):
            main_text = main_points
        elif (
            isinstance(main_points, (list, tuple))
            and main_points
            and isinstance(main_points[0], str)
        ):
            main_text = main_points[0]
        else:
            main_text = ""

        main_text = main_text.lower()
        if any(keyword in main_text for keyword in keywords):
            qa["is_emergency"] = True

def merge_knowledge_bases(existing_json_path, new_extracted_json_path, merged_json_path):
    """Merge an extracted QA file into an existing knowledge base and save it.

    Both inputs must be JSON objects containing "categories" and "qa_pairs".
    The merged result is the existing knowledge base with:
      - every QA pair tagged with a "source" ("existing" unless already
        tagged; new pairs are always tagged "extracted"),
      - emergency flags applied to existing and new QA pairs,
      - new categories appended when their "id" is not already present,
      - all new QA pairs appended after the existing ones,
      - "metadata" counts refreshed.

    Problems are reported with a printed message and the function returns
    early; nothing is written unless both inputs load and validate.

    Args:
        existing_json_path: Path of the current knowledge base JSON.
        new_extracted_json_path: Path of the newly extracted QA JSON.
        merged_json_path: Path the merged knowledge base is written to.

    Returns:
        None.
    """
    # 1. Load existing data
    existing_data = load_json_safely(existing_json_path)
    if existing_data is None:
        print("❌ Aborting: missing or invalid existing knowledge base JSON.")
        return

    # 2. Load new extracted data
    new_data = load_json_safely(new_extracted_json_path)
    if new_data is None:
        print("❌ Aborting: missing or invalid extracted QA JSON.")
        return

    # 3. Check both structures up front, before anything is mutated, so an
    #    unusable input is rejected without doing any tagging work first.
    required_keys = ("categories", "qa_pairs")
    if not isinstance(existing_data, dict) or any(key not in existing_data for key in required_keys):
        print("❌ Error: 'categories' or 'qa_pairs' missing in existing knowledge base.")
        return

    # 4. Check new_data structure
    if not isinstance(new_data, dict) or any(key not in new_data for key in required_keys):
        print("❌ Error: 'categories' or 'qa_pairs' missing in extracted data.")
        return

    # Define keywords that indicate generic emergency advice.
    emergency_keywords = ["i need help now", "emergency", "urgent", "crisis"]

    # Process existing QA pairs:
    print("🔍 Checking existing QA pairs for emergency keywords...")
    for qa in existing_data["qa_pairs"]:
        # Tag existing QA pairs if not already tagged.
        qa.setdefault("source", "existing")
    add_emergency_flag(existing_data["qa_pairs"], emergency_keywords)

    # Process new QA pairs for emergency flag.
    print("🔍 Checking new QA pairs for emergency keywords...")
    for qa in new_data["qa_pairs"]:
        print(f"QA question: {qa.get('question')}, answer type: {type(qa.get('answer'))}")
        # Tag new (extracted) QA pairs.
        qa["source"] = "extracted"
    add_emergency_flag(new_data["qa_pairs"], emergency_keywords)

    # 5. Merge categories (append only those that are new).
    #    Existing categories without an "id" are ignored when building the
    #    lookup instead of raising KeyError.
    existing_cat_ids = {cat["id"] for cat in existing_data["categories"] if "id" in cat}
    for cat in new_data["categories"]:
        cat_id = cat.get("id")
        if cat_id not in existing_cat_ids:
            existing_data["categories"].append(cat)
            existing_cat_ids.add(cat_id)
        # Categories whose id already exists are kept as-is; subcategories
        # are not merged.

    # 6. Merge QA pairs (append new ones to existing ones)
    existing_data["qa_pairs"].extend(new_data["qa_pairs"])

    # 7. Update metadata, creating the section when the existing knowledge
    #    base has none (or has something that is not an object).
    metadata = existing_data.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}
        existing_data["metadata"] = metadata
    metadata["topics_count"] = len(existing_data["categories"])
    metadata["qa_pairs_count"] = len(existing_data["qa_pairs"])

    # 8. Save merged JSON
    try:
        with open(merged_json_path, "w", encoding="utf-8") as f:
            json.dump(existing_data, f, indent=2, ensure_ascii=False)
        print(f"✅ Merged knowledge base saved to: {merged_json_path}")
    except Exception as e:
        print(f"❌ Error saving merged knowledge base: {e}")

if __name__ == "__main__":
    # Example usage: point base_dir at the folder holding your JSON files.
    base_dir = "C:/Users/osato/openai_setup"

    # Input knowledge base, extracted QA file, and merged output, in that order.
    existing_kb_path, extracted_kb_path, merged_output_path = (
        os.path.join(base_dir, file_name)
        for file_name in (
            "knowledge_base.json",
            "extracted_structured.json",
            "merged_knowledge_base.json",
        )
    )

    merge_knowledge_bases(existing_kb_path, extracted_kb_path, merged_output_path)


🔍 Checking existing QA pairs for emergency keywords...
🔍 Checking new QA pairs for emergency keywords...
QA question: What is the background of this mental health initiative?, answer type: <class 'dict'>
QA question: What are the main data sources for this report?, answer type: <class 'dict'>
QA question: What are the key themes identified in the report?, answer type: <class 'dict'>
QA question: How was the Student Perspectives Questionnaire conducted?, answer type: <class 'dict'>
QA question: What do students identify as key barriers to support?, answer type: <class 'dict'>
QA question: What is the ideal whole-university approach according to students?, answer type: <class 'dict'>
QA question: How do students define mental health?, answer type: <class 'dict'>
QA question: What role do academic tutors play in student mental health?, answer type: <class 'dict'>
QA question: What are the conclusions for future student engagement?, answer type: <class 'dict'>
QA question: Who are you most

In [6]:
import json
import os
import time
from openai import OpenAI
from pinecone import Pinecone

# Load environment variables before the os.getenv() calls below read them.
# (presumably from a local .env file via python-dotenv — confirm where the keys live)
from dotenv import load_dotenv
load_dotenv()

# os.getenv returns None for an unset variable; nothing here checks for that,
# so a missing key only surfaces when the clients below are built or used.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")

# Module-level clients used by index_qa_pairs: `client` for embeddings,
# `index` for upserts into the "ai-powered-chatbot" Pinecone index.
# NOTE(review): confirm the installed pinecone client still uses `environment`.
client = OpenAI(api_key=OPENAI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
index = pc.Index("ai-powered-chatbot")

# Path to merged knowledge base (absolute, machine-specific path).
MERGED_JSON_PATH = "C:/Users/osato/openai_setup/merged_knowledge_base.json"

# Stop the cell with a clear message if the merged file is not there.
if not os.path.isfile(MERGED_JSON_PATH):
    raise FileNotFoundError("❌ Merged knowledge base file not found.")

# Load merged knowledge base
with open(MERGED_JSON_PATH, "r", encoding="utf-8") as f:
    knowledgebase = json.load(f)

# A file without a "qa_pairs" key yields an empty list rather than an error.
qa_pairs = knowledgebase.get("qa_pairs", [])
print(f"✅ Loaded {len(qa_pairs)} QA pairs from merged file.")

# Re-index the knowledge base
def _answer_text(answer):
    """Flatten a QA answer into one stripped string, whatever its shape."""
    # Dict answers carry their text in "main_points"; anything else (plain
    # string, None, ...) is treated as the text itself.
    points = answer.get("main_points", []) if isinstance(answer, dict) else answer
    if points is None:
        return ""
    if isinstance(points, (list, tuple)):
        # str() keeps a stray non-string entry from breaking the join.
        return " ".join(str(point) for point in points if point is not None).strip()
    return str(points).strip()


def index_qa_pairs(pairs, embedding_model="text-embedding-ada-002", delay_seconds=0.1):
    """Embed each QA pair and upsert it into the vector index.

    For every pair the text ``"Q: <question>\\nA: <answer>"`` is embedded
    with the module-level ``client`` and written to the module-level
    ``index`` under the id ``qa_<position>``, with the question and answer
    stored as metadata. One embedding request and one upsert are issued per
    pair.

    Ids are positional, so re-running over a reordered or shorter list
    overwrites by position and does not remove ids beyond the new length.

    Args:
        pairs: Iterable of QA dictionaries. The answer may be a dictionary
            with "main_points" (list or string), a plain string, or
            missing/None (indexed with an empty answer). Non-dict items are
            skipped but still consume their position.
        embedding_model: Name of the embedding model to request.
        delay_seconds: Pause after each pair to stay under rate limits;
            zero or a negative value disables the pause.

    Returns:
        None.
    """
    for idx, qa in enumerate(pairs):
        if not isinstance(qa, dict):
            continue

        # `or ""` covers an explicit null question as well as a missing one.
        question = str(qa.get("question") or "").strip()
        answer = _answer_text(qa.get("answer"))

        full_text = f"Q: {question}\nA: {answer}"

        # Generate embeddings
        response = client.embeddings.create(
            model=embedding_model,
            input=full_text
        )
        vector = response.data[0].embedding

        index.upsert([
            {
                "id": f"qa_{idx}",
                "values": vector,
                "metadata": {
                    "question": question,
                    "answer": answer
                }
            }
        ])

        if delay_seconds > 0:
            time.sleep(delay_seconds)  # Avoid hitting rate limits

# Re-index every loaded QA pair. index_qa_pairs makes one embedding request and
# one upsert per pair and sleeps between pairs, so run time grows with len(qa_pairs).
print("🔄 Re-indexing QA pairs...")
index_qa_pairs(qa_pairs)
print("✅ Pinecone index updated.")


✅ Loaded 231 QA pairs from merged file.
🔄 Re-indexing QA pairs...
✅ Pinecone index updated.
