In [6]:
import os
import json
import time
from datetime import date
from dotenv import load_dotenv
import tiktoken
from openai import OpenAI
from pinecone import Pinecone

# --------------------------------------------------------------------------
# 1. SETUP
# --------------------------------------------------------------------------
# Merge variables from a local .env file (when one exists) into the environment.
load_dotenv()

OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENV = (
    os.getenv(var_name)
    for var_name in ("OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_ENV")
)

# Stop at the first missing/empty setting, checked in the order listed here.
for _value, _message in (
    (OPENAI_API_KEY, "❌ Missing OPENAI_API_KEY in environment or .env."),
    (PINECONE_API_KEY, "❌ Missing PINECONE_API_KEY in environment or .env."),
    (PINECONE_ENV, "❌ Missing PINECONE_ENV in environment or .env."),
):
    if not _value:
        raise ValueError(_message)

# Service clients shared by the rest of the script.
client = OpenAI(api_key=OPENAI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
index = pc.Index("ai-powered-chatbot")  # handle to the already-existing index

# Input knowledge base and (currently unused in this cell) embeddings dump location.
MERGED_JSON_PATH = r"C:/Users/osato/openai_setup/merged_knowledge_base.json"
EMBEDDINGS_OUTPUT_PATH = r"C:/Users/osato/openai_setup/knowledgebase_embeddings.json"

# --------------------------------------------------------------------------
# 2. LOAD MERGED KNOWLEDGE BASE
# --------------------------------------------------------------------------
# Fail early with an explicit message when the merged knowledge base is absent.
if not os.path.isfile(MERGED_JSON_PATH):
    raise FileNotFoundError(f"❌ File not found: {MERGED_JSON_PATH}")

with open(MERGED_JSON_PATH, encoding="utf-8") as f:
    knowledgebase = json.loads(f.read())

# A missing "qa_pairs" key is handled the same way as an empty list.
qa_pairs = knowledgebase.get("qa_pairs", [])
num_qas = len(qa_pairs)
print(f"✅ Loaded {num_qas} QA entries from: {MERGED_JSON_PATH}")

# Nothing to embed: report it and stop the script/cell here.
if not num_qas:
    print("⚠️ No QA pairs found. Nothing to embed. Exiting.")
    raise SystemExit

# --------------------------------------------------------------------------
# 3. TOKENIZER & CHUNKING UTILS
# --------------------------------------------------------------------------
# Encoding that tiktoken associates with "text-embedding-ada-002", the same model
# name embed_qa_pairs passes to the embeddings endpoint, so local token counts are
# taken with that model's tokenizer.
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def _split_oversized_word(word: str, max_tokens: int) -> list[str]:
    """Halve ``word`` recursively until every piece fits within ``max_tokens`` tokens."""
    # A single character cannot be divided further, so it is returned as-is even
    # in the (tiny ``max_tokens``) case where it still exceeds the budget.
    if len(word) <= 1 or count_tokens(word) <= max_tokens:
        return [word]
    middle = len(word) // 2
    return (
        _split_oversized_word(word[:middle], max_tokens)
        + _split_oversized_word(word[middle:], max_tokens)
    )


def split_text_by_tokens(text: str, max_tokens: int = 512) -> list[str]:
    """Split ``text`` into whitespace-joined chunks of at most ``max_tokens`` tokens.

    Words (as produced by ``str.split()``) are packed greedily: a word is added
    to the current chunk as long as the chunk, re-joined with single spaces,
    still measures ``<= max_tokens`` with ``count_tokens``. Original whitespace
    (newlines, repeated spaces) is therefore normalised to single spaces.

    A single word that is larger than ``max_tokens`` on its own is cut into
    smaller pieces (by repeated halving on character positions) so that it can
    no longer produce a chunk above the limit. The pieces are treated as
    separate words, which means such a word is split across chunks.

    Args:
        text: The text to split. Empty or whitespace-only text yields ``[]``.
        max_tokens: Upper bound on tokens per chunk; must be at least 1.

    Returns:
        The chunks in original word order.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    chunks: list[str] = []
    current_words: list[str] = []

    for word in text.split():
        current_words.append(word)
        if count_tokens(" ".join(current_words)) <= max_tokens:
            continue

        # The newest word overflowed the chunk: close the chunk without it.
        current_words.pop()
        if current_words:
            chunks.append(" ".join(current_words))

        if count_tokens(word) > max_tokens:
            # The word alone is over budget; emit all but the last piece as
            # their own chunks and let the last piece seed the next chunk.
            *full_pieces, last_piece = _split_oversized_word(word, max_tokens)
            chunks.extend(full_pieces)
            current_words = [last_piece]
        else:
            current_words = [word]

    # Whatever is still pending forms the final chunk.
    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

# --------------------------------------------------------------------------
# 4. EMBEDDING LOGIC (UPDATING PINECONE CORRECTLY)
# --------------------------------------------------------------------------
_ANSWER_FIELDS = ("main_points", "examples", "tips", "related_topics")


def _value_or(value, default):
    """Return ``default`` when ``value`` is None, otherwise ``value`` unchanged."""
    return default if value is None else value


def _text_items(value) -> list[str]:
    """Coerce one answer field into a list of strings that is safe to ``join``."""
    if value is None:
        return []
    if isinstance(value, str):
        # A bare string would otherwise be joined character by character.
        return [value]
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value if item is not None]
    return [str(value)]


def _combine_answer(answer) -> str:
    """Flatten a QA answer (dict, list, scalar or None) into one text block."""
    if answer is None:
        # A JSON null must count as "no answer", not as the text "None".
        return ""
    if isinstance(answer, dict):
        parts = [" ".join(_text_items(answer.get(key))) for key in _ANSWER_FIELDS]
        return "\n".join(parts).strip()
    if isinstance(answer, (list, tuple)):
        return " ".join(_text_items(answer)).strip()
    return str(answer).strip()


def embed_qa_pairs(pairs):
    """Embed every QA pair and upsert the resulting vectors into Pinecone.

    For each entry of ``pairs`` (dicts read with ``.get``):
      - The answer is flattened to text: the ``main_points``, ``examples``,
        ``tips`` and ``related_topics`` fields are each space-joined and then
        combined one per line. Non-dict answers are used as plain text.
      - Entries whose question or flattened answer is empty are skipped with a
        warning; a null answer counts as empty.
      - ``"Q: ...\\nA: ..."`` is embedded as a single chunk, or split into
        512-token chunks when it measures more than 8192 tokens.
      - Each chunk is upserted under the id ``"<doc_id>_chunk<n>"`` together
        with ``doc_id``, ``category``, ``source``, ``is_emergency`` and the
        chunk text as metadata.

    Null values for ``id``, ``category_id``, ``source`` and ``is_emergency``
    fall back to the same defaults as missing keys, so that no null reaches
    the vector metadata.

    A failure while embedding or upserting one chunk is printed and the run
    continues with the next chunk; the number of failed chunks is reported at
    the end.

    Args:
        pairs: Sequence of QA dictionaries.

    Returns:
        None. Progress, skips and errors are reported on stdout.
    """
    total = len(pairs)
    print(f"🔵 Embedding {total} QA pairs...")
    failed_chunks = 0

    for idx, qa in enumerate(pairs):
        doc_id = _value_or(qa.get("id"), f"qa_{idx}")
        category = _value_or(qa.get("category_id"), "unknown")
        source = _value_or(qa.get("source"), "unknown")  # "existing" or "extracted"
        is_emergency = _value_or(qa.get("is_emergency"), False)
        question = (qa.get("question") or "").strip()
        combined_answer = _combine_answer(qa.get("answer"))

        if not question or not combined_answer:
            print(f"⚠️ Skipping doc_id='{doc_id}' due to empty question or answer.")
            continue

        full_text = f"Q: {question}\nA: {combined_answer}"
        full_len = count_tokens(full_text)

        # Only texts above the threshold are broken up; everything else is
        # embedded as one vector.
        if full_len > 8192:
            text_chunks = split_text_by_tokens(full_text, max_tokens=512)
        else:
            text_chunks = [full_text]

        print(f"   • {idx+1}/{total} => doc_id='{doc_id}', {len(text_chunks)} chunk(s), tokens={full_len}")

        for c_idx, chunk_str in enumerate(text_chunks):
            try:
                # Generate embedding
                resp = client.embeddings.create(
                    model="text-embedding-ada-002",
                    input=chunk_str
                )
                vector = resp.data[0].embedding

                chunk_id = f"{doc_id}_chunk{c_idx}"
                index.upsert([
                    {
                        "id": chunk_id,
                        "values": vector,
                        "metadata": {
                            "doc_id": doc_id,
                            "category": category,
                            "source": source,
                            "is_emergency": is_emergency,
                            "text_chunk": chunk_str
                        }
                    }
                ])

                time.sleep(0.2)  # Avoid rate limits

            except Exception as e:
                # Best effort: one bad chunk must not abort the whole run.
                failed_chunks += 1
                print(f"❌ Error embedding doc_id={doc_id}, chunk={c_idx}: {e}")

    if failed_chunks:
        print(f"⚠️ {failed_chunks} chunk(s) could not be embedded or upserted; see errors above.")

# --------------------------------------------------------------------------
# 5. MAIN - EMBED & UPSERT TO PINECONE
# --------------------------------------------------------------------------
# Entry point: embed every loaded QA pair and push the vectors to Pinecone.
# The completion message is printed once embed_qa_pairs returns; per-chunk errors
# are reported inside embed_qa_pairs and do not prevent this line from running.
if __name__ == "__main__":
    embed_qa_pairs(qa_pairs)
    print("✅ Embedding process completed.")

✅ Loaded 415 QA entries from: C:/Users/osato/openai_setup/merged_knowledge_base.json
🔵 Embedding 415 QA pairs...
   • 1/415 => doc_id='time_and_task_management_001', 1 chunk(s), tokens=207
   • 2/415 => doc_id='time_and_task_management_002', 1 chunk(s), tokens=106
   • 3/415 => doc_id='time_and_task_management_003', 1 chunk(s), tokens=106
   • 4/415 => doc_id='time_and_task_management_004', 1 chunk(s), tokens=57
   • 5/415 => doc_id='time_and_task_management_005', 1 chunk(s), tokens=94
   • 6/415 => doc_id='time_and_task_management_006', 1 chunk(s), tokens=391
   • 7/415 => doc_id='time_and_task_management_007', 1 chunk(s), tokens=106
   • 8/415 => doc_id='time_and_task_management_008', 1 chunk(s), tokens=81
   • 9/415 => doc_id='time_and_task_management_009', 1 chunk(s), tokens=85
   • 10/415 => doc_id='time_and_task_management_010', 1 chunk(s), tokens=88
   • 11/415 => doc_id='time_and_task_management_011', 1 chunk(s), tokens=79
   • 12/415 => doc_id='time_and_task_management_012', 1

   • 103/415 => doc_id='quick_tips_for_personal_finance_and_budgeting_001', 1 chunk(s), tokens=168
   • 104/415 => doc_id='digital_literacy_tools_001', 1 chunk(s), tokens=101
   • 105/415 => doc_id='digital_literacy_tools_002', 1 chunk(s), tokens=78
   • 106/415 => doc_id='digital_literacy_tools_003', 1 chunk(s), tokens=120
   • 107/415 => doc_id='digital_literacy_tools_004', 1 chunk(s), tokens=105
   • 108/415 => doc_id='digital_literacy_tools_005', 1 chunk(s), tokens=128
   • 109/415 => doc_id='quick_tips_for_digital_literacy_tools_001', 1 chunk(s), tokens=182
   • 110/415 => doc_id='advice_for_freshers_001', 1 chunk(s), tokens=116
   • 111/415 => doc_id='advice_for_freshers_002', 1 chunk(s), tokens=108
   • 112/415 => doc_id='advice_for_freshers_003', 1 chunk(s), tokens=118
   • 113/415 => doc_id='advice_for_freshers_004', 1 chunk(s), tokens=114
   • 114/415 => doc_id='advice_for_freshers_005', 1 chunk(s), tokens=140
   • 115/415 => doc_id='quick_tips_for_freshers_001', 1 chunk(s), 

   • 197/415 => doc_id='student_check_in_timetables_and_welcome_activities_012', 1 chunk(s), tokens=114
   • 198/415 => doc_id='student_check_in_timetables_and_welcome_activities_013', 1 chunk(s), tokens=161
   • 199/415 => doc_id='student_check_in_timetables_and_welcome_activities_014', 1 chunk(s), tokens=58
   • 200/415 => doc_id='extensions_and_late_submissions_at_university_of_wolverhampton_015', 1 chunk(s), tokens=55
   • 201/415 => doc_id='extensions_and_late_submissions_at_university_of_wolverhampton_016', 1 chunk(s), tokens=119
   • 202/415 => doc_id='extensions_and_late_submissions_at_university_of_wolverhampton_017', 1 chunk(s), tokens=55
   • 203/415 => doc_id='extensions_and_late_submissions_at_university_of_wolverhampton_018', 1 chunk(s), tokens=43
   • 204/415 => doc_id='extensions_and_late_submissions_at_university_of_wolverhampton_019', 1 chunk(s), tokens=78
   • 205/415 => doc_id='extensions_and_late_submissions_at_university_of_wolverhampton_020', 1 chunk(s), tokens=3

   • 278/415 => doc_id='how_to_make_studying_fun_and_less_stressful_021', 1 chunk(s), tokens=64
   • 279/415 => doc_id='how_to_make_studying_fun_and_less_stressful_022', 1 chunk(s), tokens=64
   • 280/415 => doc_id='how_to_make_studying_fun_and_less_stressful_023', 1 chunk(s), tokens=52
   • 281/415 => doc_id='how_to_make_studying_fun_and_less_stressful_024', 1 chunk(s), tokens=51
   • 282/415 => doc_id='how_to_make_studying_fun_and_less_stressful_025', 1 chunk(s), tokens=67
   • 283/415 => doc_id='how_to_make_studying_fun_and_less_stressful_026', 1 chunk(s), tokens=70
   • 284/415 => doc_id='placement_and_internship_information_at_university_of_wolverhampton_027', 1 chunk(s), tokens=86
   • 285/415 => doc_id='placement_and_internship_information_at_university_of_wolverhampton_028', 1 chunk(s), tokens=73
   • 286/415 => doc_id='placement_and_internship_information_at_university_of_wolverhampton_029', 1 chunk(s), tokens=134
   • 287/415 => doc_id='placement_and_internship_information_at

   • 358/415 => doc_id='facing_culture_shock_039', 1 chunk(s), tokens=57
   • 359/415 => doc_id='facing_culture_shock_040', 1 chunk(s), tokens=67
   • 360/415 => doc_id='facing_culture_shock_041', 1 chunk(s), tokens=57
   • 361/415 => doc_id='accessing_support_at_university_001', 1 chunk(s), tokens=53
   • 362/415 => doc_id='accessing_support_at_university_002', 1 chunk(s), tokens=95
   • 363/415 => doc_id='accessing_support_at_university_003', 1 chunk(s), tokens=77
   • 364/415 => doc_id='accessing_support_at_university_004', 1 chunk(s), tokens=56
   • 365/415 => doc_id='accessing_support_at_university_005', 1 chunk(s), tokens=75
   • 366/415 => doc_id='accessing_support_at_university_006', 1 chunk(s), tokens=72
   • 367/415 => doc_id='accessing_support_at_university_007', 1 chunk(s), tokens=47
   • 368/415 => doc_id='accessing_support_at_university_008', 1 chunk(s), tokens=111
   • 369/415 => doc_id='accessing_support_at_university_009', 1 chunk(s), tokens=87
   • 370/415 => doc_id='

In [9]:
!pip install tiktoken



In [None]:
pip show openai
