In [15]:
import os
import json
import time
from datetime import date
from dotenv import load_dotenv
import tiktoken
from openai import OpenAI
from pinecone import Pinecone

# --------------------------------------------------------------------------
# 1. SETUP
# --------------------------------------------------------------------------
# Merge variables from a local .env file (when one exists) into the process
# environment before any credential is read.
load_dotenv()

# Read every credential up front; the checks below run in this same order.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")

# Fail fast with a descriptive error when a setting is absent or empty.
if not OPENAI_API_KEY:
    raise ValueError("❌ Missing OPENAI_API_KEY in environment or .env.")
if not PINECONE_API_KEY:
    raise ValueError("❌ Missing PINECONE_API_KEY in environment or .env.")
if not PINECONE_ENV:
    raise ValueError("❌ Missing PINECONE_ENV in environment or .env.")

# Service clients shared by the rest of the script.
client = OpenAI(api_key=OPENAI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
index = pc.Index("ai-powered-chatbot")  # handle to the index with this name

# Locations of the merged knowledge base (input) and the embeddings file.
MERGED_JSON_PATH = r"C:/Users/osato/openai_setup/merged_knowledge_base.json"
EMBEDDINGS_OUTPUT_PATH = r"C:/Users/osato/openai_setup/knowledgebase_embeddings.json"

# --------------------------------------------------------------------------
# 2. LOAD MERGED KNOWLEDGE BASE
# --------------------------------------------------------------------------
# Stop early with a clear message when the merged knowledge base is missing.
if not os.path.isfile(MERGED_JSON_PATH):
    raise FileNotFoundError(f"❌ File not found: {MERGED_JSON_PATH}")

with open(MERGED_JSON_PATH, "r", encoding="utf-8") as f:
    knowledgebase = json.load(f)

# The code below reads the "qa_pairs" key, so the JSON root must be an object.
# Checking here replaces an opaque AttributeError with an actionable message.
if not isinstance(knowledgebase, dict):
    raise ValueError(
        f"❌ Expected a JSON object at the top level of {MERGED_JSON_PATH}, "
        f"got {type(knowledgebase).__name__}."
    )

# A key that is present but null is treated the same as a missing key.
qa_pairs = knowledgebase.get("qa_pairs")
if qa_pairs is None:
    qa_pairs = []
if not isinstance(qa_pairs, list):
    raise ValueError(
        f"❌ Expected 'qa_pairs' to be a list in {MERGED_JSON_PATH}, "
        f"got {type(qa_pairs).__name__}."
    )

num_qas = len(qa_pairs)
print(f"✅ Loaded {num_qas} QA entries from: {MERGED_JSON_PATH}")

if num_qas == 0:
    print("⚠️ No QA pairs found. Nothing to embed. Exiting.")
    raise SystemExit

# --------------------------------------------------------------------------
# 3. TOKENIZER & CHUNKING UTILS
# --------------------------------------------------------------------------
# Encoding that tiktoken associates with the text-embedding-ada-002 model.
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def _split_oversized_word(word, max_tokens, token_counter):
    """Cut a single word into consecutive pieces that each fit in max_tokens.

    A piece is the longest prefix (found by binary search over its length)
    whose token count is within the budget. If even one character exceeds
    the budget, that character is emitted alone, since it cannot be split
    any further.
    """
    pieces = []
    remaining = word
    while remaining and token_counter(remaining) > max_tokens:
        # The whole remainder is known to be too large, so search strictly
        # shorter prefixes. `best` only moves to lengths verified to fit.
        low, high, best = 1, len(remaining) - 1, 1
        while low <= high:
            mid = (low + high) // 2
            if token_counter(remaining[:mid]) <= max_tokens:
                best = mid
                low = mid + 1
            else:
                high = mid - 1
        pieces.append(remaining[:best])
        remaining = remaining[best:]
    if remaining:
        pieces.append(remaining)
    return pieces


def split_text_by_tokens(text: str, max_tokens: int = 512, token_counter=None) -> list[str]:
    """Split text into chunks of whole words with at most max_tokens tokens each.

    Words are the whitespace-separated pieces of ``text``; within a chunk they
    are re-joined with single spaces, so the original whitespace is not kept.
    A word that exceeds ``max_tokens`` on its own is cut into several pieces
    instead of being emitted as one oversized chunk.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk; must be at least 1.
        token_counter: Optional callable mapping a string to its token count.
            Defaults to the module-level ``count_tokens``.

    Returns:
        The chunks in their original order; an empty list when ``text``
        contains no words.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer.")
    if token_counter is None:
        token_counter = count_tokens

    chunks = []
    current_words = []

    for word in text.split():
        current_words.append(word)
        if token_counter(" ".join(current_words)) <= max_tokens:
            continue

        # The new word overflowed the chunk: close whatever preceded it.
        current_words.pop()
        if current_words:
            chunks.append(" ".join(current_words))
            word_fits = token_counter(word) <= max_tokens
        else:
            # Nothing preceded the word, so the word alone is over budget.
            word_fits = False

        if word_fits:
            current_words = [word]
        else:
            # Emit the full pieces now; the last piece starts the next chunk
            # so that following words can still be packed next to it.
            *full_pieces, tail = _split_oversized_word(word, max_tokens, token_counter)
            chunks.extend(full_pieces)
            current_words = [tail]

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

# --------------------------------------------------------------------------
# 4. EMBEDDING LOGIC (UPDATING PINECONE CORRECTLY)
# --------------------------------------------------------------------------
def _value_or_default(mapping, key, default):
    """Return mapping[key], or default when the key is missing or its value is None."""
    value = mapping.get(key)
    return default if value is None else value


def _as_text_items(value):
    """Return an answer field as a list of strings that is safe to join."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value if item is not None]
    # A bare string (or other scalar) counts as one item; joining a string
    # directly would interleave spaces between its characters.
    return [str(value)]


def _build_answer_text(answer):
    """Flatten a QA answer (dict, scalar or None) into the text that is embedded."""
    if answer is None:
        # A null answer is "no answer", not the literal text "None".
        return ""
    if not isinstance(answer, dict):
        answer = {"main_points": [str(answer).strip()]}
    sections = (
        " ".join(_as_text_items(answer.get(field)))
        for field in ("main_points", "examples", "tips", "related_topics")
    )
    return "\n".join(sections).strip()


def embed_qa_pairs(pairs, max_input_tokens: int = 8191, chunk_tokens: int = 512):
    """Embed each QA pair and upsert the resulting vectors into Pinecone.

    For every pair the question and the flattened answer are combined into
    one text. Texts longer than ``max_input_tokens`` are split with
    ``split_text_by_tokens``; each chunk is embedded with
    text-embedding-ada-002 and upserted under the id ``<doc_id>_chunk<n>``
    together with its metadata. Pairs with an empty question or answer are
    skipped. A failure on one chunk is reported and counted, and processing
    continues with the next chunk.

    Args:
        pairs: Sequence of QA dicts. The keys read are ``id``,
            ``category_id``, ``source``, ``is_emergency``, ``question`` and
            ``answer``.
        max_input_tokens: Largest token count sent to the embedding model in
            one piece. The default of 8191 is the documented input limit of
            text-embedding-ada-002.
        chunk_tokens: Token budget per chunk when a text has to be split.

    Returns:
        None. Progress, skipped pairs and errors are printed.
    """
    total = len(pairs)
    print(f"🔵 Embedding {total} QA pairs...")

    failed_chunks = 0

    for idx, qa in enumerate(pairs):
        # Missing and null fields both fall back to the default, so that no
        # null value is forwarded as metadata.
        doc_id = _value_or_default(qa, "id", f"qa_{idx}")
        category = _value_or_default(qa, "category_id", "unknown")
        source = _value_or_default(qa, "source", "unknown")  # "existing" or "extracted"
        is_emergency = _value_or_default(qa, "is_emergency", False)
        question = (qa.get("question") or "").strip()

        combined_answer = _build_answer_text(qa.get("answer", {}))

        if not question or not combined_answer:
            print(f"⚠️ Skipping doc_id='{doc_id}' due to empty question or answer.")
            continue

        full_text = f"Q: {question}\nA: {combined_answer}"
        full_len = count_tokens(full_text)

        # Split only when the text would not fit into a single model input.
        if full_len > max_input_tokens:
            text_chunks = split_text_by_tokens(full_text, max_tokens=chunk_tokens)
        else:
            text_chunks = [full_text]

        print(f"   • {idx+1}/{total} => doc_id='{doc_id}', {len(text_chunks)} chunk(s), tokens={full_len}")

        for c_idx, chunk_str in enumerate(text_chunks):
            chunk_id = f"{doc_id}_chunk{c_idx}"
            try:
                # Generate embedding
                resp = client.embeddings.create(
                    model="text-embedding-ada-002",
                    input=chunk_str
                )
                vector = resp.data[0].embedding

                index.upsert([
                    {
                        "id": chunk_id,
                        "values": vector,
                        "metadata": {
                            "doc_id": doc_id,
                            "category": category,
                            "source": source,
                            "is_emergency": is_emergency,
                            "text_chunk": chunk_str
                        }
                    }
                ])
            except Exception as e:
                # Best-effort boundary: one failing chunk must not abort the
                # whole run, but it is counted so the run does not look clean.
                failed_chunks += 1
                print(f"❌ Error embedding doc_id={doc_id}, chunk={c_idx}: {e}")

            # Pause after every attempt, failed ones included, to stay under
            # the services' rate limits.
            time.sleep(0.2)

    if failed_chunks:
        print(f"⚠️ {failed_chunks} chunk(s) failed to embed or upsert; see errors above.")

# --------------------------------------------------------------------------
# 5. MAIN - EMBED & SAVE
# --------------------------------------------------------------------------
# Entry point: runs only when this module is executed directly, not on import.
if __name__ == "__main__":
    # Embed every loaded QA pair and upsert the vectors into the Pinecone index.
    embed_qa_pairs(qa_pairs)
    # Printed unconditionally once embed_qa_pairs returns; per-chunk errors are
    # reported inside embed_qa_pairs and do not prevent this message.
    print("✅ Embedding process completed.")

✅ Loaded 231 QA entries from: C:/Users/osato/openai_setup/merged_knowledge_base.json
🔵 Embedding 231 QA pairs...
   • 1/231 => doc_id='time_and_task_management_001', 1 chunk(s), tokens=191
   • 2/231 => doc_id='time_and_task_management_002', 1 chunk(s), tokens=106
   • 3/231 => doc_id='time_and_task_management_003', 1 chunk(s), tokens=106
   • 4/231 => doc_id='time_and_task_management_004', 1 chunk(s), tokens=106
   • 5/231 => doc_id='time_and_task_management_005', 1 chunk(s), tokens=81
   • 6/231 => doc_id='time_and_task_management_006', 1 chunk(s), tokens=85
   • 7/231 => doc_id='time_and_task_management_007', 1 chunk(s), tokens=88
   • 8/231 => doc_id='time_and_task_management_008', 1 chunk(s), tokens=79
   • 9/231 => doc_id='time_and_task_management_009', 1 chunk(s), tokens=72
   • 10/231 => doc_id='time_and_task_management_010', 1 chunk(s), tokens=83
   • 11/231 => doc_id='quick_tips_for_success_001', 1 chunk(s), tokens=274
   • 12/231 => doc_id='academic_performance_001', 1 chunk

   • 102/231 => doc_id='digital_literacy_tools_002', 1 chunk(s), tokens=78
   • 103/231 => doc_id='digital_literacy_tools_003', 1 chunk(s), tokens=120
   • 104/231 => doc_id='digital_literacy_tools_004', 1 chunk(s), tokens=105
   • 105/231 => doc_id='digital_literacy_tools_005', 1 chunk(s), tokens=128
   • 106/231 => doc_id='quick_tips_for_digital_literacy_tools_001', 1 chunk(s), tokens=182
   • 107/231 => doc_id='advice_for_freshers_001', 1 chunk(s), tokens=116
   • 108/231 => doc_id='advice_for_freshers_002', 1 chunk(s), tokens=108
   • 109/231 => doc_id='advice_for_freshers_003', 1 chunk(s), tokens=118
   • 110/231 => doc_id='advice_for_freshers_004', 1 chunk(s), tokens=114
   • 111/231 => doc_id='advice_for_freshers_005', 1 chunk(s), tokens=140
   • 112/231 => doc_id='quick_tips_for_freshers_001', 1 chunk(s), tokens=183
   • 113/231 => doc_id='international_students_cultural_adaptation_001', 1 chunk(s), tokens=98
   • 114/231 => doc_id='international_students_cultural_adaptation_00

   • 195/231 => doc_id='wellbeing_support_from_student_life_014', 1 chunk(s), tokens=109
   • 196/231 => doc_id='wellbeing_support_from_student_life_015', 1 chunk(s), tokens=87
   • 197/231 => doc_id='wellbeing_support_from_student_life_016', 1 chunk(s), tokens=101
   • 198/231 => doc_id='wellbeing_support_from_student_life_017', 1 chunk(s), tokens=56
   • 199/231 => doc_id='wellbeing_support_from_student_life_018', 1 chunk(s), tokens=117
   • 200/231 => doc_id='join_wlv_student_life_connect_for_24_7_online_support_019', 1 chunk(s), tokens=88
   • 201/231 => doc_id='join_wlv_student_life_connect_for_24_7_online_support_020', 1 chunk(s), tokens=93
   • 202/231 => doc_id='join_wlv_student_life_connect_for_24_7_online_support_021', 1 chunk(s), tokens=62
   • 203/231 => doc_id='weekly_mental_health_and_wellbeing_workshops_at_university_of_wolverhampton_022', 1 chunk(s), tokens=72
   • 204/231 => doc_id='weekly_mental_health_and_wellbeing_workshops_at_university_of_wolverhampton_023', 1 chu

In [9]:
!pip install tiktoken



In [None]:
pip show openai
