# Phase 2: Vector DB / RAG setup (with Pinecone & GPT-3.5)

### Step 1: Set up Pinecone

In [18]:
!pip install pinecone
!pip install python-dotenv

Collecting pinecone
  Using cached pinecone-8.0.0-py3-none-any.whl.metadata (11 kB)
Collecting certifi>=2019.11.17 (from pinecone)
  Using cached certifi-2025.11.12-py3-none-any.whl.metadata (2.5 kB)
Collecting orjson>=3.0.0 (from pinecone)
  Using cached orjson-3.11.4-cp313-cp313-macosx_15_0_arm64.whl.metadata (41 kB)
Collecting pinecone-plugin-assistant<4.0.0,>=3.0.1 (from pinecone)
  Using cached pinecone_plugin_assistant-3.0.1-py3-none-any.whl.metadata (30 kB)
Collecting pinecone-plugin-interface<0.1.0,>=0.0.7 (from pinecone)
  Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting typing-extensions>=3.7.4 (from pinecone)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting urllib3>=1.26.5 (from pinecone)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<4.0.0,>=3.0.1->pinecone)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3

### Step 2: Create (if needed) and connect to a Pinecone index

In [19]:
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Despite its name, this value is passed below as the serverless index *region*.
PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east-1")  # default to us-east-1

# Fail fast with an actionable message instead of a less obvious client error.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

# Create Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Index name
index_name = "youtube-chunks"

# Vector size of the index. It must equal the length of the embeddings stored
# in it (1536 is the default output size of OpenAI's text-embedding-3-small).
EMBEDDING_DIMENSION = 1536

# Create index if it doesn't exist, so re-running this cell is harmless.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region=PINECONE_ENV
        )
    )

# Connect to the index
index = pc.Index(index_name)

print(f"✅ Pinecone index '{index_name}' ready — region: {PINECONE_ENV}")


✅ Pinecone index 'youtube-chunks' ready — region: us-east-1


### Step 3: Embed the chunks from the JSON dataset with OpenAI embeddings, summarize each chunk, and upsert them into Pinecone

In [23]:
import os
import json
from time import sleep
import openai
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# ------------------------------
# Load environment variables
# ------------------------------
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Despite its name, this value is passed below as the serverless index *region*.
PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east-1")
# Chat model used to write the per-chunk summaries.
MODEL = "gpt-3.5-turbo"

# Fail fast when a credential is missing. Otherwise every chunk would fail
# later inside the upsert loop, where exceptions are only printed.
missing_keys = [
    name
    for name, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
    )
    if not value
]
if missing_keys:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_keys)}"
    )

# ------------------------------
# Initialize OpenAI client
# ------------------------------
# Configures the module-level client used by openai.embeddings / openai.chat.
openai.api_key = OPENAI_API_KEY

# ------------------------------
# Initialize Pinecone client
# ------------------------------
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "youtube-chunks"

# Vector size of the index. It must equal the length of the embeddings stored
# in it (1536 is the default output size of OpenAI's text-embedding-3-small).
EMBEDDING_DIMENSION = 1536

# Create the index only when it is absent, so re-running this cell is harmless.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=PINECONE_ENV)
    )

index = pc.Index(index_name)
print(f"✅ Pinecone index '{index_name}' ready — region: {PINECONE_ENV}")

# ------------------------------
# Load your RAG dataset
# ------------------------------
# The upsert loop reads "video_title", "url", "start_time", "end_time" and
# "text_chunk" from every item of this dataset.
dataset_path = "../output/rag_dataset.json"
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# ------------------------------
# Function: Summarize a text chunk
# ------------------------------
def summarize_chunk(text):
    """Return a 1-2 sentence summary of ``text`` written by the chat model.

    Sends one user message to ``openai.chat.completions.create`` using the
    module-level ``MODEL`` at temperature 0.1 and returns the first choice's
    content with surrounding whitespace removed.

    Args:
        text: The text chunk to summarize.

    Returns:
        The stripped summary, or ``""`` when ``text`` is empty or blank (no
        API call is made) or when the model returns no content.
    """
    # Nothing to summarize: skip the API call instead of storing a made-up summary.
    if not text or not text.strip():
        return ""

    prompt = f"Summarize the following text in 1-2 sentences, keeping key details:\n\n{text}"
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1
    )
    # `content` may be None (e.g. a refused or filtered completion); calling
    # .strip() on it would raise AttributeError.
    content = response.choices[0].message.content
    return content.strip() if content else ""

# ------------------------------
# Upsert chunks with precomputed summaries
# ------------------------------
# Indices of chunks that raised during embedding, summarization, or upsert,
# kept so the final report reflects what actually happened.
failed_chunks = []

for i, item in enumerate(dataset):
    try:
        # Generate embedding
        embedding_response = openai.embeddings.create(
            model="text-embedding-3-small",
            input=item["text_chunk"]
        )
        embedding = embedding_response.data[0].embedding

        # Precompute summary
        summary = summarize_chunk(item["text_chunk"])

        # Upsert into Pinecone with summary in metadata.
        # The vector id is the chunk's position in the dataset.
        index.upsert(
            vectors=[{
                "id": str(i),
                "values": embedding,
                "metadata": {
                    "video_title": item["video_title"],
                    "url": item["url"],
                    "start_time": item["start_time"],
                    "end_time": item["end_time"],
                    "text_chunk": item["text_chunk"],
                    "summary": summary
                }
            }]
        )

        # Report progress (and pause briefly) after every 10th chunk.
        if (i + 1) % 10 == 0:
            print(f"Upserted {i + 1}/{len(dataset)} chunks...")
            sleep(0.1)  # prevent rate limit

    except Exception as e:
        # Best-effort: one bad chunk must not abort the run, but record it so
        # the summary below does not claim a success that did not happen.
        failed_chunks.append(i)
        print(f"❌ Failed for chunk {i}: {e}")

if failed_chunks:
    print(
        f"⚠️ Upserted {len(dataset) - len(failed_chunks)}/{len(dataset)} chunks; "
        f"failed chunk indices: {failed_chunks}"
    )
else:
    print(f"🎉 All {len(dataset)} chunks upserted into Pinecone with summaries!")


✅ Pinecone index 'youtube-chunks' ready — region: us-east-1
Upserted 10/34 chunks...
Upserted 20/34 chunks...
Upserted 30/34 chunks...
🎉 All 34 chunks upserted into Pinecone with summaries!


# 🧪 TESTING:

In [21]:
from openai import OpenAI

# Build an OpenAI client from the API key in the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Sample question used to sanity-check retrieval.
query = "How do I create a smart contract for a blockchain game?"

# Embed the question with the same model name used when embedding the chunks.
query_response = client.embeddings.create(model="text-embedding-3-small", input=query)
query_embedding = query_response.data[0].embedding

# Fetch the three nearest chunks together with their stored metadata.
results = index.query(vector=query_embedding, top_k=3, include_metadata=True)

# Print title, time range and chunk text for each hit.
for match in results.matches:
    meta = match.metadata
    print(f"{meta['video_title']} ({meta['start_time']}-{meta['end_time']})")
    print(meta['text_chunk'])
    print("------")


How to Build a Blockchain Game Using ChatGPT! (00:03:12.239-00:04:41.270)
deploy the smart contracts to a local test blockchain uh which it has already uh helped us to download and walk it through with a step-by-step build the pawn game you guys already know it can do this it could design it and code it connect the game uh with the blockchain so this is going to require us to implement the front encode to interact with the ethereum blockchain this should include functions to authenticate users query the nfts deposit nfts into the wagering smart contract and withdraw the winning so once again it already knows everything it needs to do and we'll do this for us and then also we have uh I have to create a user interface for wallet interaction test the game to play the game and then finally promote the game but there you guys go and like literally if you remove some of these pointless ones like maybe promoting the game and learning the basics here uh this is a nine step process to create an