In [7]:
pip install pinecone==6.0.2

Collecting pinecone==6.0.2
  Using cached pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone==6.0.2)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Using cached pinecone-6.0.2-py3-none-any.whl (421 kB)
Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone

   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------

In [8]:
import os
from  pinecone import Pinecone

# The Pinecone API key is read from the environment (create one at
# app.pinecone.io); the value is None when PINECONE_API_KEY is not set.
api_key = os.getenv("PINECONE_API_KEY")




  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Pinecone client built from the API key read from the environment; the
# index-management calls in the following cells go through this object.
pc = Pinecone(api_key=api_key)

In [None]:
# Name of the Pinecone index that the following cells create and describe.
index_name = "swagger-docs"

In [12]:
from pinecone import ServerlessSpec, CloudProvider, AwsRegion, Metric

# Creating an index under a name that is already taken is rejected by
# Pinecone, so only create it when it is absent. This keeps the cell safe to
# re-run (e.g. after a kernel restart).
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        metric=Metric.COSINE,
        # Must equal the length of the vectors upserted later; the
        # all-MiniLM-L6-v2 embeddings used in this notebook have 384 entries.
        dimension=384,
        spec=ServerlessSpec(cloud=CloudProvider.AWS, region=AwsRegion.US_EAST_1),
    )

# Show the index description whether it was just created or already existed.
pc.describe_index(name=index_name)

{
    "name": "hello-pinecone",
    "metric": "cosine",
    "host": "hello-pinecone-r0zzlv1.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [13]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_text_splitters import RecursiveJsonSplitter
import json
from langchain.schema import Document
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Import Pinecone
import pinecone

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Never echo the secret itself: cell outputs are saved with the notebook and
# end up in version control. Only report whether a key was found.
print(f"Pinecone API Key: {'<redacted>' if PINECONE_API_KEY else '<missing>'}")

# Load and process JSON (explicit encoding so the result does not depend on
# the platform's default locale)
with open('./swagger.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Split the raw JSON into chunks of at most ~300 characters each
splitter = RecursiveJsonSplitter(max_chunk_size=300)
json_chunks = splitter.split_json(json_data=json_data)

# Preview a few chunks; slicing copes with fewer than three chunks
print("First 3 chunks:")
for i, chunk in enumerate(json_chunks[:3]):
    print(f"Chunk {i}: {chunk}")
    print("+++++++")

# Create documents: one Document per chunk, with the chunk serialised back to
# JSON text and its position / top-level keys kept as metadata
documents = [
    Document(
        page_content=json.dumps(chunk, indent=2),
        metadata={
            "chunk_id": i,
            "source": "./swagger.json",
            "keys": list(chunk.keys()) if isinstance(chunk, dict) else []
        }
    )
    for i, chunk in enumerate(json_chunks)
]

print(f"Created {len(documents)} documents")

# Initialize embeddings (CPU inference, normalised output vectors)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)
print("Embeddings model loaded")


Pinecone API Key: <redacted>
First 3 chunks:
Chunk 0: {'openapi': '3.0.3', 'info': {'title': 'Aviation Industry - Fake API Collection', 'version': '1.0.0', 'description': 'A large collection (100 endpoints) of fake aviation-related APIs with example responses for testing and demo purposes.'}}
+++++++
Chunk 1: {'servers': [{'url': 'https://api.fake-aviation.example.com', 'description': 'Demo server'}], 'paths': {'/api/v1/aircraft/list': {'get': {'summary': 'List aircraft'}}}}
+++++++
Chunk 2: {'paths': {'/api/v1/aircraft/list': {'get': {'responses': {'200': {'description': 'Array of aircraft', 'content': {'application/json': {'schema': {'type': 'array', 'items': {'$ref': '#/components/schemas/Aircraft'}}, 'examples': {'list': {'$ref': '#/components/examples/AircraftListExample'}}}}}}}}}}
+++++++
Created 185 documents
Embeddings model loaded


In [14]:
# Fetch the description of the index; a later cell reads `description.host`
# to open a connection to the index.
description = pc.describe_index(name=index_name)
# Bare last expression so the notebook displays the description.
description

{
    "name": "hello-pinecone",
    "metric": "cosine",
    "host": "hello-pinecone-r0zzlv1.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [19]:

# Import from langchain_huggingface (already used earlier in this notebook)
# rather than the deprecated `langchain.embeddings` path, which emits a
# deprecation warning when the class is instantiated.
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone
import os

# 1️⃣ Load your text chunks
chunks = ["This is first chunk", "Second chunk here", "Third one..."]

# 2️⃣ Initialize the embeddings model
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 3️⃣ Generate embeddings (each is a list of floats)
# NOTE: this rebinds the notebook-level name `embeddings`, which an earlier
# cell assigned to the embedding model object.
embeddings = embed_model.embed_documents(chunks)

print(len(embeddings))          # should be 3
print(len(embeddings[0]))       # embedding dimension, e.g., 384

# 4️⃣ Connect to Pinecone
api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)
# `description` comes from the earlier describe_index cell.
index = pc.Index(host=description.host)

# 5️⃣ Format data for upsert: pair each chunk with its vector directly instead
# of indexing back into `chunks`
formatted = [
    {"id": f"chunk-{i}", "values": emb, "metadata": {"text": text}}
    for i, (text, emb) in enumerate(zip(chunks, embeddings))
]

# 6️⃣ Upsert to Pinecone
index.upsert(vectors=formatted)
print("✅ Embeddings successfully inserted!")


  embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


3
384
✅ Embeddings successfully inserted!
