# Pinecone Setup

In [36]:
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os
import config
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

In [2]:
# Build the Pinecone client from the credentials stored in the local `config` module.
pc = Pinecone(
    api_key=config.PINECONE_API_KEY,
    environment=config.PINECONE_ENV,
)

In [9]:
# Display the indexes that currently exist for this client's project.
pc.list_indexes()

[
    {
        "name": "sample-db",
        "metric": "cosine",
        "host": "sample-db-pw4wyt0.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 3,
        "deletion_protection": "disabled",
        "tags": null
    }
]

In [19]:
# Name of the index to reset. It is assigned here as well so this cell also
# works on a fresh kernel, where the "Creating an Index" cell has not run yet.
index_name = "sample-db"

# Collect the names of the indexes that exist right now, then delete the
# target only if it is among them.
existing_index_names = [existing.name for existing in pc.list_indexes()]

if index_name in existing_index_names:
    pc.delete_index(index_name)
    print(f"{index_name} successfully deleted.")
else:
    print(f"{index_name} not in index list.")

sample-db successfully deleted.


In [12]:
# List the indexes again to confirm the result of the delete step.
pc.list_indexes()

[]

## Creating an Index

In [17]:
# Settings for the small demo index: its name, vector dimension and similarity metric.
index_name, dimension, metric = "sample-db", 3, "cosine"

In [20]:
# Create the demo index only when it is missing, so this cell can be re-run
# without failing on an index that already exists. An existing index with this
# name is left untouched, even if its dimension or metric differs from the
# values passed here.
existing_index_names = [existing.name for existing in pc.list_indexes()]

if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        # Serverless deployment on AWS in us-east-1.
        spec={
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        }
    )

# Show the index description as the cell output, whether or not it was just created.
pc.describe_index(index_name)

{
    "name": "sample-db",
    "metric": "cosine",
    "host": "sample-db-pw4wyt0.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 3,
    "deletion_protection": "disabled",
    "tags": null
}

In [26]:
# Settings for the second demo index: same metric, but 1536-dimensional vectors.
index_name_2, dimension_2, metric_2 = "sample-db-2", 1536, "cosine"

In [27]:
# Create the second demo index only when it is missing, so this cell can be
# re-run without failing on an index that already exists. An existing index
# with this name is left untouched, even if its dimension or metric differs
# from the values passed here.
existing_index_names = [existing.name for existing in pc.list_indexes()]

if index_name_2 not in existing_index_names:
    pc.create_index(
        name=index_name_2,
        dimension=dimension_2,
        metric=metric_2,
        # Serverless deployment on AWS in us-east-1.
        spec={
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        }
    )

# Show the index description as the cell output, whether or not it was just created.
pc.describe_index(index_name_2)

{
    "name": "sample-db-2",
    "metric": "cosine",
    "host": "sample-db-2-pw4wyt0.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}

In [28]:
# List all indexes; both demo indexes should now appear.
pc.list_indexes()

[
    {
        "name": "sample-db",
        "metric": "cosine",
        "host": "sample-db-pw4wyt0.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 3,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "sample-db-2",
        "metric": "cosine",
        "host": "sample-db-2-pw4wyt0.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1536,
        "deletion_protection": "disabled",
        "tags": null
    }
]

## Upserting data to Pinecone DB

In [29]:
# Open a handle to the demo index named by `index_name`.
index = pc.Index(name=index_name)

# Toy three-component embeddings, keyed by the record id they are stored under.
toy_embeddings = {
    "dog": [0.4, 0.2, 0.9],
    "cat": [0.3, 0.1, 0.8],
    "car": [0.9, 0.8, 0.1],
    "apple": [0.1, 0.9, 0.2],
}

# Pinecone receives the records as (id, values) tuples.
vectors = list(toy_embeddings.items())

index.upsert(vectors)

{'upserted_count': 4}

In [41]:
# Open the "sample-10BT" configuration of FineWeb (train split) in streaming
# mode, so rows are fetched while iterating instead of being downloaded up front.
fw = load_dataset(
    "HuggingFaceFW/fineweb",
    name="sample-10BT",
    split="train",
    streaming=True
)

Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

| Part                          | Meaning                                                                                                          |
| ----------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
| **`load_dataset`**            | A function that loads data from Hugging Face.                                                                           |
| **`"HuggingFaceFW/fineweb"`** | The name of the dataset you want (called *FineWeb*, made by *HuggingFaceFW*).                                           |
| **`name="sample-10BT"`**      | Which configuration (subset) of the dataset to load — here, the *10 billion token sample* (a smaller part of the huge FineWeb dataset). |
| **`split="train"`**           | Which section of the data — here, the *training* part.                                                                  |
| **`streaming=True`**          | Important! It doesn’t download the whole dataset; it loads small pieces as you use it (so it saves RAM and disk space). |
| **`fw = ...`**                | You’re saving the loaded dataset in a variable called `fw`.                                                             |

---

#### In plain English:

This line means:

> “Load the FineWeb 10-billion-token training dataset from Hugging Face, but don’t download it all — just stream it piece by piece as I need it.”

---

In [38]:
# Display the streamed dataset object (its feature names and shard count).
fw

IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'],
    num_shards: 15
})

In [39]:
# Display the declared type of every column in the dataset.
fw.features

{'text': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'dump': Value(dtype='string', id=None),
 'url': Value(dtype='string', id=None),
 'date': Value(dtype='string', id=None),
 'file_path': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'language_score': Value(dtype='float64', id=None),
 'token_count': Value(dtype='int64', id=None)}

In [42]:
# Sentence-embedding model used to turn each document's text into a vector.
# Its embedding dimension sizes the "text" index created in a later cell.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [43]:
# Name of the index that stores the document embeddings.
text_index_name = "text"

# The index dimension must equal the length of the vectors the model produces.
embedding_dimension = model.get_sentence_embedding_dimension()

# Create the index only when it is missing, so this cell can be re-run without
# failing on an index that already exists. An existing index with this name is
# left untouched, even if its dimension or metric differs from the values
# passed here.
existing_index_names = [existing.name for existing in pc.list_indexes()]

if text_index_name not in existing_index_names:
    pc.create_index(
        name=text_index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Show the index description as the cell output, whether or not it was just created.
pc.describe_index(text_index_name)

{
    "name": "text",
    "metric": "cosine",
    "host": "text-pw4wyt0.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [44]:
# Rebind `index` to the "text" index; the upserts in the following cell go to
# it, not to the handle opened earlier for `index_name`.
index = pc.Index(name = "text")

In [45]:
# Define the number of items you want to process (subset size)
subset_size = 10000  # Only the first 10,000 streamed items are embedded and uploaded

# Number of vectors sent to Pinecone per upsert request. The same chunk of
# texts is also handed to the embedding model in a single encode() call.
batch_size = 1000


def _embed_and_upsert(items):
    """Embed one chunk of dataset items and upsert it to the Pinecone index.

    Args:
        items: List of dataset rows; each row's 'text', 'id' and 'language'
            fields are read.

    Returns:
        The list of (id, vector, metadata) tuples that was upserted, or an
        empty list when `items` is empty (no request is sent in that case).
    """
    if not items:
        return []

    # Encode every text of the chunk in one call instead of one call per text
    embeddings = model.encode(
        [item['text'] for item in items],
        show_progress_bar=False
    ).tolist()

    # Pair each row with its embedding: the id is converted to a string and
    # the language is kept as metadata (can help for filtering later)
    batch = [
        (str(item['id']), embedding, {'language': item['language']})
        for item, embedding in zip(items, embeddings)
    ]

    # Send the batch to Pinecone (upsert = insert or update)
    index.upsert(vectors=batch)
    return batch


# Every (id, vector, metadata) tuple that has been uploaded so far
vectors_to_upsert = []

# Rows collected from the stream that have not been embedded yet
pending_items = []

# Go through the dataset (fw) one item at a time
for item_position, item in enumerate(fw):
    # Stop after reaching the subset size
    if item_position >= subset_size:
        break

    pending_items.append(item)

    # A full chunk is embedded and uploaded right away, so at most
    # `batch_size` raw rows are waiting at any time
    if len(pending_items) == batch_size:
        vectors_to_upsert.extend(_embed_and_upsert(pending_items))
        pending_items = []

# Upload whatever is left when the subset size is not a multiple of batch_size
vectors_to_upsert.extend(_embed_and_upsert(pending_items))

print("Subset of data upserted to Pinecone index.")

Subset of data upserted to Pinecone index.
