In [1]:
!pip install sentence-transformers pinecone-client pandas

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
from sentence_transformers import SentenceTransformer

# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

2024-11-28 10:12:58.981082: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 10:12:59.003392: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-28 10:12:59.003411: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-28 10:12:59.004177: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 10:12:59.008731: I tensorflow/core/platform/cpu_feature_guar

In [25]:
def create_pinecone_index(api_key, environment, csv_file, index_name, embedding_model):
    """
    Create and populate a Pinecone index from a given CSV file, with a progress bar.

    Args:
    - api_key (str): Pinecone API key.
    - environment (str): Pinecone environment.
    - csv_file (str): Path to the CSV file containing video metadata.
    - index_name (str): Name of the Pinecone index to create or update.
    - embedding_model (SentenceTransformer): Preloaded SentenceTransformer model.

    Returns:
    - None
    """
    from pinecone import Pinecone, ServerlessSpec, PineconeApiException
    import pandas as pd
    from datetime import datetime
    from tqdm import tqdm  # Import tqdm for the progress bar

    # Initialize Pinecone
    pc = Pinecone(api_key=api_key, environment=environment)

    try:
        # Check if the index exists, and create it if not
        if index_name not in pc.list_indexes():
            print(f"Index '{index_name}' does not exist. Creating a new index...")
            # Create the index with required specs (dimension and metric)
            spec = ServerlessSpec(cloud="aws", region=environment)  # Use the supported region
            pc.create_index(index_name, dimension=384, metric="cosine", spec=spec)
        else:
            print(f"Index '{index_name}' already exists. Skipping creation and updating the index.")
    
    except PineconeApiException as e:
        # Handle the "ALREADY_EXISTS" exception gracefully
        if "ALREADY_EXISTS" in str(e):
            print(f"Index '{index_name}' already exists. Skipping creation and continuing to populate data.")
        else:
            raise  # Re-raise other exceptions if not related to index existence

    index = pc.Index(index_name)

    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Preprocess the "Watched At" column to extract dates
    def preprocess_date(row):
        try:
            return datetime.strptime(row["Watched At"], "%b %d, %Y").date()
        except Exception:
            return None

    df["Watched Date"] = df.apply(preprocess_date, axis=1)

    # Handle NaN values in DataFrame
    df.fillna("", inplace=True)

    # Populate Pinecone index with a progress bar
    print(f"Populating Pinecone index '{index_name}'...")
    for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing documents", unit="doc"):
        # Prepare the input string for embedding
        video_data = f"Title: {row['Title']}, Category: {row['Category']}, Watched at: {row['Watched At']}"
        vector = embedding_model.encode(video_data)
        
        # Upsert data into the Pinecone index
        index.upsert([(row['Video ID'], vector, {
            "Title": row["Title"],
            "Category": row["Category"],
            "Watched At": row["Watched At"],
            "Watched Date": str(row["Watched Date"]),
            "Video Link": row["Video Link"]
        })])

    print(f"Pinecone index '{index_name}' has been created and populated successfully.: {csv_file}")

In [49]:
# Define Pinecone credentials and parameters
api_key = "85e39b43-9316-4d8b-b684-eb46542c34ef"
environment = "us-east-1"
csv_file = "/app/yt_watch_history_gmail_7000.csv"
index_name = "youtube-data-index"

# Call the function to create and populate the index
create_pinecone_index(api_key, environment, csv_file, index_name, embedding_model)

Index 'youtube-data-index' does not exist. Creating a new index...
Index 'youtube-data-index' already exists. Skipping creation and continuing to populate data.
Populating Pinecone index 'youtube-data-index'...


Processing documents: 100%|██████████| 977/977 [02:15<00:00,  7.19doc/s]

Pinecone index 'youtube-data-index' has been created and populated successfully.: /app/yt_watch_history_gmail_7000.csv



