In [None]:
import json
import os
import time

from dotenv import load_dotenv
load_dotenv()  # load .env before the third-party clients below are imported

import openai
import pinecone
import requests
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Initialize the Pinecone client. The key is read from the environment
# (populated by load_dotenv() in the import cell) instead of being hardcoded
# in the notebook.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "rag"

# Create the index only when it does not exist yet, so this cell survives
# Restart & Run All against an account where the index was already created.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Index creation is asynchronous: poll until it reports ready rather than
# sleeping for a fixed second and hoping it is done.
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)


In [None]:
# Load the review dataset; the context manager closes the file handle, and
# the explicit encoding keeps the read independent of the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Preview only the first few reviews instead of dumping the whole list.
data["reviews"][:5]

In [None]:
# Hugging Face Inference API settings for the sentence-embedding model.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
base_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_name}"

# The token has to be interpolated into the header. Previously the call sat
# inside the f-string without braces, so the literal text
# "os.getenv('HUGGINGFACE_API_KEY')" was sent as the bearer token.
hf_api_key = os.getenv("HUGGINGFACE_API_KEY")
# Without a configured token no Authorization header is sent at all, rather
# than a bogus one. NOTE(review): confirm whether unauthenticated requests
# are acceptable for this endpoint.
headers = {"Authorization": f"Bearer {hf_api_key}"} if hf_api_key else {}

embed_model = "text-embedding-3-small"  # not referenced by the cells visible here
processed_data = []  # records to upsert, filled by the loop below
def embed(texts):
    """Request embeddings for ``texts`` from the Hugging Face Inference API.

    Args:
        texts: The value sent as the ``inputs`` field of the request body
            (the caller in this notebook passes a single review string).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so an
            error payload is never mistaken for an embedding.
    """
    response = requests.post(
        url=base_url,
        headers=headers,
        # wait_for_model asks the API to hold the request until the model is loaded.
        json={"inputs": texts, "options": {"wait_for_model": True}},
    )
    # Fail loudly here instead of storing an error payload as vector values.
    response.raise_for_status()
    return response.json()

# Turn every review into a Pinecone record (embedding + id + metadata) and
# append it to processed_data in the order the reviews appear.
# NOTE(review): the professor name is used as the record id — confirm names
# are unique across reviews, otherwise records would share an id.
processed_data.extend(
    {
        "values": embed(entry["review"]),
        "id": entry["professor"],
        "metadata": {
            "review": entry["review"],
            "subject": entry["subject"],
            "stars": entry["stars"],
        },
    }
    for entry in data["reviews"]
)

# Show the first record as the cell output.
processed_data[0]

In [None]:
# Connect to the index created earlier in the notebook.
index = pc.Index("rag")

# Insert the embeddings into the Pinecone index. The vector dimension is a
# property of the index fixed at creation time; `dimension` is not a
# documented argument of upsert, so it is no longer passed here.
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

# Print index statistics
print(index.describe_index_stats())
