In [22]:
from dotenv import load_dotenv
import os
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
# Load variables from a local .env file into the process environment so the
# API key lookups further down (e.g. os.getenv("PINECONE_API_KEY")) can find
# them. The value echoed under this cell is load_dotenv()'s return value.
load_dotenv()

True

In [7]:
# Connect to Pinecone and make sure the "hunter-clubs" index exists.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# create_index fails when the index is already present, so guard the call to
# keep this cell safe to re-run (e.g. on Restart Kernel -> Run All).
if "hunter-clubs" not in pc.list_indexes().names():
    pc.create_index(
        name="hunter-clubs",
        # Must equal the length of the vectors upserted into this index;
        # the notebook embeds with text-embedding-3-small.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [19]:
import json

# Read the mock club catalogue. The context manager closes the file handle,
# which a bare open() passed straight to json.load() would leave dangling.
# The encoding is pinned so the read does not depend on the platform default.
with open("mockData.json", encoding="utf-8") as mock_file:
    data = json.load(mock_file)

# Echo the club records as the cell output for a quick visual check.
data["clubs"]

[{'name': 'Tech Innovators Club',
  'description': 'A community for students passionate about technology and innovation. We work on tech projects, attend hackathons, and host tech talks.',
  'club_type': 'Technology',
  'media': {'Instagram': 'https://www.instagram.com/techinnovators',
   'Website': 'https://www.techinnovatorsclub.com'}},
 {'name': 'Eco Warriors',
  'description': 'Dedicated to promoting environmental sustainability on campus. We organize clean-up drives, tree-planting events, and workshops on eco-friendly practices.',
  'club_type': 'Environment',
  'media': {'Instagram': 'https://www.instagram.com/ecowarriors',
   'Facebook': 'https://www.facebook.com/ecowarriorsclub'}},
 {'name': 'Creative Writing Society',
  'description': 'A club for aspiring writers to share their work, receive feedback, and participate in writing challenges. We also publish an annual literary magazine.',
  'club_type': 'Literature',
  'media': {'Instagram': 'https://www.instagram.com/creativewri

In [29]:
# Embed every club description and shape the results into the record format
# passed to index.upsert(): {"id", "values", "metadata"}.
openai = OpenAI()
processed_data = []

clubs = data["clubs"]

# Media links are stored in this fixed order, skipping any a club lacks.
MEDIA_KEYS = ("Instagram", "Facebook", "Website")

# Nothing to embed means no request at all; processed_data stays empty.
if clubs:
    # One batched request for all descriptions instead of one round-trip per club.
    embedding_response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=[club["description"] for club in clubs],
    )

    # Each returned item carries the position of its input; order by it so the
    # vectors line up with `clubs` regardless of how the response is ordered.
    ordered_items = sorted(embedding_response.data, key=lambda item: item.index)
    if len(ordered_items) != len(clubs):
        raise RuntimeError(
            f"expected {len(clubs)} embeddings, got {len(ordered_items)}"
        )

    for club, item in zip(clubs, ordered_items):
        embedding = item.embedding
        media = club["media"]
        media_list = [media[key] for key in MEDIA_KEYS if key in media]

        processed_data.append({
            "values": embedding,
            "id": club["name"],  # the club name doubles as the vector id
            "metadata": {
                "club_type": club["club_type"],
                "description": club["description"],
                "media": media_list
            }
        })

In [30]:
# Spot-check the first prepared record. Note that this echoes the record's
# entire "values" embedding list as the cell output.
processed_data[0]

{'values': [-0.012553092,
  -0.014054958,
  -0.020024873,
  0.032815762,
  0.017571826,
  -0.031088615,
  -0.0016833409,
  0.046858203,
  -0.021251395,
  -0.0113766305,
  -0.01834779,
  0.028760724,
  -0.0546679,
  -0.025318949,
  -0.020275183,
  0.008685788,
  -0.019223878,
  -0.037696823,
  0.0072026965,
  0.029286377,
  -0.008216456,
  -0.018585585,
  0.045080993,
  0.0032164953,
  0.015106264,
  -0.0006699728,
  -0.030813273,
  0.05371672,
  0.026057366,
  0.0023091182,
  0.058022067,
  -0.01581965,
  -0.022790808,
  0.013366602,
  0.027709417,
  0.0052596577,
  0.0006332084,
  0.014017411,
  0.012703279,
  0.015606885,
  -0.03171439,
  -0.024930967,
  0.021526739,
  0.010951103,
  0.013979864,
  -0.0502374,
  -0.013291509,
  -0.009111317,
  0.02983706,
  0.047233667,
  -0.042953353,
  -0.06392941,
  0.056370016,
  0.019674437,
  0.016132537,
  -0.014017411,
  0.013917287,
  0.012747083,
  -0.024680655,
  0.0106006665,
  0.018560553,
  0.008498055,
  0.045131058,
  0.011276507,
  0

In [33]:
# Open a handle to the "hunter-clubs" index, then write every prepared record
# into namespace "ns1". The upsert response is the cell's displayed output.
index = pc.Index("hunter-clubs")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [35]:
# Display the index statistics to confirm the upsert landed; the output lists
# the index dimension and the vector count per namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}