In [1]:
from dotenv import load_dotenv
load_dotenv()
import os 
import unicodedata
import re
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
import hashlib


  from tqdm.autonotebook import tqdm


In [2]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [3]:
import json
data = json.load(open("food.json"))
data['addresses']

[{'name': 'Ashoka Grill',
  'address': '1436 Flatbush Avenue',
  'town': 'Brooklyn',
  'state': 'NY',
  'region': 'Brooklyn',
  'typeOfFood': ['INDIAN'],
  'rating': 5.0},
 {'name': 'Lutheran Halal Café',
  'address': '5121 2nd Avenue',
  'town': 'Brooklyn',
  'state': 'NY',
  'region': 'Brooklyn',
  'typeOfFood': ['MIDDLE EASTERN'],
  'rating': 5.0},
 {'name': 'Istanbul Park',
  'address': '293 7th Avenue',
  'town': 'Brooklyn',
  'state': 'NY',
  'region': 'Brooklyn',
  'typeOfFood': ['MEDITERRANEAN', 'TURKISH'],
  'rating': 5.0},
 {'name': 'Kennedy Fried Chicken',
  'address': '1519 Fulton Street',
  'town': 'Brooklyn',
  'state': 'NY',
  'region': 'Brooklyn',
  'typeOfFood': ['AMERICAN'],
  'rating': 5.0},
 {'name': 'Sakman Deli & Grill',
  'address': '6821 4th Avenue',
  'town': 'Brooklyn',
  'state': 'NY',
  'region': 'Brooklyn',
  'typeOfFood': ['AMERICAN'],
  'rating': 5.0},
 {'name': 'Kashmir Grill',
  'address': '816 Coney Island Avenue',
  'town': 'Brooklyn',
  'state': 'NY'

In [4]:
processed_data = []
processed_ids = set()
client = OpenAI()

def create_unique_id(name, address):
    # Combine name and address, then create a hash
    combined = f"{name}|{address}"
    ascii_combined = unicodedata.normalize('NFKD', combined).encode('ASCII', 'ignore').decode()
    clean_combined = re.sub(r'\W+', '_', ascii_combined).strip('_').lower()
    
    # Create a short hash to append to the ID
    hash_object = hashlib.md5(clean_combined.encode())
    short_hash = hash_object.hexdigest()[:8]
    
    return f"{clean_combined}_{short_hash}"

# Process the data
for restaurant in tqdm(data['addresses'], desc="Processing restaurants"):
    unique_id = create_unique_id(restaurant["name"], restaurant["address"])
    
    # Check if the ID has already been processed
    if unique_id in processed_ids:
        continue  # Skip this restaurant if it's a duplicate

    restaurant_info = f"{restaurant['name']} is a {', '.join(restaurant['typeOfFood'])} restaurant located at {restaurant['address']}, {restaurant['town']}, {restaurant['state']}. It has a rating of {restaurant['rating']}."
    
    response = client.embeddings.create(
        input=restaurant_info,
        model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    
    processed_data.append({
        "id": unique_id,
        "values": embedding,
        "metadata": {
            "name": restaurant["name"],
            "address": restaurant["address"],
            "town": restaurant["town"],
            "state": restaurant["state"],
            "region": restaurant["region"],
            "typeOfFood": restaurant["typeOfFood"],
            "rating": restaurant["rating"]
        }
    })
    
    # Add the ID to the set of processed IDs
    processed_ids.add(unique_id)

Processing restaurants: 100%|██████████| 301/301 [01:24<00:00,  3.57it/s]


In [5]:
processed_data[0]

{'id': 'ashoka_grill_1436_flatbush_avenue_ace9e692',
 'values': [-0.024914304,
  -0.021084607,
  -0.034552373,
  0.009047658,
  -0.009228505,
  0.0035903405,
  0.012276305,
  -0.04633933,
  0.015467719,
  -0.044977657,
  0.030658849,
  -0.05599867,
  0.036297012,
  -0.020861208,
  0.04523297,
  0.036382116,
  -0.0052817897,
  -0.025212169,
  -0.015616652,
  0.004741909,
  -0.0014135303,
  -0.0044520223,
  0.043360677,
  0.016191106,
  0.025127064,
  0.027361054,
  -0.026658943,
  -0.015116664,
  0.012169925,
  -0.01769107,
  -0.022637762,
  -0.0054174247,
  -0.0028642938,
  -0.0035424693,
  -0.009302971,
  -0.026361078,
  0.013797546,
  -0.057019927,
  0.058339044,
  -0.02389305,
  0.023063283,
  -0.0013842757,
  0.017988935,
  0.0545519,
  0.008467885,
  -0.024510058,
  -0.023935603,
  0.040062882,
  -0.008031725,
  0.0010844158,
  -0.011244414,
  0.014669865,
  -0.017286824,
  -0.04016926,
  0.00097205146,
  0.007888111,
  0.034020472,
  0.013680527,
  0.056211434,
  0.03272263,
  -0

In [6]:
index = pc.Index("rag")
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 250}

In [9]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 250}},
 'total_vector_count': 250}