In [55]:
import os
from dotenv import load_dotenv
load_dotenv()

True

### Pinecone Documentation [DOCS](https://docs.pinecone.io/home)
### Get started [Quickstart](https://docs.pinecone.io/guides/get-started/quickstart)

## Establish a connection to MongoDB

In [56]:
from pymongo import MongoClient

# Connect to MongoDB. `os.environ[...]` is used instead of `os.getenv(...)` so a
# missing CONNECTION_STRING fails immediately with a KeyError; passing None to
# MongoClient would otherwise fall back to its default host without any warning.
client = MongoClient(os.environ['CONNECTION_STRING'])

# Database and collection that hold the detected violations.
db = client['DeepLearningCluster']
collection = db['Violations_detected']


## Retrieve all documents from the `Violations_detected` collection

In [57]:
# Materialize the query result into a list right away. `find()` returns a lazy,
# single-pass cursor: once a later cell has iterated it, re-running that cell
# would see no documents. A list can be iterated any number of times.
documents = list(collection.find())

In [58]:
# Keep the fetched documents in a plain list so they can be indexed and reused.
inputs = list(documents)

# One English sentence per violation, keyed by its 1-based position in `inputs`.
data = []
for position, doc in enumerate(inputs, start=1):
    text = f"A {doc['vehicle_type']} was detected {doc['violation_type']} in {doc['street_name']} road at {doc['time']} on {doc['date']} at these coordinates: {doc['latitude']}, {doc['longitude']}"
    data.append({'id': str(position), 'text': text})


## Initialize a Pinecone client

In [60]:
from pinecone import Pinecone, ServerlessSpec

# Build the Pinecone client. The API key is read from the `pinecone_API_KEY`
# environment variable (note the mixed-case name).
pc = Pinecone(api_key=os.getenv('pinecone_API_KEY'))

## Create a new serverless index

In [61]:
# Create the serverless index on first use; later runs reuse the existing one.
index_name = "violation-data3"

index_exists = pc.has_index(index_name)
if not index_exists:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        # presumably the output size of the embedding model used by embed() -
        # confirm if the model is ever changed
        dimension=3072,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by the later cells to upsert and query vectors.
index = pc.Index(index_name)


In [62]:
# Spot-check one generated sentence (index 136 is the 137th record; this raises
# IndexError if fewer records were loaded).
data[136]

{'id': '137',
 'text': 'A car was detected overtaking in الحارث بن الحباب road at 14:09:17 on 2024-09-21 at these coordinates: 24.6877, 46.7219'}

## Create vector embeddings

In [64]:
import openai
# Configure the OpenAI client with the key from the OPENAI_API_KEY environment variable.
openai.api_key = os.getenv('OPENAI_API_KEY') 


def embed(docs: list[str], batch_size: int = 2048) -> list[list[float]]:
    """Return one embedding vector per input text, in input order.

    The texts are sent to the OpenAI embeddings endpoint in chunks of at most
    ``batch_size`` items, because the endpoint limits how many inputs a single
    request may carry. With the default, any call of up to 2048 texts is still
    a single request.

    Args:
        docs: Texts to embed. An empty list returns ``[]`` without calling the
            API (the endpoint rejects empty input).
        batch_size: Maximum number of texts per request; must be >= 1.

    Returns:
        A list with one embedding (a list of floats) per entry of ``docs``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    docs = list(docs)
    doc_embeds: list[list[float]] = []
    for start in range(0, len(docs), batch_size):
        res = openai.embeddings.create(
            input=docs[start:start + batch_size],
            model="text-embedding-3-large"
        )
        # Collect this chunk's vectors; chunks are processed in input order.
        doc_embeds.extend(r.embedding for r in res.data)
    return doc_embeds


# Embed every generated sentence; doc_embeds[k] corresponds to data[k].
texts = [record["text"] for record in data]
doc_embeds = embed(texts)

## Upsert the data

In [67]:
# Peek at the first raw MongoDB document to see which fields are available.
inputs[0]

{'_id': ObjectId('66eeb404c466dc07daa4dc96'),
 'date': '2024-09-21',
 'time': '14:02:44',
 'license_plate_number': nan,
 'vehicle_type': 'car',
 'violation_type': 'overtaking',
 'longitude': 46.7219,
 'latitude': 24.6877,
 'street_name': 'الحارث بن الحباب'}

In [69]:
# Every document needs exactly one embedding; zip() would silently drop the
# unmatched tail if the two lists ever got out of step.
if len(inputs) != len(doc_embeds):
    raise ValueError(
        f"expected one embedding per document, got {len(inputs)} documents "
        f"and {len(doc_embeds)} embeddings"
    )

# Fields copied from each MongoDB document into the vector's metadata.
metadata_fields = (
    'vehicle_type', 'violation_type', 'street_name',
    'time', 'date', 'latitude', 'longitude',
)

# One record per document: the MongoDB _id (as a string) is the vector id, so
# re-running this cell overwrites the same records instead of duplicating them.
vectors = [
    {
        "id": str(d['_id']),
        "values": e,
        "metadata": {field: d[field] for field in metadata_fields},
    }
    for d, e in zip(inputs, doc_embeds)
]

# batch_size makes the client upload in several smaller requests, which keeps
# each request under Pinecone's per-request size limit for large vectors.
index.upsert(
    vectors=vectors,
    namespace="violation-data-day1",
    batch_size=100,
)


{'upserted_count': 139}

## Check index status

In [175]:
# Print the index statistics (dimension, per-namespace vector counts, total).
# NOTE(review): the saved output below lists namespace 'ns1' with 140 vectors,
# while the upsert cell writes 139 vectors to "violation-data-day1" - the saved
# output looks stale; re-run this cell to confirm.
print(index.describe_index_stats())

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 140}},
 'total_vector_count': 140}


### Query

In [None]:
# The question is Arabic for "How many violations were detected today?".
# NOTE(review): a top-3 similarity search returns the nearest records, not a
# count - confirm this is the intended way to answer a "how many" question.
query = "كم عدد المخالفات التي تم رصدها اليوم؟"

# embed() takes a list, so wrap the single query; x[0] is its vector.
x = embed([query])

query_options = {
    "namespace": "violation-data-day1",
    "vector": x[0],
    "top_k": 3,
    "include_values": False,
    "include_metadata": True,
}
results = index.query(**query_options)

print(results)