In [168]:
import os
from dotenv import load_dotenv
load_dotenv()

True

### Pinecone Documentation [DOCS](https://docs.pinecone.io/home)
### Get started [Quickstart](https://docs.pinecone.io/guides/get-started/quickstart)

## Establish a connection to MongoDB

In [169]:
from pymongo import MongoClient

client = MongoClient(os.getenv('CONNECTION_STRING'))

db = client['DeepLearningCluster']  
collection = db['Violations_detected']  


## Retrieve all documents from Violations_detected collection

In [170]:
documents = collection.find()

inputs = [d for d in documents]

data = [{'id': '0', 'text': ' '}]
for i in range(len(inputs)):
    text = f"A {inputs[i]['vehicle_type']} was detected {inputs[i]['violation_type']} in {inputs[i]['street_name']} road at {inputs[i]['time']} on {inputs[i]['date']} at these coordinates: {inputs[i]['latitude']}, {inputs[i]['longitude']}"
    data.append({'id': str(i+1), 'text': text})


## initialize a Pinecone client

In [171]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.getenv('pinecone_API_KEY'))

## create new serverless index

In [146]:
# Create Index
index_name = "violation-data2"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

index = pc.Index(index_name)


In [172]:
data[1]['text']

'A car was detected overtaking in الحارث بن الحباب road at 14:02:44 on 2024-09-21 at these coordinates: 24.6877, 46.7219'

## create vector embedings

In [173]:
import openai
openai.api_key = os.getenv('OPENAI_API_KEY') 


def embed(docs: list[str]) -> list[list[float]]:
    res = openai.embeddings.create(
        input=docs,
        model="text-embedding-3-large"
    )
    doc_embeds = [r.embedding for r in res.data] 
    return doc_embeds 


doc_embeds = embed([d["text"] for d in data])

## upsert the data

In [174]:
vectors = []
for d, e in zip(data, doc_embeds):
    vectors.append({
        "id": d['id'],
        "values": e,
        "metadata": {'text': d['text']}
    })

index.upsert(
    vectors=vectors,
    namespace="violation-data-day1"
)


{'upserted_count': 140}

## check index status

In [175]:
print(index.describe_index_stats())

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 140}},
 'total_vector_count': 140}


### Query

In [176]:
query = "كم عدد المخالفات التي تم رصدها اليوم؟"

x = embed([query])

results = index.query(
    namespace="ns1",
    vector=x[0],
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': '128',
              'metadata': {'text': 'A car was detected overtaking in الحارث بن '
                                   'الحباب road at 14:08:41 on 2024-09-21 at '
                                   'these coordinates: 24.6877, 46.7219'},
              'score': 0.344149321,
              'values': []},
             {'id': '108',
              'metadata': {'text': 'A car was detected overtaking in الحارث بن '
                                   'الحباب road at 14:07:20 on 2024-09-21 at '
                                   'these coordinates: 24.6877, 46.7219'},
              'score': 0.344048321,
              'values': []},
             {'id': '112',
              'metadata': {'text': 'A car was detected overtaking in الحارث بن '
                                   'الحباب road at 14:07:40 on 2024-09-21 at '
                                   'these coordinates: 24.6877, 46.7219'},
              'score': 0.343803793,
              'values': []}],
 'namespace': 'ns1