In [12]:
import os
from dotenv import load_dotenv
load_dotenv()

True

### Pinecone Documentation [DOCS](https://docs.pinecone.io/home)
### Get started [Quickstart](https://docs.pinecone.io/guides/get-started/quickstart)

## Establish a connection to MongoDB

In [55]:
from pymongo import MongoClient

client = MongoClient(os.getenv('CONNECTION_STRING'))

db = client['DeepLearningCluster']  
collection = db['Violations-1']


## Retrieve all documents from Violations_detected collection

In [56]:
documents = collection.find()

In [57]:
inputs = list(documents)

In [76]:
str(inputs[0]['_id'])

str

In [79]:
data = []
for input in inputs:
    text = f"A {input['vehicle_type']} was detected with this license plate number {input['license_plate_number']}{input['violation_type']} in {input['street_name']} road at {input['time']} on {input['date']} at these coordinates: {input['latitude']}, {input['longitude']}"
    data.append({'id': str(input['_id']), 'text': text})


In [80]:
data[2]

{'id': '66f3cdeb03cab8b36d109d14',
 'text': 'A car was detected with this license plate number nanovertaking in طريق المطار road at 20:44:53 on 2024-09-10 at these coordinates: 24.916405, 46.723884'}

In [68]:
print(type(data[0]))
print(type(inputs))

<class 'dict'>
<class 'list'>


## initialize a Pinecone client

In [81]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.getenv('pinecone_API_KEY'))

## create new serverless index

In [17]:
# Create Index
index_name = "violation-data5"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

index = pc.Index(index_name)


## create vector embedings

In [82]:
import openai
openai.api_key = os.getenv('OPENAI_API_KEY') 


def embed(docs: list[str]) -> list[list[float]]:
    res = openai.embeddings.create(
        input=docs,
        model="text-embedding-3-large"
    )
    doc_embeds = [r.embedding for r in res.data] 
    return doc_embeds 


doc_embeds = embed([d["text"] for d in data])

## upsert the data

In [83]:
inputs[0]

{'_id': ObjectId('66f3cdeb03cab8b36d109d12'),
 'date': '2024-09-10',
 'time': '14:38:35',
 'license_plate_number': '1063 ~ad_',
 'vehicle_type': 'car',
 'violation_type': 'overtaking',
 'latitude': 24.921224,
 'longitude': 46.722133,
 'street_name': 'طريق المطار'}

In [84]:
vectors = []
# f"A {inputs[i]['vehicle_type']} was detected {inputs[i]['violation_type']}
#  in {inputs[i]['street_name']} road at {inputs[i]['time']} on {inputs[i]['date']} 
#  at these coordinates: {inputs[i]['latitude']}, {inputs[i]['longitude']}
for d, e in zip(inputs, doc_embeds):
    vectors.append({
        "id": str(d['_id']),
        "values": e,
        "metadata": {'vehicle_type': d['vehicle_type'], 'violation_type': d['violation_type'], 'street_name': d['street_name'], 'time': d['time'], 'date': d['date'], 'latitude': d['latitude'], 'longitude': d['longitude']}
    })

index.upsert(
    vectors=vectors,
    namespace="violation-data-day1"
)


{'upserted_count': 74}

## check index status

In [12]:
print(index.describe_index_stats())

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'violation-data-day1': {'vector_count': 74}},
 'total_vector_count': 74}


### Query

In [15]:
query = "مخالفة برقم اللوحة 7862"

x = embed([query])

results = index.query(
    namespace="violation-data-day1",
    vector=x[0],
    top_k=3,
    include_values=False,
    include_metadata=True,
    
)

print(results)

{'matches': [{'id': '66f22b85895db7f65810cabb',
              'metadata': {'date': '2024-09-22',
                           'latitude': 46.666884,
                           'longitude': 24.828788,
                           'street_name': 'انس بن مالك',
                           'time': '20:45:09',
                           'vehicle_type': 'car',
                           'violation_type': 'overtaking'},
              'score': 0.356940389,
              'values': []},
             {'id': '66f22b85895db7f65810ca96',
              'metadata': {'date': '2024-09-22',
                           'latitude': 46.666884,
                           'longitude': 24.828788,
                           'street_name': 'انس بن مالك',
                           'time': '20:45:09',
                           'vehicle_type': 'car',
                           'violation_type': 'overtaking'},
              'score': 0.355448663,
              'values': []},
             {'id': '66f22b85895db7f65810ca9d'