In [20]:
# Name of the vector search index, passed as "index" to the $vectorSearch stage.
VECTOR_INDEX_NAME = "title_body_vector_index"
# Document field that stores the title+body embedding; passed as "path" to
# the $vectorSearch stage.
INDEX_FIELD = "title_body_embedding"

In [5]:
from dotenv import load_dotenv
import os
import pymongo


load_dotenv()

# Fail fast when the connection string is missing: MongoClient(None) would
# silently fall back to localhost instead of the intended cluster.
mongodb_uri = os.getenv("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; define it in the environment or in a .env file"
    )

# connect to your Atlas cluster
# NOTE: a later cell rebinds `client` to the OpenAI client, so only `db` and
# `collection` should be relied on after this cell.
client = pymongo.MongoClient(mongodb_uri)

db = client["blogdb"]

collection = db["ai_news"]

# Sanity check: print one document to confirm connectivity and show the schema.
print(collection.find_one())


{'_id': ObjectId('667d1ef6fc45eb48396a6a06'), 'date': datetime.datetime(2024, 6, 25, 21, 49), 'title': "Boston scientists create AI model to 'catch Alzheimer's disease early'", 'body': "Researchers say they've created a promising AI model that predicts the likelihood of someone developing Alzheimer's early. The Boston University researchers on Tuesday announced that they designed the new artificial intelligence computer program — which identifies those with mild cognitive impairment who are likely to develop Alzheimer's within six years.", 'url': 'https://www.msn.com/en-us/health/other/boston-scientists-create-ai-model-to-catch-alzheimer-s-disease-early/ar-BB1oT292', 'image': 'https://img-s-msn-com.akamaized.net/tenant/amp/entityid/BB1oSZDb.img?w=4000&h=2609&m=4&q=79', 'source': 'Tribune News Service on MSN.com', 'found_at': datetime.datetime(2024, 6, 26, 9, 51, 56, 553000)}


# Add vectors to existing data

In [13]:
# Embedding model name sent with every embeddings request.
MODEL = "text-embedding-3-small"
# Vector length requested for each embedding (passed as `dimensions`).
# NOTE(review): presumably must match the dimension count configured on the
# vector search index — confirm against the index definition.
DIMENSIONS = 256


In [18]:
from openai import OpenAI

# Alias for a single embedding vector.
Embedding = list[float]

# OpenAI client used by the embedding helpers below. No credentials are passed
# explicitly — presumably they come from the environment; confirm.
# NOTE: this rebinds `client`, which held the MongoClient in the connection cell.
client = OpenAI()


def get_embeddings_batch(texts: list[str], dimensions: int) -> list[Embedding]:
    """Embed a batch of texts with a single embeddings request.

    Args:
        texts: Strings to embed; at most 2048 per call.
        dimensions: Vector length requested for each embedding.

    Returns:
        One embedding per input text, in the same order as `texts`. An empty
        input returns an empty list without issuing a request.

    Raises:
        AssertionError: If more than 2048 texts are passed.
    """
    assert len(texts) <= 2048, "The number of texts should be at most 2048"

    # Nothing to embed: skip the request instead of sending an empty input.
    if not texts:
        return []

    response = client.embeddings.create(input=texts, model=MODEL, dimensions=dimensions)

    # Each returned item carries the index of the input it belongs to; sort on
    # it so the result lines up with `texts` regardless of response ordering.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

def get_embedding(text: str, dimensions: int) -> Embedding:
    """Embed a single text by sending it as a one-element batch.

    Args:
        text: The string to embed.
        dimensions: Vector length requested for the embedding.

    Returns:
        The first embedding returned for the one-element batch.
    """
    single_item_batch = get_embeddings_batch([text], dimensions)
    return single_item_batch[0]


In [17]:
from typing import List, Dict, Any
from pymongo.operations import UpdateOne
from openai import OpenAI

from openai import OpenAI

# Re-creates the OpenAI client; the same name is also bound in the cell that
# defines the embedding helpers.
client = OpenAI()

# Number of documents fetched and embedded per iteration. Stays under the
# 2048-text cap asserted by get_embeddings_batch.
BATCH_SIZE: int = 1000


def process_batch(batch: List[Dict[str, Any]]) -> None:
    """Embed one batch of documents and write the vectors back to MongoDB.

    Args:
        batch: Documents containing at least "_id", "title" and "body".
            An empty batch is a no-op.

    Raises:
        KeyError: If a document lacks "title" or "body".
        ValueError: If the number of embeddings returned differs from the
            number of documents in the batch.
    """
    # An empty batch would otherwise be forwarded as an empty embeddings request.
    if not batch:
        return

    # Extract text from documents (combining title and body)
    texts: List[str] = [f"{doc['title']} - {doc['body']}" for doc in batch]

    # Generate embeddings
    embeddings: List[Embedding] = get_embeddings_batch(texts, dimensions=DIMENSIONS)

    # Write to INDEX_FIELD so the stored field is the same one the vector
    # search reads as its path. strict=True makes a short response fail loudly
    # instead of silently leaving the tail of the batch without embeddings.
    bulk_operations: List[UpdateOne] = [
        UpdateOne({"_id": doc["_id"]}, {"$set": {INDEX_FIELD: embedding}})
        for doc, embedding in zip(batch, embeddings, strict=True)
    ]

    # Execute bulk update (never empty here: the batch is non-empty and the
    # strict zip guarantees one operation per document).
    collection.bulk_write(bulk_operations)


def update_embeddings(limit: int | None = None) -> None:
    # Query for documents without title_body_embedding
    query = {
        "$or": [
            {"title_body_embedding": {"$exists": False}},
            {"title_body_embedding": None},
            {"title_body_embedding": []},
        ]
    }

    # Get total number of documents to update, respecting the limit
    total_docs: int = (
        min(collection.count_documents(query), limit)
        if limit
        else collection.count_documents(query)
    )
    processed_docs: int = 0

    while processed_docs < total_docs:
        # Calculate remaining documents to process
        remaining_docs = total_docs - processed_docs
        current_batch_size = min(BATCH_SIZE, remaining_docs)

        # Fetch a batch of documents
        cursor = collection.find(query, {"title": 1, "body": 1}).limit(
            current_batch_size
        )
        batch: List[Dict[str, Any]] = list(cursor)

        if not batch:
            break

        # Process the batch
        process_batch(batch)

        processed_docs += len(batch)
        print(f"Processed {processed_docs}/{total_docs} documents")


# Backfill embeddings for every document that still lacks one (no limit).
update_embeddings()

Processed 1000/13901 documents
Processed 2000/13901 documents
Processed 3000/13901 documents
Processed 4000/13901 documents
Processed 5000/13901 documents
Processed 6000/13901 documents
Processed 7000/13901 documents
Processed 8000/13901 documents
Processed 9000/13901 documents
Processed 10000/13901 documents
Processed 11000/13901 documents
Processed 12000/13901 documents
Processed 13000/13901 documents
Processed 13901/13901 documents


# RAG: retrieve relevant articles with vector search

In [55]:
QUERY = "AI is good at math"

# Embed the query with the same dimensions as the stored document vectors.
query_embedding = get_embedding(QUERY, dimensions=DIMENSIONS)

# Number of results to return, and the number of nearest-neighbour candidates
# considered before picking them. Both are fed into the pipeline below so the
# constants and the query cannot drift apart.
LIMIT = 5
NUM_CANDIDATES = 20 * LIMIT  # should be between 10 and 20 times the limit

# Sample vector search pipeline
pipeline = [
    {
        "$vectorSearch": {
            "index": VECTOR_INDEX_NAME,
            "queryVector": query_embedding,
            "path": INDEX_FIELD,
            "numCandidates": NUM_CANDIDATES,
            "limit": LIMIT,
        }
    },
    {
        # Return only the display fields plus the similarity score; the stored
        # embedding itself is left out of the results.
        "$project": {
            "_id": 0,
            "title": 1,
            "date": 1,
            "url": 1,
            "body": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    },
]

results = collection.aggregate(pipeline)

for doc in results:
    print(f"Title: {doc['title']}\nDate: {doc['date']} \nURL: {doc['url']}\nScore: {doc['score']}\n\n{doc['body']}\n\n\n\n")

Title: AI Will Become Mathematicians' 'Co-Pilot'
Date: 2024-06-08 00:00:00 
URL: https://www.scientificamerican.com/article/ai-will-become-mathematicians-co-pilot/
Score: 0.8364213705062866

You even talked about getting that factor down to less than one. With AI, there's a real potential of doing that. I think in the future, instead of typing up our proofs, we would explain them to ...




Title: AI isn't dumb, but it might be dumber than you think
Date: 2024-06-25 16:30:00 
URL: https://www.msn.com/en-au/news/techandscience/ai-isn-t-dumb-but-it-might-be-dumber-than-you-think/ar-BB1oS8xW
Score: 0.831683874130249

I recently couldn't remember the name of a pastry and tried describing it to three different AI chatbots. It didn't work. (The pastry was a frangipane tart.) Disappointments like those don't mean AI is useless.




Title: AI isn't dumb, but it might be dumber than you think
Date: 2024-06-25 16:30:00 
URL: https://www.msn.com/en-gb/news/techandscience/ai-isn-t-dumb-but-it-migh