In [None]:
from pymongo import MongoClient
import os
from sentence_transformers import SentenceTransformer
import math

In [17]:
def normalize(vector):
    """Scale ``vector`` to unit Euclidean (L2) length.

    Args:
        vector: Iterable-and-reiterable sequence of numbers (e.g. a list).

    Returns:
        A new list with every component divided by the L2 norm. When the
        norm is zero (all-zero or empty input) the input object itself is
        returned unchanged, since there is nothing to scale.
    """
    length = math.sqrt(sum(component * component for component in vector))

    # Only divide when the norm is non-zero; otherwise hand the input back.
    if length != 0:
        return [component / length for component in vector]
    return vector

In [None]:
# --- MongoDB connection -------------------------------------------------
# The connection string comes from the environment so that credentials are
# never stored in the notebook. os.getenv returns None when it is unset.
MONGODB_URI = os.getenv("MONGODB_URI")
client = MongoClient(MONGODB_URI)

collection = client["bbc_news"]["news_articles"]

# --- Embedding model ----------------------------------------------------
model_path = "./model"

# Only download and save the model when no local copy is present, so that
# "Restart & Run All" does not re-fetch and re-write it every time.
# NOTE(review): "modules.json" is assumed to be the marker file that
# SentenceTransformer.save() writes -- confirm for the installed version.
# If the marker is missing this simply falls back to download-and-save.
if not os.path.isfile(os.path.join(model_path, "modules.json")):
    SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1").save(model_path)

# Always work with the copy loaded from disk.
model = SentenceTransformer(model_path)

In [19]:
def get_embedding(text): # pyright: ignore[reportUnknownParameterType]
    """Encode ``text`` with the module-level ``model``.

    Returns whatever ``model.encode(text).tolist()`` produces, i.e. the
    encoder output converted to plain Python lists.
    """
    encoded = model.encode(text)
    return encoded.tolist()

In [20]:
# Select articles that have a non-empty title and description but no stored
# embedding yet.
# NOTE: the name `filter` shadows the Python built-in; it is kept because
# other cells may refer to it.
filter = {
    '$and': [
        {'title': {'$exists': True, "$nin": [None, ""]}},
        {'description': {'$exists': True, "$nin": [None, ""]}},
        {'embeddings': {'$exists': False}}
    ]
}

updated_doc_count = 0

# Process at most 20 documents per run. Because the filter excludes documents
# that already have embeddings, re-running the cell picks up the next batch.
# Only `description` (plus the default `_id`) is fetched, since nothing else
# is read in the loop.
for document in collection.find(filter, {'description': 1}).limit(20):
    text = document['description']
    embedding = normalize(get_embedding(text))
    # No upsert: the document was just read by _id, and upserting would insert
    # a stub holding only _id + embeddings if it were deleted in the meantime.
    result = collection.update_one(
        {'_id': document['_id']},
        {"$set": {'embeddings': embedding}},
    )
    # Count what the server actually modified rather than loop iterations.
    updated_doc_count += result.modified_count

print("Documents updated: {}".format(updated_doc_count))

Documents updated: 20


In [21]:
from pymongo.operations import SearchIndexModel

# Define the vector search index over the stored embeddings, then create it
# only if an index with that name does not already exist. This keeps the
# cell safe to re-run.
search_index_name = "vector_index"

search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                # Must equal the length of the vectors stored under "embeddings".
                "numDimensions": 1024,
                "path": "embeddings",
                "similarity": "cosine"
            }
        ]
    },
    name=search_index_name,
    type="vectorSearch"
)

# list_search_indexes(name) yields only indexes with that name; an empty
# result means the index still has to be created.
if not list(collection.list_search_indexes(search_index_name)):
    collection.create_search_index(model=search_index_model)

# Display the index name as the cell result, as before.
search_index_name


'vector_index'

In [22]:
def get_query_results(query, limit=20):
    """Run a vector search for ``query`` and return the matching articles.

    The query text is embedded with ``get_embedding`` and sent to the
    ``vector_index`` search index on the ``embeddings`` field, using
    ``"exact": True``.

    Args:
        query: Text to search for.
        limit: Maximum number of documents to return (default 20, the
            previously hard-coded value).

    Returns:
        A list of dicts with ``title``, ``link``, ``description`` and the
        ``score`` reported by ``$vectorSearch`` (fields missing from a
        document are simply absent). ``_id`` is excluded.
    """
    query_embedding = get_embedding(query)

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embeddings",
                "exact": True,
                "limit": limit
            }
        }, {
            "$project": {
                "_id": 0,
                "title": 1,
                "link": 1,
                # The prompt-building cell reads "description" from each
                # result, so it has to be part of the projection.
                "description": 1,
                "score": {
                    "$meta": "vectorSearchScore"
                }
            }
        }
    ]

    # Materialize the cursor so callers get a plain, reusable list.
    return list(collection.aggregate(pipeline))

In [None]:
from gpt4all import GPT4All
# Name of the model file handed to GPT4All for local text generation.
llm_model_file = "mistral-7b-openorca.gguf2.Q4_0.gguf"
local_llm = GPT4All(llm_model_file)


Downloading: 100%|██████████| 4.11G/4.11G [01:30<00:00, 45.2MiB/s]
Verifying: 100%|██████████| 4.11G/4.11G [00:07<00:00, 585MiB/s]


In [None]:
question = "Can you please get any news articles about Queen Elizabeth?"
documents = get_query_results(question)

# Build one context line per retrieved article. Title and link are labelled
# as what they are; the description is appended only when the search result
# actually carries one.
context_lines = []
for doc in documents:
    title = doc.get("title", "")
    link = doc.get("link", "")
    description = doc.get("description", "")
    line = f"Title: {title} Link: {link}."
    if description:
        line += f" Description: {description}"
    context_lines.append(line + " \n")
text_documents = "".join(context_lines)

prompt = f"""Use the following pieces of context to answer the question at the end.
    {text_documents}
    Question: {question}
"""
response = local_llm.generate(prompt)
# Turn literal backslash-n sequences in the model output into real newlines.
cleaned_response = response.replace('\\n', '\n')
print(cleaned_response)