**MongoDB Vector Setup**

MongoDB Cloud console: https://cloud.mongodb.com/

In [None]:
# Install the packages this notebook imports (dataset loading, dataframes,
# the MongoDB driver, the embedding model library and .env support).
%pip install datasets pandas pymongo sentence_transformers python-dotenv
# Upgrade transformers to the latest available release.
%pip install -U transformers
# Install accelerate only if you are running on a GPU.
%pip install accelerate

In [None]:
# Importing the required libraries
import os
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer # https://huggingface.co/thenlper/gte-large
from pymongo.mongo_client import MongoClient

In [None]:
# Resolve the MongoDB connection string from whichever secret store is available.
# Exactly one branch runs, so the value read in Colab is never overwritten by the
# local lookup, and the Colab-only import never breaks a local run.
try:
    # google.colab is only importable inside a Google Colab runtime
    from google.colab import userdata
except ImportError:
    # Local environment: load variables from a .env file (if present) and
    # read the secret from the process environment
    load_dotenv()
    MONGO_URI = os.getenv("MONGO_URI")
else:
    # Google Colab: read the secret stored under the name 'MONGO_URI'
    MONGO_URI = userdata.get('MONGO_URI')

# Fail early with a clear message instead of handing an empty URI to the
# MongoDB client in a later cell.
if not MONGO_URI:
    raise RuntimeError(
        "MONGO_URI is not set. Add it as a Google Colab secret or define it "
        "in your environment / .env file."
    )

In [None]:
# In Google Colab, mount Google Drive so documents stored there can be read.
# The import is guarded because google.colab does not exist outside Colab;
# in a local environment the mount step is simply skipped.
try:
    from google.colab import drive
except ImportError:
    print("Not running in Google Colab; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

In [None]:
# Load the CSV dataset and expose it as a pandas dataframe
# Location of the CSV file (inside the mounted Google Drive)
dataset_path = "/content/drive/MyDrive/ColabNotebooks/SpiderTrain_complete.csv"
print("Dataset Path:", dataset_path)

# Only load when the file is really there; otherwise stop with a clear error
if os.path.isfile(dataset_path):
    dataset = load_dataset('csv', data_files=dataset_path)
else:
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# The CSV loader places the rows under the "train" split; turn that split into a dataframe
train_split = dataset["train"]
dataset_df = pd.DataFrame(train_split)

# Show the first rows as a quick sanity check
print(dataset_df.head())

In [None]:
# Load the embedding model and compute an embedding for every row's "Query" text
embedding_model = SentenceTransformer("thenlper/gte-large")


def get_embedding(text: str) -> list[float]:
    """Return the embedding of ``text`` as a plain list of floats.

    Args:
        text: The text to embed. Non-string values (for example the ``None``/NaN
            that pandas produces for empty CSV cells) are treated like empty text.

    Returns:
        The embedding produced by ``embedding_model`` converted to a list, or an
        empty list when ``text`` is missing or contains only whitespace.
    """
    # Check the type first: calling .strip() on a non-string (e.g. float NaN)
    # would raise AttributeError and abort the whole .apply() below.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    # Convert the model output to a plain Python list, matching the declared return type
    return embedding.tolist()


dataset_df["embedding"] = dataset_df["Query"].apply(get_embedding)

In [None]:
# Test if MongoDB is connected
# MONGO_URI already holds the connection string itself, so it is used directly.
# (Passing it to userdata.get() would look up a secret *named* after the URI,
# and userdata does not exist outside Google Colab.)
uri = MONGO_URI

# Create a new client and connect to the server
client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # The failure is reported but not re-raised: this cell is only a connectivity check
    print(e)

In [None]:
# Delete any existing records in the collection before loading the new data.
# The filter passed to delete_many is an empty dict ({}).
# NOTE(review): `collection` is not assigned in any cell visible in this notebook.
# It has to be bound to a pymongo collection (e.g. client[<database>][<collection>])
# before this cell runs, otherwise it raises NameError. TODO confirm where it is defined.
collection.delete_many({})

In [None]:
# Turn every dataframe row into one dict (column name -> value) and
# write all of them to the collection in a single bulk insert
documents = dataset_df.to_dict(orient="records")
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")
print("Data ingestion into MongoDB completed")