***MongoDB Schema Vector Setup***

MongoDB Atlas console: https://cloud.mongodb.com/

**Loading packages, libraries and secrets into notebook**

In [1]:
# Importing the required libraries
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer # https://huggingface.co/thenlper/gte-large
import os
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd
from pymongo.mongo_client import MongoClient

  from tqdm.autonotebook import tqdm, trange





In [2]:
# Accessing the secrets from the environment variables
load_dotenv()  # pulls key=value pairs from a .env file (if present) into the environment
MONGO_URI_schema = os.getenv("MONGO_URI_Schema")
HF_Token = os.getenv("HF_TOKEN")  # None when HF_TOKEN is unset

# Fail fast on a missing connection string: os.getenv returns None when the
# variable is unset, and MongoClient(None) silently falls back to
# localhost:27017, so later cells would delete/insert against the wrong server.
if not MONGO_URI_schema:
    raise RuntimeError(
        "MONGO_URI_Schema is not set; define it in the environment or in the "
        ".env file before running the rest of the notebook."
    )

In [3]:
# Read the schema CSV and expose it as a pandas dataframe
dataset_path = "DB_schema_testing.csv"  # resolved relative to the notebook's working directory
print("Dataset Path:", dataset_path)

# Only attempt the load when the CSV is actually there; otherwise stop with a clear error
if os.path.isfile(dataset_path):
    dataset = load_dataset('csv', data_files=dataset_path)
else:
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# The rows are taken from the "train" split of the loaded dataset
train_split = dataset["train"]
dataset_df = pd.DataFrame(train_split)

# Quick visual sanity check of the first rows
print(dataset_df.head())

Dataset Path: DB_schema_testing.csv
   Column_name  Table_name DB_name                  Lookup_name
0       CONTID  CONTINENTS   car_1      car_1 CONTINENTS CONTID
1    CONTINENT  CONTINENTS   car_1   car_1 CONTINENTS CONTINENT
2    COUNTRYID   COUNTRIES   car_1    car_1 COUNTRIES COUNTRYID
3  COUNTRYNAME   COUNTRIES   car_1  car_1 COUNTRIES COUNTRYNAME
4    CONTINENT   COUNTRIES   car_1    car_1 COUNTRIES CONTINENT


In [4]:
# Load the sentence-embedding model that turns lookup names into vectors
EMBEDDING_MODEL_NAME = "thenlper/gte-large"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
def get_embedding(text: str) -> list[float]:
    """Return the embedding of ``text`` as a plain list of floats.

    Args:
        text: The string to embed. Missing values (``None``, ``NaN``, or any
            other non-string) are treated the same as empty text.

    Returns:
        The vector produced by ``embedding_model.encode`` converted with
        ``.tolist()``, or an empty list when there is no usable text.
    """
    # Missing CSV cells reach this function as None/NaN rather than "", and
    # calling .strip() on them would raise AttributeError, so check the type first.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    return embedding.tolist()
# Embed every lookup name, one row at a time, and store the vectors in a new column
dataset_df["embedding"] = dataset_df["Lookup_name"].map(get_embedding)

In [5]:
# MongoDB setup: open the client and resolve the target collection
client = MongoClient(MONGO_URI_schema)
dbName = "MVector"
collectionName = "MTSchemaAll"
collection = client[dbName][collectionName]
index_name = "vector_index_schema_all"  # not used in this cell; presumably the Atlas vector index name — confirm

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
except Exception as e:
    # Stop the notebook here instead of printing and carrying on: the following
    # cells wipe and reload the collection and must not run without a verified
    # connection.
    raise ConnectionError(f"Unable to reach the MongoDB deployment: {e}") from e
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


In [6]:
# Empty the collection first so that re-running the load does not duplicate documents
deletion_result = collection.delete_many({})
deletion_result

DeleteResult({'n': 488, 'electionId': ObjectId('7fffffff00000000000000b7'), 'opTime': {'ts': Timestamp(1727437251, 492), 't': 183}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1727437251, 499), 'signature': {'hash': b'\xd8q\\L\xb8\x15^.\xbc\xa7\x89\xe0\xb9\xab`\xb5\x0e\x81/^', 'keyId': 7385925018342916097}}, 'operationTime': Timestamp(1727437251, 492)}, acknowledged=True)

In [7]:
# Turn each dataframe row into one document (a dict keyed by column name)
documents = dataset_df.to_dict(orient="records")

# Bulk-load all documents into the collection in a single call
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed
