***MongoDB Schema Vector Setup***

link: https://cloud.mongodb.com/

**Loading packages, libraries and secrets into the notebook**

In [None]:
%run Setup.ipynb

In [None]:
# In Google Colab, Google Drive can be mounted as follows to access the documents:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
# Read the schema CSV and expose it to the rest of the notebook as a pandas DataFrame
# Relative path, so it is resolved against the notebook's working directory
dataset_path = "DB_schema_testing.csv"
print("Dataset Path:", dataset_path)

# Only attempt the load when the CSV is really there; otherwise stop with a clear error
if os.path.isfile(dataset_path):
    # The generic "csv" loader places the file's rows in the "train" split
    dataset = load_dataset(path='csv', data_files=dataset_path)
else:
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# Materialise the "train" split as a DataFrame for the following cells
dataset_df = pd.DataFrame(data=dataset["train"])

# Show the first five rows as a sanity check
print(dataset_df.head(n=5))

Dataset Path: DB_schema_testing.csv


Generating train split: 0 examples [00:00, ? examples/s]

   Column_name  Table_name DB_name            Lookup_name
0       CONTID  CONTINENTS   car_1      CONTID CONTINENTS
1    CONTINENT  CONTINENTS   car_1   CONTINENT CONTINENTS
2    COUNTRYID   COUNTRIES   car_1    COUNTRYID COUNTRIES
3  COUNTRYNAME   COUNTRIES   car_1  COUNTRYNAME COUNTRIES
4    CONTINENT   COUNTRIES   car_1    CONTINENT COUNTRIES


In [5]:
# Setting the embedding model and getting the embeddings for the dataframe
# The model is created once at cell level; get_embedding() below reads this
# module-level name on every call instead of building its own model.
embedding_model = SentenceTransformer("thenlper/gte-large")
def get_embedding(text: str) -> list[float]:
    """Return the embedding of ``text`` as a plain Python list of floats.

    Args:
        text: The string to embed. A missing value (``None`` or a float
            NaN) is treated exactly like empty / whitespace-only text
            instead of failing on ``.strip()``.

    Returns:
        The result of ``embedding_model.encode(text)`` converted with
        ``.tolist()``, or an empty list when there is nothing to embed
        (a notice is printed in that case).
    """
    # Missing cells may arrive as None or NaN rather than "", and neither has
    # a .strip() method, so they are detected before the whitespace check.
    # `text != text` is true only for NaN.
    is_missing = text is None or (isinstance(text, float) and text != text)
    if is_missing or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    # .tolist() turns the encoder output into a plain list, matching the
    # declared return type.
    return embedding.tolist()
# Embed every value of the "Lookup_name" column, one get_embedding() call per
# row, and store the result in the "embedding" column. Rows that hit
# get_embedding's empty-text branch receive [] rather than a vector.
dataset_df["embedding"] = dataset_df["Lookup_name"].apply(get_embedding)

In [6]:
# Connect to MongoDB and bind the collection used by the following cells
client = MongoClient(MONGO_URI_Schema_All)
dbName = "MVector"
collectionName = "MTSchemaAll"
collection = client.get_database(dbName).get_collection(collectionName)
# Not used in the cells shown here; presumably the name of the vector search index (confirm)
index_name = "vector_index_schema_all"

# Round-trip a ping to check the connection; a failure is printed, not re-raised
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as exc:
    print(exc)

Pinged your deployment. You successfully connected to MongoDB!


In [7]:
# Delete any existing records in the collection before loading the new data
# WARNING: the empty filter {} matches every document, so this empties the
# whole collection. As the last expression of the cell, the returned
# DeleteResult is displayed as the cell output.
collection.delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff00000000000000b4'), 'opTime': {'ts': Timestamp(1726496420, 25), 't': 180}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1726496420, 25), 'signature': {'hash': b'\xc1\xca\x10\x9ck\x87\xd4\x82@g\xda`i\xe5\x16\xe6\xe05\xff\xe2', 'keyId': 7351804200415657986}}, 'operationTime': Timestamp(1726496420, 25)}, acknowledged=True)

In [8]:
# Load the prepared rows (including their embeddings) into the MongoDB collection
# orient="records" produces one dict per DataFrame row, keyed by column name
documents = dataset_df.to_dict(orient="records")
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed
