***MongoDB Vector Setup***

link: https://cloud.mongodb.com/

**Loading packages, libraries and secrets into the notebook**

In [2]:
# Execute the Setup notebook before anything else. The visible cells below use names
# they never define themselves (e.g. os, pd, load_dataset, SentenceTransformer,
# MongoClient, MONGO_URI_SQL) -- presumably Setup.ipynb provides them; TODO confirm.
%run Setup.ipynb


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
# In Google Colab, the Google Drive can be mounted as follows to access documents
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
# Read the CSV dataset from disk and expose it as a pandas dataframe
# Relative path: resolved against the current working directory
dataset_path = "SpiderTrain_complete.csv"
print("Dataset Path:", dataset_path)

# Stop right away with an explicit error when the path is not an existing file
if not os.path.isfile(dataset_path):
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# Parse the CSV with the `datasets` loader; the rows are read back from the "train" split
dataset = load_dataset("csv", data_files=dataset_path)

# Materialise that split as a dataframe for the cells that follow
dataset_df = pd.DataFrame(data=dataset["train"])

# Show the first five rows as a quick sanity check
print(dataset_df.head(n=5))

Dataset Path: SpiderTrain_complete.csv


Generating train split: 0 examples [00:00, ? examples/s]

                                               Query  \
0         SELECT count(*) FROM head WHERE age  >  56   
1  SELECT name ,  born_state ,  age FROM head ORD...   
2  SELECT creation ,  name ,  budget_in_billions ...   
3  SELECT max(budget_in_billions) ,  min(budget_i...   
4  SELECT avg(num_employees) FROM department WHER...   

                                            Question  
0  How many heads of the departments are older th...  
1  List the name, born state and age of the heads...  
2  List the creation year, name and budget of eac...  
3  What are the maximum and minimum budget of the...  
4  What is the average number of employees of the...  


In [5]:
# Instantiate the sentence-embedding model that turns dataframe text into vectors
EMBEDDING_MODEL_NAME = "thenlper/gte-large"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
def get_embedding(text: str) -> list[float]:
    """Return the embedding of ``text`` as a plain Python list of floats.

    Args:
        text: The string to embed. Non-string values (for example ``None`` or
            ``NaN``, which blank CSV cells can turn into) are treated the same
            way as empty text instead of raising ``AttributeError``.

    Returns:
        The vector produced by ``embedding_model.encode`` converted with
        ``.tolist()``, or an empty list when there is no text to embed.
    """
    # Guard against missing values as well as empty/whitespace-only strings:
    # calling .strip() on a non-string would otherwise abort the whole .apply().
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    # Convert to a list so the value can be stored as a regular document field.
    return embedding.tolist()
# Embed every value of the "Query" column and keep the vectors in a new "embedding" column
dataset_df["embedding"] = dataset_df["Query"].map(get_embedding)

In [6]:
# Open the MongoDB client and pick the database/collection that will hold the vectors
client = MongoClient(MONGO_URI_SQL)
dbName = "MVector"
collectionName = "MTSQL"
# Not referenced in the visible cells -- presumably the vector search index name used later; TODO confirm
index_name = "vector_index_sql"
collection = client.get_database(dbName).get_collection(collectionName)

# Ping the deployment to verify the connection; a failure is printed rather than raised
try:
    client.admin.command("ping")
except Exception as ping_error:
    print(ping_error)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


In [None]:
# Empty the collection first: an empty filter matches every stored document
collection.delete_many(filter={})

In [7]:
# Turn each dataframe row into a dict (one document per row) ...
documents = dataset_df.to_dict(orient="records")
# ... and write all of them to the collection in a single bulk insert
collection.insert_many(documents=documents)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed
