If your existing dataset is in CSV format, you can import it into MongoDB using the following Python script:

In [1]:
import csv
from pymongo import MongoClient

# Function to read CSV file and convert it to a list of dictionaries
def read_csv(filename, encoding=None):
    """Read a CSV file into a list of row dictionaries.

    The first row of the file is used as the header; every following row
    becomes one ``dict`` mapping column name to the cell's string value.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one ``dict`` per data row, in file order.
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields are preserved exactly as written.
    with open(filename, 'r', newline='', encoding=encoding) as file:
        return [dict(row) for row in csv.DictReader(file)]

# Function to insert data into MongoDB
def insert_into_mongodb(data, collection_name):
    """Insert a list of documents into a collection of the 'dripcheck' database.

    Args:
        data: List of dictionaries, one per document to insert.
        collection_name: Name of the target collection.

    Returns:
        None. When ``data`` is empty nothing is inserted and no connection
        is opened.
    """
    # insert_many rejects an empty document list, so an empty CSV would
    # otherwise crash the import; skip the round trip entirely.
    if not data:
        return

    # The context manager closes the client even if the insert fails.
    with MongoClient('mongodb://localhost:27017/') as client:  # Connect to MongoDB
        db = client['dripcheck']  # Specify your database name
        collection = db[collection_name]  # Specify your collection name
        collection.insert_many(data)  # Insert data into collection

# Load every row of 'processed_dataset.csv' and store the rows in the
# 'products' collection; change both names to match your own data.
csv_data = read_csv(filename='processed_dataset.csv')
insert_into_mongodb(data=csv_data, collection_name='products')


Since MongoDB already assigns an `_id` to every product document, I removed the `id` field that I had assigned to them, using this code:

In [1]:
from pymongo import MongoClient

# Replace the following with your MongoDB connection details
MONGO_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "dripcheck"
COLLECTION_NAME = "products"

# Connect to MongoDB. The context manager closes the connection when the
# block ends, including when the update raises.
with MongoClient(MONGO_URI) as client:
    db = client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]

    # Use update_many with $unset to remove the custom "id" field. Only
    # documents that still carry the field are matched; MongoDB's own "_id"
    # is not touched. The "" value is a placeholder that $unset ignores.
    result = collection.update_many(
        {"id": {"$exists": True}},
        {"$unset": {"id": ""}},
    )

# Print the result
print(f"Modified {result.modified_count} documents to remove the 'id' field.")


Modified 3180 documents to remove the 'id' field.


Now there's another problem with our database: the price and the year are stored as strings. We want to convert them to numeric types (price to a double, year to an integer):

In [4]:
from pymongo import MongoClient

# Replace the following with your MongoDB connection details
MONGO_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "dripcheck"
COLLECTION_NAME = "products"

# Connect to MongoDB. The context manager closes the connection when the
# block ends, including when one of the updates raises.
with MongoClient(MONGO_URI) as client:
    db = client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]

    # Update price to numeric data type (double). Values that cannot be
    # converted are replaced by 0 through the onError fallback.
    result_price = collection.update_many(
        {},
        [{"$set": {"price": {"$convert": {"input": "$price", "to": "double", "onError": 0}}}}],
    )

    # Update year to numeric data type (integer). Unlike the price
    # conversion there is no onError fallback here, so a year that cannot be
    # converted makes this update fail instead of being silently replaced.
    result_year = collection.update_many({}, [{"$set": {"year": {"$toInt": "$year"}}}])

# Print the result
print(f"Modified {result_price.modified_count} documents to convert 'price' to numeric.")
print(f"Modified {result_year.modified_count} documents to convert 'year' to numeric.")

Modified 3180 documents to convert 'price' to numeric.
Modified 3180 documents to convert 'year' to numeric.


We can also extract the distinct values of masterCategory, subCategory and articleType into a text file with the code below:

In [5]:
from pymongo import MongoClient

# Replace the following with your MongoDB connection details
MONGO_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "dripcheck"
COLLECTION_NAME = "products"

# Connect to MongoDB only for as long as the queries run. The context
# manager closes the connection even if one of the queries raises.
with MongoClient(MONGO_URI) as client:
    db = client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]

    # Get distinct values for the required fields
    master_categories = collection.distinct("masterCategory")
    sub_categories = collection.distinct("subCategory")
    article_types = collection.distinct("articleType")

# Each section is a heading followed by one value per line; every heading
# after the first starts with a blank line that separates the sections.
sections = [
    ("Master Categories:\n", master_categories),
    ("\nSub Categories:\n", sub_categories),
    ("\nArticle Types:\n", article_types),
]

# Write the distinct values to a text file. The encoding is fixed to UTF-8 so
# non-ASCII names are written the same way on every platform.
with open('categories.txt', 'w', encoding='utf-8') as file:
    for heading, values in sections:
        file.write(heading)
        file.writelines(f"{item}\n" for item in values)


In [6]:
from pymongo import MongoClient
from bson.objectid import ObjectId

# Connection settings: adjust these to point at your own MongoDB deployment
MONGO_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "dripcheck"
PRODUCTS_COLLECTION_NAME = "products"
CATEGORY_COLLECTION_NAME = "categories"

# Open the client, then resolve the database and both collection handles
client = MongoClient(MONGO_URI)
db = client[DATABASE_NAME]
products_collection, category_collection = (
    db[PRODUCTS_COLLECTION_NAME],
    db[CATEGORY_COLLECTION_NAME],
)

def get_or_create_category(name):
    """Return the ``_id`` of the category document whose "name" is *name*.

    Looks the category up in ``category_collection``; when no document is
    found, a new ``{"name": name}`` document is inserted and its id returned.
    """
    existing = category_collection.find_one({"name": name})
    if existing:
        return existing["_id"]
    return category_collection.insert_one({"name": name}).inserted_id

# Replace the category names stored on each product with ObjectId references
# to documents in the Category collection. The try/finally closes the
# connection even when a query or update fails part-way through.
try:
    # Get distinct category names. Only string values are considered, so
    # running this cell again after the migration does not turn the already
    # stored ObjectIds into new category documents.
    master_categories = products_collection.distinct(
        "masterCategory", {"masterCategory": {"$type": "string"}}
    )
    sub_categories = products_collection.distinct(
        "subCategory", {"subCategory": {"$type": "string"}}
    )

    # Ensure all distinct categories are present in the Category collection
    master_category_ids = {name: get_or_create_category(name) for name in master_categories}
    sub_category_ids = {name: get_or_create_category(name) for name in sub_categories}

    # Update the products collection with ObjectId references. One
    # update_many per category name replaces the per-product round trips, and
    # products that lack a category field are left untouched instead of
    # raising KeyError.
    for field, ids_by_name in (
        ("masterCategory", master_category_ids),
        ("subCategory", sub_category_ids),
    ):
        for name, category_id in ids_by_name.items():
            products_collection.update_many(
                {field: name},
                {"$set": {field: category_id}},
            )
finally:
    # Close the connection
    client.close()
