In [None]:
!pip install pymongo

In [None]:
!pip install numpy
!pip install pandas

In [None]:
import os
import json
import numpy as np
import pandas as pd
from pymongo import MongoClient

Helper functions for loading data into MongoDB

In [None]:

# MongoDB connection helpers
def get_mongo_client(uri="mongodb://127.0.0.1:27017/"):
    """Build a ``MongoClient`` for the given connection URI.

    Parameters:
        uri (str): MongoDB connection URI. Defaults to 127.0.0.1 on port 27017.

    Returns:
        The client object constructed from ``uri``.
    """
    mongo_client = MongoClient(uri)
    return mongo_client

def get_database(client, db_name="admin"):
    """Look up a database on ``client`` by name.

    Parameters:
        client: Object that supports ``client[db_name]`` lookup.
        db_name (str): Name of the database. Defaults to "admin".

    Returns:
        Whatever ``client[db_name]`` evaluates to.
    """
    database = client[db_name]
    return database

def get_collection(db, collection_name):
    """Look up a collection on ``db`` by name.

    Parameters:
        db: Object that supports ``db[collection_name]`` lookup.
        collection_name (str): Name of the collection.

    Returns:
        Whatever ``db[collection_name]`` evaluates to.
    """
    collection = db[collection_name]
    return collection

In [None]:
# Name of the root folder that holds the extracted data.
BASE_DIR = "Data"

# MongoDB collection names, stored as an unordered set.
# NOTE(review): the loading cells visible in this notebook pass their own
# collection-name strings instead of reading this set -- confirm it is still needed.
Collections = {
    'Benchmark MD',
    'LLM overall info',
    'Licenses MD',
    'Organizations',
    'Providers',
}

In [None]:
def load_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content.

    Parameters:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

In [None]:
def load_single_json_to_mongo(mongo_uri, db_name, collection_name, folder_dir, id_col):
    """
    Upsert every ``.json`` file in a folder into MongoDB, one document per file.

    Each file's parsed content is written with ``update_one(..., upsert=True)``
    using ``{id_col: data[id_col]}`` as the filter, so loading the same folder
    again updates the matching documents instead of adding duplicates.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        folder_dir (str): Path to the folder containing JSON files
        id_col (str): Key in each JSON object whose value identifies the document

    Raises:
        KeyError: If a loaded JSON object has no ``id_col`` key.
    """
    count = 0

    # The context manager closes the client even when a file fails to load,
    # so the connection is not left open after the call.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Reading from: {folder_dir}")

        for file in os.listdir(folder_dir):
            # Only files with a .json extension are loaded.
            if not file.endswith(".json"):
                continue

            data = load_json(os.path.join(folder_dir, file))

            collection.update_one(
                {id_col: data[id_col]},  # match on the ID to avoid duplicates
                {"$set": data},
                upsert=True
            )

            print(f"Inserted: {file}")
            count += 1

    print(f"\n{count} json files loaded into '{collection_name}' collection.")



In [None]:
def load_many_json_to_mongo(mongo_uri, db_name, collection_name, folder_dir, id_col):
    """
    Upsert the records of every list-valued ``.json`` file in a folder into MongoDB.

    Each file is expected to contain a JSON list; files whose top-level value
    is not a list are reported and skipped. Every list item is written with
    ``update_one(..., upsert=True)`` using ``{id_col: item[id_col]}`` as the filter.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        folder_dir (str): Path to the folder containing JSON files
        id_col (str): Key in each list item whose value identifies the document

    Raises:
        KeyError: If a list item has no ``id_col`` key.
    """
    count = 0        # number of records upserted
    file_count = 0   # number of list files that were processed

    # The context manager closes the client even when a file fails to load.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Reading from: {folder_dir}")

        for file in os.listdir(folder_dir):
            if not file.endswith(".json"):
                continue

            data_list = load_json(os.path.join(folder_dir, file))

            if not isinstance(data_list, list):
                print(f"{file} is not a list, skipping...")
                continue

            file_count += 1

            for item in data_list:
                collection.update_one(
                    {id_col: item[id_col]},
                    {"$set": item},
                    upsert=True
                )
                count += 1

    # Report records and files separately: ``count`` is incremented per record, not per file.
    print(f"\n{count} records from {file_count} json files loaded into '{collection_name}' collection.")



In [None]:
def load_child_folder_json(mongo_uri, db_name, collection_name, root_dir, child_name, id_col):
    """
    Upsert one JSON file from each child folder of ``root_dir`` into MongoDB.

    For every sub-directory of ``root_dir`` the file ``<sub-directory>/<child_name>``
    is read and written with ``update_one(..., upsert=True)`` using
    ``{id_col: data[id_col]}`` as the filter. Entries of ``root_dir`` that are
    not directories are ignored; folders without ``child_name`` are reported
    and skipped.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        root_dir (str): Path to the folder whose child folders are scanned
        child_name (str): File name to look for inside each child folder
        id_col (str): Key in each JSON object whose value identifies the document

    Raises:
        KeyError: If a loaded JSON object has no ``id_col`` key.
    """
    count = 0

    # The context manager closes the client even when a file fails to load.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Scanning root directory: {root_dir}")

        for child in os.listdir(root_dir):
            child_path = os.path.join(root_dir, child)

            if not os.path.isdir(child_path):
                continue

            child_file = os.path.join(child_path, child_name)

            if not os.path.exists(child_file):
                print(f"No {child_name} in: {child_path}")
                continue

            with open(child_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Upsert based on its unique ID
            collection.update_one(
                {id_col: data[id_col]},
                {"$set": data},
                upsert=True
            )

            print(f"Inserted: {child}")
            count += 1

    # Report the file name that was searched for. The loop variable ``child`` is
    # unbound when ``root_dir`` is empty and otherwise only holds the last entry.
    print(f"\nLoaded {count} {child_name} files into '{collection_name}'")


In [None]:
def load_llm_models(mongo_uri, db_name, collection_name, organizations_dir, id_col):
    """
    Upsert benchmark records found under an organizations directory into MongoDB.

    Loads benchmarks.json (a list of objects) from:
        organizations/<org_id>/models/<model_id>/benchmarks.json

    Organization folders without a ``models`` folder, model folders without a
    ``benchmarks.json``, files whose top-level value is not a list, and list
    items without ``id_col`` are reported and skipped.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        organizations_dir (str): Path to the folder containing one folder per organization
        id_col (str): Key in each list item whose value identifies the document
    """
    total_items = 0
    total_files = 0

    # The context manager closes the client even when a file fails to load.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Scanning organizations directory: {organizations_dir}")

        for org_folder in os.listdir(organizations_dir):
            org_path = os.path.join(organizations_dir, org_folder)
            if not os.path.isdir(org_path):
                continue

            models_dir = os.path.join(org_path, "models")
            if not os.path.exists(models_dir):
                print(f"No models folder in: {org_folder}")
                continue

            for model_folder in os.listdir(models_dir):
                model_path = os.path.join(models_dir, model_folder)
                if not os.path.isdir(model_path):
                    continue

                json_file = os.path.join(model_path, "benchmarks.json")
                if not os.path.exists(json_file):
                    print(f"No benchmarks.json in: {model_path}")
                    continue

                # Load list JSON
                data_list = load_json(json_file)

                if not isinstance(data_list, list):
                    print(f"{json_file} is not a list, skipping...")
                    continue

                total_files += 1

                for item in data_list:
                    if id_col not in item:
                        print(f"Missing '{id_col}' in item inside {json_file}, skipping...")
                        continue

                    # Upsert each benchmark record, keyed on its ID
                    collection.update_one(
                        {id_col: item[id_col]},
                        {"$set": item},
                        upsert=True
                    )

                    total_items += 1

    print(f"\nLoaded {total_items} benchmark records from {total_files} files into '{collection_name}'")


In [None]:
# Name of the target MongoDB database.
DB_NAME = 'admin2'

# Connection URI for the local server, ending in the database name.
# Per the original note, this should match the value in the .env file.
MONGO_URI = f'mongodb://127.0.0.1:27017/{DB_NAME}'

In [None]:
from pathlib import Path

# Root of the extracted data folder. Set the LLM_DATA_ROOT environment variable
# to run the notebook against another location; when it is unset, the original
# machine-specific path is used, so existing behaviour is unchanged.
ROOT_DIR = Path(os.environ.get("LLM_DATA_ROOT", r"C:\Users\nguye\OneDrive\Desktop\Data"))

# Sub-folders (and one CSV file) located directly under ROOT_DIR.
LICENSE_DIR = ROOT_DIR / "licenses"
BENCHMARK_DIR = ROOT_DIR / "benchmarks"
OVERALL_INFO_DIR = ROOT_DIR / "llm_comparison_dataset.csv"  # a CSV file path, despite the _DIR suffix
ORGANIZATIONS_DIR = ROOT_DIR / "organizations"
PROVIDERS_DIR = ROOT_DIR / "providers"



In [None]:
# Load benchmark metadata: one document per JSON file in BENCHMARK_DIR, keyed on 'benchmark_id'.
COLLECTION_NAME = 'Benchmark MD'
load_single_json_to_mongo(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    folder_dir=BENCHMARK_DIR,
    id_col='benchmark_id',
)

In [None]:
# Load license metadata: one document per JSON file in LICENSE_DIR, keyed on 'license_id'.
COLLECTION_NAME = 'Licenses MD'
load_single_json_to_mongo(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    folder_dir=LICENSE_DIR,
    id_col='license_id',
)

In [None]:
# Load the LLM overall-info CSV: every CSV row becomes one document.
COLLECTION_NAME = 'LLM overall info'

# Read the CSV and turn it into a list with one dict per row.
df = pd.read_csv(OVERALL_INFO_DIR)
data_dict = df.to_dict(orient='records')

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

# NOTE(review): insert_many adds the rows again on every run of this cell,
# unlike the upsert-based loader functions -- confirm re-runs are acceptable.
if not data_dict:
    print("CSV file is empty. No data inserted.")
else:
    collection.insert_many(data_dict)
    print(f"Inserted {len(data_dict)} records into '{COLLECTION_NAME}' collection.")

In [None]:
# Load the models listed per organization: list-valued JSON files in ORGANIZATIONS_DIR, keyed on 'model_id'.
COLLECTION_NAME = 'LLMs in Organizations'
load_many_json_to_mongo(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    folder_dir=ORGANIZATIONS_DIR,
    id_col='model_id',
)

In [None]:
# Load per-benchmark LLM scores from each model's benchmarks.json, keyed on 'model_benchmark_id'.
COLLECTION_NAME = 'LLM Performance'
load_llm_models(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    organizations_dir=ORGANIZATIONS_DIR,
    id_col='model_benchmark_id',
)

In [None]:
# Load the models listed per provider: list-valued JSON files in PROVIDERS_DIR, keyed on 'model_id'.
# load_many_json_to_mongo takes five arguments; the original call passed a sixth
# ('provider.json') and therefore raised TypeError before loading anything.
# NOTE(review): the dropped 'provider.json' argument suggests a per-child-folder
# load may have been intended -- confirm against the layout of the providers folder.
COLLECTION_NAME = 'LLMs in Providers'
load_many_json_to_mongo(MONGO_URI, DB_NAME, COLLECTION_NAME, PROVIDERS_DIR, 'model_id')

In [None]:
# Load provider metadata: the provider.json of each child folder in PROVIDERS_DIR, keyed on 'provider_id'.
COLLECTION_NAME = 'Providers MD'
load_child_folder_json(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    root_dir=PROVIDERS_DIR,
    child_name='provider.json',
    id_col='provider_id',
)

In [None]:
# Load organization metadata: the organization.json of each child folder in ORGANIZATIONS_DIR, keyed on 'organization_id'.
COLLECTION_NAME = 'Organizations MD'
load_child_folder_json(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    root_dir=ORGANIZATIONS_DIR,
    child_name='organization.json',
    id_col='organization_id',
)