In [2]:
%pip install pymongo numpy pandas ipykernel

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import json
import numpy as np
import pandas as pd
from pymongo import MongoClient

Helper functions for loading the data into MongoDB

In [4]:

#mongo db config
def get_mongo_client(uri="mongodb://127.0.0.1:27017/"):
    """Build a MongoClient for the given connection URI.

    Parameters:
        uri (str): MongoDB connection URI; the default points at 127.0.0.1:27017.

    Returns:
        The MongoClient constructed from ``uri``.
    """
    mongo_client = MongoClient(uri)
    return mongo_client

def get_database(client, db_name="admin"):
    """Return the database called ``db_name`` from ``client``.

    Parameters:
        client: Object supporting ``client[name]`` lookup (a MongoClient in this notebook)
        db_name (str): Name of the database; defaults to "admin"

    Returns:
        Whatever ``client[db_name]`` evaluates to.
    """
    database = client[db_name]
    return database

def get_collection(db, collection_name):
    """Return the collection called ``collection_name`` from ``db``.

    Parameters:
        db: Object supporting ``db[name]`` lookup (a pymongo database in this notebook)
        collection_name (str): Name of the collection

    Returns:
        Whatever ``db[collection_name]`` evaluates to.
    """
    collection = db[collection_name]
    return collection

In [5]:
# Root folder of the extracted data
BASE_DIR = "Data"

# MongoDB collection names, kept as an unordered set.
# NOTE(review): neither constant is referenced by the cells visible in this
# notebook, and the loading cells use additional collection names - confirm
# whether this set is still meant to be the authoritative list.
Collections = {
    'Benchmark MD',
    'LLM overall info',
    'Licenses MD',
    'Organizations',
    'Providers',
}

In [6]:
def load_json(path):
    """Read the file at ``path`` as UTF-8 text and return the parsed JSON value."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [7]:
def load_single_json_to_mongo(mongo_uri, db_name, collection_name, folder_dir, id_col):
    """
    Load every ``*.json`` file in a folder into MongoDB, one document per file.

    Each file is expected to hold a single JSON object. The object is upserted
    with ``id_col`` as the match key, so re-running the load updates existing
    documents instead of duplicating them. Files whose content is not an
    object, or that lack ``id_col``, are reported and skipped.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        folder_dir (str): Path to the folder containing JSON files
        id_col (str): Key in each JSON object that uniquely identifies it
    """
    # The context manager closes the client's connections once the load ends,
    # even if a file fails to parse.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Reading from: {folder_dir}")
        count = 0

        for file in os.listdir(folder_dir):
            if not file.endswith(".json"):
                continue

            json_path = os.path.join(folder_dir, file)
            data = load_json(json_path)

            # Same policy as load_llm_models: report and skip instead of
            # aborting the whole load with a KeyError/TypeError.
            if not isinstance(data, dict) or id_col not in data:
                print(f"Missing '{id_col}' in {file}, skipping...")
                continue

            collection.update_one(
                {id_col: data[id_col]},  # avoid duplicates
                {"$set": data},
                upsert=True
            )

            print(f"Inserted: {file}")
            count += 1

    print(f"\n{count} json files loaded into '{collection_name}' collection.")



In [8]:
def load_many_json_to_mongo(mongo_uri, db_name, collection_name, folder_dir, id_col):
    """
    Load JSON files that each contain a list of objects from a folder into MongoDB.

    Only files directly inside ``folder_dir`` are read (no recursion). Every
    list item is upserted with ``id_col`` as the match key. Files that are not
    lists, and items without ``id_col``, are reported and skipped.

    NOTE(review): a later cell in this notebook defines a function with the
    same name; when the cells run in order that definition replaces this one.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        folder_dir (str): Path to the folder containing JSON files
        id_col (str): Key in each list item that uniquely identifies it
    """
    # The context manager closes the client's connections once the load ends.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Reading from: {folder_dir}")
        count = 0

        for file in os.listdir(folder_dir):
            if not file.endswith(".json"):
                continue

            json_path = os.path.join(folder_dir, file)
            data_list = load_json(json_path)  # this is a list []

            if not isinstance(data_list, list):
                print(f"{file} is not a list, skipping...")
                continue

            for item in data_list:
                # Report and skip malformed items instead of raising KeyError.
                if not isinstance(item, dict) or id_col not in item:
                    print(f"Missing '{id_col}' in item inside {file}, skipping...")
                    continue

                collection.update_one(
                    {id_col: item[id_col]},
                    {"$set": item},
                    upsert=True
                )
                count += 1

    # count is incremented per list item, so report records rather than files.
    print(f"\n{count} records loaded into '{collection_name}' collection.")



In [9]:
def load_child_folder_json(mongo_uri, db_name, collection_name, root_dir, child_name, id_col):
    """
    Load one JSON file from each child folder inside root_dir.

    For every sub-directory of ``root_dir`` the file ``<child>/<child_name>``
    is read and upserted with ``id_col`` as the match key. Sub-directories
    without that file, and files without ``id_col``, are reported and skipped.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        root_dir (str): Folder whose immediate sub-folders are scanned
        child_name (str): File name to load from each sub-folder
        id_col (str): Key in each JSON object that uniquely identifies it
    """
    # The context manager closes the client's connections once the load ends.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Scanning root directory: {root_dir}")
        count = 0

        for child in os.listdir(root_dir):
            child_path = os.path.join(root_dir, child)

            if not os.path.isdir(child_path):
                continue

            child_file = os.path.join(child_path, child_name)

            if not os.path.exists(child_file):
                print(f"No {child_name} in: {child_path}")
                continue

            with open(child_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Report and skip malformed files instead of raising KeyError.
            if not isinstance(data, dict) or id_col not in data:
                print(f"Missing '{id_col}' in {child_file}, skipping...")
                continue

            # Insert based on its unique ID
            collection.update_one(
                {id_col: data[id_col]},
                {"$set": data},
                upsert=True
            )

            print(f"Inserted: {child}")
            count += 1

    # Report the file name that was loaded. The loop variable `child` used
    # here before held only the last folder scanned, and was undefined when
    # root_dir was empty.
    print(f"\nLoaded {count} {child_name} files into '{collection_name}'")


In [10]:
def _iter_benchmark_files(organizations_dir):
    """Yield the path of every organizations/<org_id>/models/<model_id>/benchmarks.json found."""
    for org_folder in os.listdir(organizations_dir):
        org_path = os.path.join(organizations_dir, org_folder)
        if not os.path.isdir(org_path):
            continue

        models_dir = os.path.join(org_path, "models")
        # isdir (not exists) so a stray file named "models" is reported
        # instead of making os.listdir raise.
        if not os.path.isdir(models_dir):
            print(f"No models folder in: {org_folder}")
            continue

        for model_folder in os.listdir(models_dir):
            model_path = os.path.join(models_dir, model_folder)
            if not os.path.isdir(model_path):
                continue

            json_file = os.path.join(model_path, "benchmarks.json")
            if not os.path.exists(json_file):
                print(f"No benchmarks.json in: {model_path}")
                continue

            yield json_file


def load_llm_models(mongo_uri, db_name, collection_name, organizations_dir, id_col):
    """
    Load benchmarks.json (a list of objects) from:
        organizations/<org_id>/models/<model_id>/benchmarks.json

    Every list item is upserted with ``id_col`` as the match key. Files that
    are not lists, and items without ``id_col``, are reported and skipped.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        organizations_dir (str): Path to the "organizations" folder
        id_col (str): Key in each benchmark record that uniquely identifies it
    """
    # The context manager closes the client's connections once the load ends.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Scanning organizations directory: {organizations_dir}")
        total_items = 0
        total_files = 0

        for json_file in _iter_benchmark_files(organizations_dir):
            # Load list JSON
            data_list = load_json(json_file)

            if not isinstance(data_list, list):
                print(f"{json_file} is not a list, skipping...")
                continue

            total_files += 1

            # Insert each item
            for item in data_list:
                # The isinstance check keeps the `in` test from failing on
                # items that are not JSON objects.
                if not isinstance(item, dict) or id_col not in item:
                    print(f"Missing '{id_col}' in item inside {json_file}, skipping...")
                    continue

                # Upsert each benchmark record
                collection.update_one(
                    {id_col: item[id_col]},
                    {"$set": item},
                    upsert=True
                )

                total_items += 1

    print(f"\nLoaded {total_items} benchmark records from {total_files} files into '{collection_name}'")


In [7]:
# Name of the MongoDB database used by the loading cells
DB_NAME = 'admin2'
# Connection URI for the MongoDB server on 127.0.0.1; per the original note
# this matches the value in .env
MONGO_URI = f'mongodb://127.0.0.1:27017/{DB_NAME}'

In [12]:
from pathlib import Path
# Local checkout of the llm-leaderboard-main repository. Set the
# LLM_LEADERBOARD_ROOT environment variable to point at a checkout elsewhere;
# without it the original machine-specific path is used.
ROOT_DIR = Path(
    os.environ.get(
        "LLM_LEADERBOARD_ROOT",
        r"/Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main",
    )
)
# Some releases place metadata under a top-level 'data' folder
BASE_DATA_DIR = ROOT_DIR / "data" if (ROOT_DIR / "data").exists() else ROOT_DIR

LICENSE_DIR = BASE_DATA_DIR / "licenses"
BENCHMARK_DIR = BASE_DATA_DIR / "benchmarks"
# Despite the _DIR suffix this is the path of a CSV file (it is passed to
# pd.read_csv). It sits next to the checkout, so derive it from ROOT_DIR:
# joining an absolute path with "/" discards the left-hand side, which made
# the previous `ROOT_DIR / "/Users/..."` expression misleading.
OVERALL_INFO_DIR = ROOT_DIR.parent / "llm_comparison_dataset.csv"
ORGANIZATIONS_DIR = BASE_DATA_DIR / "organizations"
PROVIDERS_DIR = BASE_DATA_DIR / "providers"


In [13]:
# Load benchmark metadata: each JSON file in BENCHMARK_DIR is upserted on 'benchmark_id'
COLLECTION_NAME = 'Benchmark MD'
# Path.is_dir() is already False for a missing path, so one check covers both cases
if not BENCHMARK_DIR.is_dir():
    print(f"BENCHMARK_DIR not found: {BENCHMARK_DIR}. Please verify ROOT_DIR or clone the llm-leaderboard-main repository.")
else:
    load_single_json_to_mongo(
        mongo_uri=MONGO_URI,
        db_name=DB_NAME,
        collection_name=COLLECTION_NAME,
        folder_dir=BENCHMARK_DIR,
        id_col='benchmark_id',
    )

Reading from: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/benchmarks
Inserted: vqav2.json
Inserted: humaneval-plus.json
Inserted: mmbench-video.json
Inserted: multipl-e.json
Inserted: graphwalks-bfs-<128k.json
Inserted: repobench.json
Inserted: theoremqa.json
Inserted: qasper.json
Inserted: crperelation.json
Inserted: scienceqa-visual.json
Inserted: xstest.json
Inserted: open-rewrite.json
Inserted: gsm8k-chat.json
Inserted: tau-bench-retail.json
Inserted: odinw.json
Inserted: openbookqa.json
Inserted: openai-mrcr%3A-2-needle-256k.json
Inserted: swe-lancer.json
Inserted: mathvista.json
Inserted: mmvetgpt4turbo.json
Inserted: codegolf-v2.2.json
Inserted: livebench-20241125.json
Inserted: mmt-bench.json
Inserted: meld.json
Inserted: androidworld-sr.json
Inserted: omnimath.json
Inserted: mmmlu.json
Inserted: popqa.json
Inserted: mmmu-(val).json
Inserted: egoschema.json
Inserted: swe-bench-multilingual.json
Inserted: writingbench.json
Inserted: erqa.json
Inserted: comp

In [14]:
# Load license metadata: each JSON file in LICENSE_DIR is upserted on 'license_id'
COLLECTION_NAME = 'Licenses MD'
load_single_json_to_mongo(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    folder_dir=LICENSE_DIR,
    id_col='license_id',
)

Reading from: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/licenses
Inserted: mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use.json
Inserted: tongyi_qianwen.json
Inserted: llama_3_2_community_license.json
Inserted: llama_3_1_community_license.json
Inserted: gemma.json
Inserted: mnpl_0_1.json
Inserted: creative_commons_attribution_4_0_license.json
Inserted: llama_4_community_license_agreement.json
Inserted: jamba_open_model_license.json
Inserted: qwen.json
Inserted: mit_license.json
Inserted: cc_by_nc.json
Inserted: llama_3_3_community_license_agreement.json
Inserted: unknown.json
Inserted: mit.json
Inserted: deepseek.json
Inserted: mistral_research_license.json
Inserted: mit_+_model_license_(commercial_use_allowed).json
Inserted: health_ai_developer_foundations_terms_of_use.json
Inserted: llama3_2.json
Inserted: proprietary.json
Inserted: apache_2_0.json
Inserted: modified_mit_license.json

23 json files loaded into 'Licens

In [15]:
def load_many_json_to_mongo(mongo_uri, db_name, collection_name, folder_dir, id_col):
    """
    Walk ``folder_dir`` recursively and load every list-shaped JSON file into MongoDB.

    Every ``*.json`` file under ``folder_dir`` except those named
    ``model.json`` is parsed. Files whose top-level value is a list have each
    item upserted with ``id_col`` as the match key; other files are reported
    and skipped, as are items without ``id_col``.

    Parameters:
        mongo_uri (str): MongoDB connection URI
        db_name (str): Name of the database
        collection_name (str): Name of the collection to insert into
        folder_dir (str): Path to the folder tree containing JSON files
        id_col (str): Key in each list item that uniquely identifies it
    """
    # The context manager closes the client's connections once the load ends.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][collection_name]

        print(f"Reading from: {folder_dir}")
        count = 0

        for root, _, files in os.walk(folder_dir):
            for file in files:
                # os.walk also returns non-JSON files; parsing those would
                # raise and abort the whole load.
                if not file.endswith(".json"):
                    continue

                # NOTE(review): the original test was
                # `not file != "model.json"`, i.e. model.json files are the
                # ones skipped. The behaviour is kept but written without the
                # double negative - confirm that skipping (rather than
                # selecting) model.json is what was intended.
                if file == "model.json":
                    continue

                json_path = os.path.join(root, file)
                data_list = load_json(json_path)  # this is a list []

                if not isinstance(data_list, list):
                    print(f"{file} is not a list, skipping...")
                    continue

                for item in data_list:
                    # Report and skip malformed items instead of raising KeyError.
                    if not isinstance(item, dict) or id_col not in item:
                        print(f"Missing '{id_col}' in item inside {json_path}, skipping...")
                        continue

                    collection.update_one(
                        {id_col: item[id_col]},
                        {"$set": item},
                        upsert=True
                    )
                    count += 1

    # count is incremented per list item, so report records rather than files.
    print(f"\n{count} records loaded into '{collection_name}' collection.")

In [16]:
#LOAD LLM OVERALL INFO
# Read the comparison CSV and store one MongoDB document per row.
COLLECTION_NAME = 'LLM overall info'
df = pd.read_csv(OVERALL_INFO_DIR)

data_dict = df.to_dict(orient='records')

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]


if data_dict:
    # A bare insert_many appends a full extra copy of the CSV each time this
    # cell is re-run. No unique key column is known for the rows (TODO
    # confirm), so they cannot be upserted like the JSON loaders do; instead
    # the collection is replaced with the current CSV contents.
    removed = collection.delete_many({}).deleted_count
    if removed:
        print(f"Removed {removed} existing records from '{COLLECTION_NAME}' before reloading.")
    collection.insert_many(data_dict)
    print(f"Inserted {len(data_dict)} records into '{COLLECTION_NAME}' collection.")
else:
    print("CSV file is empty. No data inserted.")

Inserted 200 records into 'LLM overall info' collection.


In [17]:
# Load the list-shaped JSON files under ORGANIZATIONS_DIR, upserting each item on 'model_id'
# NOTE(review): the recorded output reports 2167 items, the same number as the
# 'LLM Performance' load, and upserting on 'model_id' keeps a single document
# per model_id - confirm that the intended source files are being read.
COLLECTION_NAME = 'LLMs in Organizations'
load_many_json_to_mongo(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    folder_dir=ORGANIZATIONS_DIR,
    id_col='model_id',
)

Reading from: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/organizations
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...
organization.json is not a list, skipping...

2167 json files loaded into 'LLMs in Organizations' collection.


In [18]:
# Load each model's benchmark scores from
# organizations/<org_id>/models/<model_id>/benchmarks.json, upserting on 'model_benchmark_id'
COLLECTION_NAME = 'LLM Performance'
load_llm_models(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    organizations_dir=ORGANIZATIONS_DIR,
    id_col='model_benchmark_id',
)

Scanning organizations directory: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/organizations
No models folder in: unknown
No benchmarks.json in: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/organizations/mistral/models/mistral-small-2409
No benchmarks.json in: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/organizations/openai/models/o3-pro-2025-06-10

Loaded 2167 benchmark records from 157 files into 'LLM Performance'


In [19]:
# Load the list-shaped JSON files under PROVIDERS_DIR, upserting each item on 'model_id'
COLLECTION_NAME = 'LLMs in Providers'
load_many_json_to_mongo(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    folder_dir=PROVIDERS_DIR,
    id_col='model_id',
)

Reading from: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/providers
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...
provider.json is not a list, skipping...

232 json files loaded into 'LLMs in Providers' collection.


In [20]:
# Load provider metadata: the provider.json of each provider folder, upserted on 'provider_id'
COLLECTION_NAME = 'Providers MD'
load_child_folder_json(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    root_dir=PROVIDERS_DIR,
    child_name='provider.json',
    id_col='provider_id',
)

Scanning root directory: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/providers
Inserted: cohere
Inserted: sambanova
Inserted: together
Inserted: azure
Inserted: hyperbolic
Inserted: deepinfra
Inserted: google
Inserted: fireworks
Inserted: groq
Inserted: novita
Inserted: deepseek
Inserted: xai
Inserted: bedrock
Inserted: lambda
Inserted: replicate
Inserted: anthropic
Inserted: cerebras
Inserted: zeroeval
Inserted: mistral
Inserted: openai

Loaded 20 openai files into 'Providers MD'


In [21]:
# Load organization metadata: the organization.json of each organization folder,
# upserted on 'organization_id'
COLLECTION_NAME = 'Organizations MD'
load_child_folder_json(
    mongo_uri=MONGO_URI,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    root_dir=ORGANIZATIONS_DIR,
    child_name='organization.json',
    id_col='organization_id',
)

Scanning root directory: /Users/vothao/a-story-of-LLMs-evolution/llm-leaderboard-main/data/organizations
Inserted: cohere
Inserted: moonshotai
Inserted: amazon
Inserted: google
Inserted: microsoft
Inserted: meta
Inserted: qwen
Inserted: deepseek
Inserted: xai
Inserted: ibm
Inserted: zai-org
Inserted: nvidia
Inserted: anthropic
Inserted: unknown
Inserted: ai21
Inserted: mistral
Inserted: openai

Loaded 17 openai files into 'Organizations MD'


In [None]:
# MERGE ORGANIZATION & PROVIDER DATA WITH AVERAGE BENCHMARK SCORE
# Builds one document per model_id: the organization-side record extended with
# a provider_id, plus provider-only models, each with avg_benchmark_score.
from pymongo import MongoClient

# Reuse the notebook-wide connection settings rather than a second hard-coded
# URI and database name.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]

# The loading cells write 'LLMs in Organizations' and 'LLMs in Providers'; the
# 'LLMs wrt ...' names used here before are not written by any cell in this
# notebook, so the merge would have read empty collections.
org_coll = db["LLMs in Organizations"]
prov_coll = db["LLMs in Providers"]
perf_coll = db["LLM Performance"]
merged_coll = db["LLM Merged Organization and Provider"]

# Step 1: Left join Organization -> Provider
pipeline_org = [
    {
        "$lookup": {
            "from": prov_coll.name,  # keep the join target in sync with prov_coll
            "localField": "model_id",
            "foreignField": "model_id",
            "as": "provider_data"
        }
    },
    {
        # Take the provider_id of the first matching provider document
        "$addFields": {
            "provider_id": {"$arrayElemAt": ["$provider_data.provider_id", 0]}
        }
    },
    {
        "$project": {"provider_data": 0}  # remove temp array
    }
]

org_docs = list(org_coll.aggregate(pipeline_org))

# Step 2: Get provider-only model_ids (not in Organization)
org_model_ids = org_coll.distinct("model_id")
prov_only_docs = list(prov_coll.find({"model_id": {"$nin": org_model_ids}}))

# Step 3: Merge provider-only docs
# One organization document serves as the template for the field names.
org_sample_fields = org_coll.find_one() or {}
org_field_keys = [k for k in org_sample_fields if k != "_id"]

for doc in prov_only_docs:
    merged_doc = {
        **dict.fromkeys(org_field_keys),  # all org fields as None
        **doc  # overwrite model_id and provider_id from provider
    }
    org_docs.append(merged_doc)

# Step 4: Compute avg_benchmark_score for each doc
# A single $group replaces one query per model. $avg ignores missing and
# non-numeric scores and yields null when a model has no numeric score.
avg_score_by_model = {
    row["_id"]: row["avg_score"]
    for row in perf_coll.aggregate([
        {"$group": {"_id": "$model_id", "avg_score": {"$avg": "$normalized_score"}}}
    ])
}

for doc in org_docs:
    # Models without any performance record get None
    doc["avg_benchmark_score"] = avg_score_by_model.get(doc["model_id"])

# Step 5: Insert/update merged collection
for doc in org_docs:
    # _id is immutable in MongoDB: carrying the source collection's _id into
    # $set fails if the merged document already exists under a different _id,
    # so the merged collection is left to assign its own.
    merged_fields = {k: v for k, v in doc.items() if k != "_id"}
    merged_coll.update_one(
        {"model_id": doc["model_id"]},
        {"$set": merged_fields},
        upsert=True
    )

# The recorded cell output shows this message, but no statement printed it.
print("Merged Organization & Provider with avg_benchmark_score successfully!")



Merged Organization & Provider with avg_benchmark_score successfully!


In [None]:
#change release_date from string to datetime
from datetime import datetime

# Only documents whose release_date is still stored as a string are visited,
# and only that field (plus _id) is fetched.
for doc in merged_coll.find({"release_date": {"$type": "string"}}, {"release_date": 1}):
    release_str = doc.get("release_date")
    if not release_str:
        continue  # empty string: nothing to parse

    try:
        # Convert string to datetime object; a trailing "Z" is rewritten as an
        # explicit UTC offset before parsing.
        release_dt = datetime.fromisoformat(release_str.replace("Z", "+00:00"))
    except ValueError as e:
        # Only parsing problems are reported here. Database errors from the
        # update below now propagate instead of being printed as
        # "Failed to convert".
        print(f"Failed to convert for {doc['_id']}: {e}")
        continue

    merged_coll.update_one(
        {"_id": doc["_id"]},
        {"$set": {"release_date": release_dt}}
    )

['admin', 'admin2', 'config', 'local']
