In [2]:
import numpy as np
import re, json
from matplotlib import pyplot as plt
from pymongo import MongoClient
from tqdm.notebook import tqdm
from pprint import PrettyPrinter
from typing import Union, Iterable

# Notebook-wide shorthand: `pprint(obj)` pretty-prints through one shared
# PrettyPrinter instance created with default settings.
_pretty_printer = PrettyPrinter()
pprint = _pretty_printer.pprint

In [3]:
# Handle on the "cuda_snippets" database of the MongoDB server at localhost:27017.
db = MongoClient("mongodb://localhost:27017")["cuda_snippets"]

# Collection handles used throughout the notebook, bound in one pass.
train_db, validation_db, repo_metadata_db, file_metadata_db = (
    db[collection_name]
    for collection_name in ("train", "validation", "repo_metadata", "file_metadata")
)

In [7]:
# Index the fields that later cells use as query filters:
# repo_metadata is looked up by "full_name", file_metadata is updated by "repo_name".
repo_metadata_db.create_index("full_name")

# Kept as the last expression of the cell so its return value (the index name)
# is what the notebook displays.
file_metadata_db.create_index("repo_name")

'repo_name_1'

In [3]:
# Load the repository archive metadata: a JSON object keyed by repository full
# name whose values are per-repo metadata dicts (later cells read their "status"
# and "files" entries).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open("../utils/repo_metadata.json", "r", encoding="utf-8") as fd:
    repo_archive_metadata_content = json.load(fd)

In [6]:
# Build the lookup  archive name -> repository full name.
# `already_present` counts archive names that show up for more than one
# repository; the first repository seen keeps the mapping.
repo_metadata_dict = {}
already_present = 0
for full_name, metadata in repo_archive_metadata_content.items():
    # Entries that carry a "status" value are skipped
    # (presumably failed/unavailable downloads -- TODO confirm).
    if metadata.get("status") is not None:
        continue

    # Key = first entry of "files" with its last character dropped
    # (presumably the archive's top-level directory with a trailing "/" -- TODO confirm).
    archive_name = metadata.get("files")[0][:-1]

    # The dict is keyed by archive name, so the duplicate check must test the
    # archive name. Testing `full_name` here never matches and lets colliding
    # archive names silently overwrite each other.
    if archive_name in repo_metadata_dict:
        already_present += 1
    else:
        repo_metadata_dict[archive_name] = full_name

print(f"Total repos: {len(repo_archive_metadata_content)}")
print(f"Mapped: {len(repo_metadata_dict)}")
print(f"Duplicated names: {already_present}")

mapped = 0
not_mapped = []

# repo full name -> stringified repo_metadata "_id" (None when no document exists).
# Many kernels share one repository, so each repository is looked up only once.
_repo_metadata_id_cache = {}


def _repo_metadata_id_for(full_name):
    """Return the repo_metadata ``_id`` (as ``str``) for ``full_name``, or ``None`` if absent."""
    if full_name not in _repo_metadata_id_cache:
        repo_metadata = repo_metadata_db.find_one({"full_name" : full_name}, {"_id" : 1})
        _repo_metadata_id_cache[full_name] = str(repo_metadata.get("_id")) if repo_metadata else None
    return _repo_metadata_id_cache[full_name]


def _link_kernels(collection, description):
    """Attach ``repo_metadata_id`` to every kernel of ``collection`` whose repo is known.

    Kernels that already carry a non-null ``repo_metadata_id`` are counted as
    mapped but not rewritten.

    Returns:
        ``(found, missing)``: the number of kernels whose ``repo_name`` is a key of
        ``repo_metadata_dict``, and the ``_id`` values (as ``str``) of the kernels
        whose ``repo_name`` is not.
    """
    found = 0
    missing = []
    # Only these fields (plus the implicit "_id") are read, so fetch nothing else.
    cursor = collection.find({}, {"repo_name" : 1, "repo_metadata_id" : 1})
    for kernel in tqdm(cursor, desc=description, total=collection.estimated_document_count()):
        repo_kernel_name = kernel.get("repo_name")
        if repo_kernel_name not in repo_metadata_dict:
            missing.append(str(kernel.get("_id")))
            continue

        found += 1
        if kernel.get("repo_metadata_id") is not None:
            continue

        # NOTE: when the repo_metadata document is missing, the field is written as
        # None, so the field then exists and an {"$exists": False} query will not
        # report that kernel.
        collection.update_one(
            {"_id" : kernel.get("_id")},
            {"$set" : {"repo_metadata_id" : _repo_metadata_id_for(repo_metadata_dict[repo_kernel_name])}},
        )
    return found, missing


# Iterate over train and validation collections
for _collection, _description in ((train_db, "train"), (validation_db, "validation")):
    _found, _missing = _link_kernels(_collection, _description)
    mapped += _found
    not_mapped.extend(_missing)

# Update file metadata
for archive_name, full_name in tqdm(repo_metadata_dict.items(), desc="file_metadata"):
    file_metadata_db.update_many(
        {"repo_name" : archive_name},
        {"$set" : {"repo_metadata_id" : _repo_metadata_id_for(full_name)}},
    )


total_kernels = mapped + len(not_mapped)
print(f"Found: {mapped}")
print(f"Not found : {len(not_mapped)}")
# Guard the ratio: with empty collections the denominator would be zero.
if total_kernels:
    print(f"Ratio: {mapped/total_kernels:.2%}")
else:
    print("Ratio: n/a (no kernels found)")

Total repos: 25130
Mapped: 13865
Duplicated names: 0


0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/13865 [00:00<?, ?it/s]

Found: 480791
Not found : 5623
Ratio: 98.84%


In [6]:
# Kernels (train + validation) that still have no "repo_metadata_id" field at all.
unmapped_filter = {"repo_metadata_id" : {"$exists" : False}}
not_mapped_kernels = [
    *train_db.find(unmapped_filter),
    *validation_db.find(unmapped_filter),
]

print(f"Got {len(not_mapped_kernels)} not mapped kernels")

# Distinct repository names those kernels belong to.
not_mapped_repos = {kernel.get("repo_name") for kernel in not_mapped_kernels}

print(f"Got {len(not_mapped_repos)} not mapped repos")
print(not_mapped_repos)


Got 36 not mapped kernels
Got 1 not mapped repos
{'tsne-cuda-main'}


In [7]:
def update_repo_dependencies(repo_name : str, repo_full_name : str) -> None:
    """Link every document of a repository archive to its repo_metadata entry.

    Looks up the repo_metadata document whose ``full_name`` equals
    ``repo_full_name`` and stores its ``_id`` (as a string) under
    ``repo_metadata_id`` on all file_metadata, train and validation documents
    whose ``repo_name`` equals ``repo_name``.

    Args:
        repo_name: Value of the ``repo_name`` field to match in the three collections.
        repo_full_name: Value of the ``full_name`` field to match in repo_metadata.

    Raises:
        ValueError: If no repo_metadata document has that ``full_name``. Nothing
            is written in that case.
    """
    repo_metadata = repo_metadata_db.find_one({"full_name" : repo_full_name})
    if repo_metadata is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError(
            f"No repo_metadata document with full_name={repo_full_name!r}; "
            f"cannot link repo_name={repo_name!r}"
        )

    update = {"$set" : {"repo_metadata_id" : str(repo_metadata.get("_id"))}}
    for collection in (file_metadata_db, train_db, validation_db):
        collection.update_many({"repo_name" : repo_name}, update)

# Hand-maintained mapping  archive name -> repository full name  for the
# repositories linked manually in this cell.
manual_addon = {
    "cuml-branch-23.04" : "rapidsai/cuml",
    "cupy-master" : "cupy/cupy",
    "cudf-branch-23.04" : "rapidsai/cudf",
    "thrust-main" : "NVIDIA/thrust",
    "cutlass-master" : "NVIDIA/cutlass",
    "alien-develop" : "chrxh/alien",
    "tsne-cuda-main" : "CannyLab/tsne-cuda"
}

for repo_name, repo_full_name in manual_addon.items():
    update_repo_dependencies(repo_name, repo_full_name)

# Re-count kernels that still have no "repo_metadata_id" field in either collection.
unmapped_filter = {"repo_metadata_id" : {"$exists" : False}}
not_mapped_kernels = sum(
    collection.count_documents(unmapped_filter)
    for collection in (train_db, validation_db)
)

print(f"Not mapped after addon: {not_mapped_kernels}")


Not mapped after addon: 0
