### Imports

In [1]:
import numpy as np
import re, json
from matplotlib import pyplot as plt
from pymongo import MongoClient
from tqdm.notebook import tqdm
from pprint import PrettyPrinter
from typing import Union, Iterable
from bson.objectid import ObjectId

# Notebook-wide shorthand: bind the `pprint` method of a default-configured
# PrettyPrinter so cells can call `pprint(obj)` directly.
pprint = PrettyPrinter().pprint

In [2]:
# Connect to the local MongoDB instance and open the "cuda_snippets" database.
db = MongoClient("mongodb://localhost:27017").get_database("cuda_snippets")

# Collection handles used throughout the notebook.
train_db = db.get_collection("train")
validation_db = db.get_collection("validation")
repo_metadata_db = db.get_collection("repo_metadata")

### Get used repositories info

In [3]:
# Every distinct repo_metadata_id referenced by either split.
repo_metadata_id_set = (
    set(train_db.distinct("repo_metadata_id"))
    | set(validation_db.distinct("repo_metadata_id"))
)

print(f"Used repos: {len(repo_metadata_id_set)}")

# Selects snippets whose repo_metadata_id field is absent or explicitly null.
missing_metadata_filter = {
    "$or": [
        {"repo_metadata_id": {"$exists": False}},
        {"repo_metadata_id": None},
    ]
}

# Distinct repo names (across both splits) of the snippets selected above.
# Note that this is a count of repository names, not of individual kernels.
not_matched_kernels_repo_names = {
    repo_name
    for collection in (train_db, validation_db)
    for repo_name in collection.distinct("repo_name", missing_metadata_filter)
}

print(f"Not matched kernel count: {len(not_matched_kernels_repo_names)}")

Used repos: 6455
Not matched kernel count: 574


In [4]:
# Map str(_id) -> repo metadata document for every repo id used by the snippets.
#
# Null ids are skipped before conversion: ObjectId(None) does not raise, it
# generates a brand-new random id, which would silently put a meaningless
# value into the $in list.
repo_metadata_dict = {
    str(repo_metadata.get("_id")): repo_metadata
    for repo_metadata in repo_metadata_db.find(
        {"_id": {"$in": [ObjectId(idx) for idx in repo_metadata_id_set if idx is not None]}}
    )
}
print(f"Got {len(repo_metadata_dict)} repos")

Got 6454 repos


Find the ids that were not matched to a repo metadata document

In [15]:
# Print every referenced id for which no metadata document was fetched.
unmatched_ids = (idx for idx in repo_metadata_id_set if idx not in repo_metadata_dict)
for unmatched_id in unmatched_ids:
    print(unmatched_id)

None


## After filling the rest of the repo data

### Get the wanted repo info

In [None]:

# Distinct repo_metadata_id values referenced by the train and validation splits.
used_repos_ids = (
    set(train_db.distinct("repo_metadata_id"))
    | set(validation_db.distinct("repo_metadata_id"))
)
# Drop the null id if present; discard() is a no-op when it is absent.
used_repos_ids.discard(None)

In [None]:
# Fields fetched for each repository alongside its full name.
sort_criteria_keys = ["stargazer_count", "subscriber_count", "watcher_count"]

# Fetch the id, full name and the fields above for every used repository
# whose metadata document has status "READY".
repo_info = list(repo_metadata_db.aggregate([
    {"$match": {
        "status": "READY",
        "_id": {"$in": [ObjectId(idx) for idx in used_repos_ids]},
    }},
    # A pipeline stage must contain exactly one operator key, so `_id` belongs
    # inside the $project specification rather than next to it.
    {"$project": {
        "_id": 1,
        "full_name": "$full_name",
        **{key: f"${key}" for key in sort_criteria_keys},
    }},
]))

# repo_info is the matched subset; used_repos_ids is the total that was requested.
print(f"Found {len(repo_info)} out of the total of {len(used_repos_ids)}")

In [None]:
repo_score = {}
for repo in repo_info:
    