In [None]:
from pathlib import Path
from tqdm.auto import tqdm
import shutil
import pandas as pd
import ray
from toolkit_run.ray.server import LabRayToolkitServer
import json

In [None]:
# Create the project's Ray toolkit server handle used by the cells below.
server = LabRayToolkitServer()
# Last expression of the cell, so the notebook displays the dashboard URL.
server.dashboard_url

In [None]:
# Scale the cluster to 60 (presumably the number of worker nodes -- confirm
# against LabRayToolkitServer.scale_cluster).
server.scale_cluster(60)

## Repo count and total file size for Java

In [None]:
# One entry per item under <metadata root>/data; each entry is used below as a
# bucket directory that contains `fi.parquet`.
paths = list(Path('/data/hf_repos/the-stack-metadata').glob('data/*'))

In [None]:
@ray.remote(scheduling_strategy="SPREAD")
def get_java_repos_stats(bucket_path):
    """Summarise the non-deleted Java files listed in one metadata bucket.

    NOTE: later cells of this notebook redefine this name with different
    signatures; run the cells in order so the intended version is active.

    Args:
        bucket_path: Path-like object supporting ``/`` (callers pass a
            ``pathlib.Path``) that points at a bucket directory containing
            ``fi.parquet``.

    Returns:
        ``(repo_count, total_size)``: the number of distinct ``ri_id`` values
        and the sum of ``size`` over rows where ``lang_ex == 'Java'`` and
        ``is_deleted`` is False.
    """
    # Load only the four columns used below instead of the whole table.
    df_fi = pd.read_parquet(
        bucket_path / 'fi.parquet',
        columns=['ri_id', 'size', 'lang_ex', 'is_deleted'],
    )
    # `== False` is kept (rather than `~`) so the filter also works if the
    # column is not a strict bool dtype.
    df_fi = df_fi[(df_fi['lang_ex'] == 'Java') & (df_fi['is_deleted'] == False)]
    return df_fi['ri_id'].nunique(), df_fi['size'].sum()
    
    

In [None]:
# Submit one remote task per bucket; `res` holds the task references until
# they are resolved in the next cell.
res = [get_java_repos_stats.remote(bucket) for bucket in paths]


In [None]:
# Wait for every task and replace the references with their results; each
# result is a (repo count, size sum) tuple.
# NOTE: `res` is rebound here, so re-run the submit cell before re-running
# this one.
res = ray.get(res)

In [None]:
# Transpose the per-bucket tuples into columns and total each column:
# res[0] becomes the summed repo count, res[1] the summed size.
res = [sum(column) for column in zip(*res)]

In [None]:
# Totals over all buckets: res[0] is the summed repo count, res[1] the summed size.
print('count: ', res[0])
# Dividing by 10**12 converts to decimal terabytes, so the unit label is "TB"
# ("Tb" would denote terabits). Assumes `size` is in bytes -- TODO confirm.
print('size: ', res[1] / 10**12, 'TB')

## Deduplicated repo count and names (by max-stars repo name)

In [None]:
# Parquet shards of the Java subset (near-deduplicated dataset, per the path
# name). Rebinds `paths`, which previously held the metadata buckets.
paths = list(Path('/data/hf_repos/the_stack_v1_1_near_dedup_parquet/data/java/').glob('*.parquet'))

In [None]:
# Load the first shard only, to inspect it before processing every shard.
df = pd.read_parquet(paths[0])

In [None]:
# Display the shard using the notebook's rich DataFrame repr.
df

In [None]:
# Number of distinct `max_stars_repo_name` values in this single shard.
len(set(df['max_stars_repo_name']))

In [None]:
@ray.remote(scheduling_strategy="SPREAD")
def get_java_repos_stats(bucket_filename):
    """Collect the distinct ``max_stars_repo_name`` values of one parquet shard.

    NOTE: this replaces the earlier definition of the same name in this
    notebook and is itself replaced by a later one; run the cells in order.

    Args:
        bucket_filename: Path of a parquet file that has a
            ``max_stars_repo_name`` column.

    Returns:
        ``set`` of the values found in ``max_stars_repo_name``.
    """
    # Only the repo-name column is used, so skip loading every other column.
    df = pd.read_parquet(bucket_filename, columns=['max_stars_repo_name'])
    return set(df['max_stars_repo_name'])
    

In [None]:
# Submit one remote task per shard; `res` holds the task references until
# they are resolved in the next cell.
res = [get_java_repos_stats.remote(shard) for shard in paths]

In [None]:
# Wait for every task; each result is the set of repo names of one shard.
# NOTE: `res` is rebound here, so re-run the submit cell before re-running
# this one.
res = ray.get(res)

In [None]:
# Union of the per-shard sets: every distinct repo name across all shards.
repos = set().union(*res)

In [None]:
# Number of distinct repo names across all shards.
len(repos)

## Size of those repos

In [None]:
# Back to the metadata buckets (same glob as in the first section); `paths`
# is rebound from the dedup shards.
paths = list(Path('/data/hf_repos/the-stack-metadata').glob('data/*'))

In [None]:
# Dry run of the per-bucket logic on the first bucket only, so the filter can
# be inspected before it is run on every bucket.
df_ri = pd.read_parquet(paths[0] / 'ri.parquet')
# ids of the repos in this bucket whose name is in `repos`.
ri_ids = set(df_ri.loc[df_ri['name'].isin(repos), 'id'])
df_fi = pd.read_parquet(paths[0] / 'fi.parquet')
# Keep non-deleted Java files that belong to the selected repos.
df_fi = df_fi[
    (df_fi['ri_id'].isin(ri_ids))
    & (df_fi['lang_ex'] == 'Java')
    & (df_fi['is_deleted'] == False)
]

In [None]:
# Display the filtered file-info rows of the first bucket.
df_fi

In [None]:
@ray.remote(scheduling_strategy="SPREAD")
def get_java_repos_stats(bucket_path, repos):
    """Summarise non-deleted Java files of the selected repos in one bucket.

    NOTE: this replaces the earlier definitions of the same name in this
    notebook; run the cells in order so this version is the active one.

    Args:
        bucket_path: Path-like object supporting ``/`` (callers pass a
            ``pathlib.Path``) that points at a bucket directory containing
            ``ri.parquet`` and ``fi.parquet``.
        repos: Collection of repository names to keep; matched against the
            ``name`` column of ``ri.parquet`` via ``Series.isin``.

    Returns:
        ``(repo_count, total_size, file_count)``: distinct ``ri_id`` values,
        sum of ``size`` and number of rows, all over rows of ``fi.parquet``
        whose ``ri_id`` belongs to a selected repo, whose ``lang_ex`` is
        ``'Java'`` and whose ``is_deleted`` is False.
    """
    # Only `id` and `name` are needed to map repo names to ids.
    df_ri = pd.read_parquet(
        bucket_path / 'ri.parquet',
        columns=['id', 'name'],
    )
    ri_ids = set(df_ri[df_ri['name'].isin(repos)]['id'])
    # Load only the four columns used by the filter and the aggregates.
    df_fi = pd.read_parquet(
        bucket_path / 'fi.parquet',
        columns=['ri_id', 'size', 'lang_ex', 'is_deleted'],
    )
    # `== False` is kept (rather than `~`) so the filter also works if the
    # column is not a strict bool dtype.
    df_fi = df_fi[(df_fi['ri_id'].isin(ri_ids)) & (df_fi['lang_ex'] == 'Java') & (df_fi['is_deleted'] == False)]
    return df_fi['ri_id'].nunique(), df_fi['size'].sum(), len(df_fi)

In [None]:
# Put `repos` into the Ray object store once and pass the reference to every
# task below, instead of passing the set itself with each call.
repos_ref = ray.put(repos)

In [None]:
# Submit one remote task per bucket, all sharing the same `repos_ref`; `res`
# holds the task references until they are resolved in the next cell.
res = [get_java_repos_stats.remote(bucket, repos_ref) for bucket in paths]

In [None]:
# Wait for every task; each result is a (repo count, size sum, file count)
# tuple for one bucket.
# NOTE: `res` is rebound here, so re-run the submit cell before re-running
# this one.
res = ray.get(res)

In [None]:
# Transpose the per-bucket tuples into columns and total each column:
# res[0] = repo count, res[1] = size, res[2] = file count.
res = [sum(column) for column in zip(*res)]

In [None]:
# Totals over all buckets, in the order returned by the remote function.
print('repos count: ', res[0])
# Dividing by 10**12 converts to decimal terabytes, so the unit label is "TB"
# ("Tb" would denote terabits). Assumes `size` is in bytes -- TODO confirm.
print('size: ', res[1] / 10**12, 'TB')
print('files count: ', res[2])

In [None]:
# Scale the cluster back to 0; the cells below submit no further Ray tasks.
server.scale_cluster(0)

## Assign repositories to buckets and save the mapping

In [None]:
# Number of repos that will be assigned to buckets.
len(repos)

In [None]:
# Distinct owners: the part of each repo name before the first '/'.
usernames = {repo_name.split('/')[0] for repo_name in repos}

In [None]:
# Number of distinct owners.
len(usernames)

In [None]:
# A set has no order; convert to a list so it can be sorted in place below.
# NOTE: rebinds `repos` from a set to a list.
repos = list(repos)

In [None]:
# Sort in place so the position-based bucket assignment below is
# reproducible for a given set of repos.
repos.sort()

In [None]:
# Chunk the sorted repos into consecutive buckets of 1000:
# positions 0-999 -> bucket 0, 1000-1999 -> bucket 1, and so on.
repo_2_bucket = {
    repo_name: position // 1000
    for position, repo_name in enumerate(repos)
}

In [None]:
# Bucket index of the last repo, i.e. the highest bucket index in use.
repo_2_bucket[repos[-1]]

In [None]:
# Persist the repo -> bucket mapping as JSON.
# NOTE(review): the directory is spelled 'the-stak-repo-level' -- confirm that
# this matches the directory name on disk.
with open('/data/hf_repos/the-stak-repo-level/meta_data/java/repo_to_bucket.json', 'wt') as out_file:
    out_file.write(json.dumps(repo_2_bucket))