In [5]:
from huggingface_hub import HfApi
import polars as pl

# List the entries of the YouTube-Commons dataset repo and keep the Parquet
# shards. The sort is a plain string sort on the path, so "cctube_10" is
# ordered before "cctube_2".
api = HfApi()
files = api.list_repo_tree("PleIAs/YouTube-Commons", repo_type="dataset")
parquet_files = [entry.path for entry in files if entry.path.endswith(".parquet")]
parquet_files.sort()
print(f"Found {len(parquet_files)} parquet files")

Found 439 parquet files


In [6]:
# Set TESTING to True to process only the first two files as a quick smoke
# test; leave it False to process every Parquet file found above.
TESTING = False

BASE_URL = "https://huggingface.co/datasets/PleIAs/YouTube-Commons/resolve/main/"
# One download URL per Parquet path, in the same order as `parquet_files`.
urls = [BASE_URL + path for path in parquet_files]

if TESTING:
    urls = urls[:2]
    print(f"TESTING MODE: processing {len(urls)} files")
else:
    print(f"Processing all {len(urls)} files")

# Gather the `video_id` column of every file into one flat list. Duplicates
# are kept here on purpose so the next step can report how many were removed.
all_ids = []
for i, url in enumerate(urls):
    # Read only the column we need rather than the whole file.
    df = pl.read_parquet(url, columns=["video_id"])
    all_ids.extend(df["video_id"].to_list())
    # `urls` is a prefix of the list built from `parquet_files`, so index `i`
    # names the same file in both lists whether or not TESTING is set.
    print(f"[{i+1}/{len(urls)}] {parquet_files[i]}: {df.height} IDs (running total: {len(all_ids)})")

print(f"\nTotal video IDs collected: {len(all_ids)}")

Processing all 439 files
[1/439] cctube_0.parquet: 49967 IDs (running total: 49967)
[2/439] cctube_1.parquet: 49958 IDs (running total: 99925)
[3/439] cctube_10.parquet: 49949 IDs (running total: 149874)
[4/439] cctube_100.parquet: 49960 IDs (running total: 199834)
[5/439] cctube_101.parquet: 49958 IDs (running total: 249792)
[6/439] cctube_102.parquet: 49975 IDs (running total: 299767)
[7/439] cctube_103.parquet: 49961 IDs (running total: 349728)
[8/439] cctube_104.parquet: 49953 IDs (running total: 399681)
[9/439] cctube_105.parquet: 49966 IDs (running total: 449647)
[10/439] cctube_106.parquet: 110574 IDs (running total: 560221)
[11/439] cctube_107.parquet: 49961 IDs (running total: 610182)
[12/439] cctube_108.parquet: 112473 IDs (running total: 722655)
[13/439] cctube_109.parquet: 49956 IDs (running total: 772611)
[14/439] cctube_11.parquet: 49964 IDs (running total: 822575)
[15/439] cctube_110.parquet: 49967 IDs (running total: 872542)
[16/439] cctube_111.parquet: 49958 IDs (runni

In [7]:
from pathlib import Path

# Collapse the collected IDs to distinct values and order them so the output
# file is stable between runs.
unique_ids = list(set(all_ids))
unique_ids.sort()
duplicates_removed = len(all_ids) - len(unique_ids)
print(f"Total: {len(all_ids)} → Unique: {len(unique_ids)} ({duplicates_removed} duplicates removed)")

# Write one ID per line, ending the file with a newline, then report the
# resolved location and the on-disk size.
output_path = Path("cc_video_ids.txt")
file_body = "\n".join(unique_ids) + "\n"
output_path.write_text(file_body)
size_kb = output_path.stat().st_size / 1024
print(f"Saved to {output_path.resolve()} ({size_kb:.1f} KB)")


Total: 22684737 → Unique: 3156666 (19528071 duplicates removed)
Saved to /home/doga/Desktop/thesis/superorganism-experiment/self_replication_service__mycelium/mycelium-bootstrap/yt-api-cc-scripts/cc_video_ids.txt (36992.2 KB)

Sample IDs:
  ---9UsTzXKU  https://youtube.com/watch?v=---9UsTzXKU
  ---9pbAy5X8  https://youtube.com/watch?v=---9pbAy5X8
  ---A_uo_iD0  https://youtube.com/watch?v=---A_uo_iD0
  ---BWbMr5B0  https://youtube.com/watch?v=---BWbMr5B0
  ---Dbi2PoR8  https://youtube.com/watch?v=---Dbi2PoR8
