In [2]:
!ls /mnt/ligandpro/db/LPCE/

BindingMOAD  bioml  final  move.py  PDBbind  processed	raw  separated


In [10]:
from pathlib import Path
import json
from collections import Counter
import pandas as pd
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor

# --- Locations ---------------------------------------------------------------
# Root of the LPCE database; the folders named in `priority` are looked up under it.
base_lpce = Path("/mnt/ligandpro/db/LPCE")
# Directory holding the dataset tables read below.
root_data = Path("/mnt/ligandpro/data/marina/sergei")
# JSON file that is loaded later in this cell to build the reason lookup.
json_path = Path("/home/nikolenko/work/Projects/LPCE/data/removed_files.json")

# Folder names in the order in which a hit is credited (earlier entries win).
priority = ["final", "separated", "bioml", "processed", "raw"]

# --- Dataset tables (tab-separated), read in the order tests, moad, pdbbind ---
tests_df, moad_df, pdbbind_df = (
    pd.read_table(root_data / f"{stem}.tsv") for stem in ("tests", "moad", "pdbbind")
)

def extract_ids(df: pd.DataFrame) -> set[str]:
    """Return the set of normalised 4-character IDs found in ``df``.

    The first column present among ``pdb``, ``pdbid``, ``pdb_id``, ``PDB`` and
    ``PDBID`` is used; if none of them exists, the frame's index is used instead.
    Each value is converted to ``str``, stripped, lower-cased and cut to its first
    four characters.

    Missing values (``None``/``NaN``/``pd.NA``) and blank cells are skipped, so they
    can no longer show up as the bogus IDs ``"nan"``, ``"none"`` or ``""``.

    Args:
        df: Table containing the IDs in one of the recognised columns or in its index.

    Returns:
        Set of lower-cased IDs, each at most four characters long.
    """
    for col in ("pdb", "pdbid", "pdb_id", "PDB", "PDBID"):
        if col in df.columns:
            values = df[col]
            break
    else:
        # No recognised column: fall back to the index labels.
        values = df.index

    # Drop nulls *before* str(): str(float("nan")) would otherwise become the ID "nan".
    ids = {str(x).strip().lower()[:4] for x in values.dropna()}
    # Cells that were empty or whitespace-only collapse to "", which is not an ID.
    ids.discard("")
    return ids

# Map each dataset label to the set of IDs extracted from its table
# (insertion order: tests, moad, pdbbind).
datasets = {
    label: extract_ids(frame)
    for label, frame in (
        ("tests", tests_df),
        ("moad", moad_df),
        ("pdbbind", pdbbind_df),
    )
}

# Filled below: folder name -> set of codes discovered in that folder.
found = dict()

def grab_codes(pattern: str) -> set[str]:
    """Collect lower-cased codes from files under ``base_lpce / "raw" / "pdb"``.

    The directory is searched recursively for names matching ``pattern``; the code
    of each match is characters 3..6 of its file name.

    Args:
        pattern: Glob pattern passed to ``Path.rglob``.

    Returns:
        Set of extracted codes, lower-cased.
    """
    codes = set()
    for entry in (base_lpce / "raw" / "pdb").rglob(pattern):
        # Skips the first three characters of the name (a "pdb" prefix for the
        # patterns this notebook passes in) and keeps the next four.
        codes.add(entry.name[3:7].lower())
    return codes

# Scan the raw tree for both file-name patterns concurrently (one worker per pattern)
# and merge the two result sets.
codes_raw = set()
with ThreadPoolExecutor(max_workers=2) as pool:
    codes_raw.update(*pool.map(grab_codes, ("pdb????.ent.gz", "pdb????.cif.gz")))
found["raw"] = codes_raw

for d in tqdm(priority[:-1], desc="Scanning top-level"):  # every folder except raw
    p = base_lpce / d
    if not p.is_dir():
        # Folder absent: record an empty set so later lookups still work.
        found[d] = set()
        continue
    # Code = entry name up to the first "_", then up to the first ".",
    # lower-cased and cut to four characters.
    found[d] = {
        entry.name.partition("_")[0].partition(".")[0].lower()[:4]
        for entry in p.iterdir()
        if entry.is_file() or entry.is_dir()
    }

# Load the JSON file and invert it into a per-ID lookup.
# JSON text is UTF-8 by specification, so decode it explicitly rather than relying on
# the locale's default encoding (relevant if any category name is non-ASCII).
with json_path.open(encoding="utf-8") as fh:
    data = json.load(fh)
# The comprehension treats `data` as {category: iterable of entry names}; each entry is
# reduced to its first four lower-cased characters. When the same ID occurs under
# several categories, the category iterated last wins.
reason_map = {p.lower()[:4]: cat for cat, lst in data.items() for p in lst}

# Accumulators: one summary row per dataset, one row per (dataset, reason) pair, and
# for each dataset the mapping of unplaced IDs to their reason.
summary_rows, reason_rows, missing_all = [], [], {}

for name, ids in datasets.items():
    assigned, counts = set(), {}
    # Walk the folders in priority order; an ID is credited only to the first
    # folder that contains it.
    for d in priority:
        unique = ids.intersection(found[d]).difference(assigned)
        counts[d] = len(unique)
        assigned.update(unique)
    not_found = ids.difference(assigned)

    summary_rows.append(
        {"dataset": name, "total": len(ids), **counts, "not_found": len(not_found)}
    )

    # Tally the reasons for the unplaced IDs; IDs absent from reason_map are
    # reported as "not_in_json".
    rc = Counter(reason_map.get(pid, "not_in_json") for pid in not_found)
    reason_rows.extend({"dataset": name, "reason": r, "count": c} for r, c in rc.items())

    missing_all[name] = {pid: reason_map.get(pid, "not_in_json") for pid in sorted(not_found)}

# Print the two report tables as plain text.
pd.set_option("display.max_rows", None)
print("\n=== Summary (unique per priority) ===")
print("final → separated → bioml → processed → raw")
print("\nКаждая запись засчитывается ровно в первой папке по приоритету\n")
print(pd.DataFrame(summary_rows).to_string(index=False))

print("\n=== Breakdown of not_found reasons ===")
# Name the columns explicitly: when every ID was placed, reason_rows is empty and a
# column-less frame would make sort_values raise KeyError on "dataset".
print(
    pd.DataFrame(reason_rows, columns=["dataset", "reason", "count"])
    .sort_values(["dataset", "count"], ascending=[True, False])
    .to_string(index=False)
)


Scanning top-level:   0%|          | 0/4 [00:00<?, ?it/s]


=== Summary (unique per priority) ===
final → separated → bioml → processed → raw

Каждая запись засчитывается ровно в первой папке по приоритету

dataset  total  final  separated  bioml  processed  raw  not_found
  tests    834    785          0     16          0    0         33
   moad  17832  17391          0    420          0   19          2
pdbbind  19440  18060          0   1350          0   19         11

=== Breakdown of not_found reasons ===
dataset      reason  count
   moad not_in_json      2
pdbbind not_in_json     11
  tests not_in_json     33
