In [18]:
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np
import gzip

def open_maybe_gz(path: Path):
    return gzip.open(path, "rt", encoding="utf-8", errors="ignore") if str(path).endswith(".gz") else open(path, "rt", encoding="utf-8", errors="ignore")

def detect_delimiter(path: Path):
    with open_maybe_gz(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            return "\t" if "\t" in line else ","
    return "\t"

def iter_edges(path: Path, delim: str):
    with open_maybe_gz(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split(delim)
            if len(parts) < 3:
                continue
            yield parts[0].strip(), parts[1].strip(), parts[2].strip()

def allocate_quotas(rel_counts: dict, total_edges: int, min_per_relation: int):
    rels = list(rel_counts.keys())
    R = len(rels)
    if R == 0:
        raise ValueError("هیچ relationای در فایل پیدا نشد!")

    min_per_relation = min(min_per_relation, max(1, total_edges // R))
    base = min_per_relation * R
    remaining = max(0, total_edges - base)

    total = sum(rel_counts.values())
    quotas = {r: min_per_relation for r in rels}

    if remaining > 0 and total > 0:
        frac = []
        for r in rels:
            share = remaining * (rel_counts[r] / total)
            add = int(np.floor(share))
            quotas[r] += add
            frac.append((share - add, r))

        diff = total_edges - sum(quotas.values())
        frac.sort(reverse=True)
        i = 0
        while diff > 0:
            quotas[frac[i % len(frac)][1]] += 1
            diff -= 1
            i += 1

    return quotas

def pick_input_file():
    raw_dir = Path("data/raw")
    if not raw_dir.exists():
        raise FileNotFoundError("پوشه data/raw پیدا نشد. اول به ریشه پروژه cd کن.")
    files = sorted([p for p in raw_dir.glob("*") if p.is_file()], key=lambda p: p.stat().st_size, reverse=True)
    if not files:
        raise FileNotFoundError("هیچ فایلی داخل data/raw نیست. دیتاست را آنجا کپی کن.")
    return files[0]

def make_stratified_subgraph(total_edges=120_000, min_per_relation=300, seed=42):
    input_path = pick_input_file()
    out_path = Path("data/raw") / f"drkg_subgraph_{total_edges//1000}k.tsv"

    delim = detect_delimiter(input_path)
    print("Input file:", input_path)
    print("Detected delimiter:", "TAB" if delim == "\t" else "COMMA")

    # pass1: relation counts
    rel_counts = Counter()
    for _, r, _ in iter_edges(input_path, delim):
        rel_counts[r] += 1

    print("Num relations:", len(rel_counts))
    print("Top5:", rel_counts.most_common(5))

    quotas = allocate_quotas(rel_counts, total_edges, min_per_relation)
    print("Total quota:", sum(quotas.values()))

    rng = np.random.default_rng(seed)
    seen = defaultdict(int)
    reservoirs = {r: [] for r in quotas.keys()}

    # pass2: reservoir per relation
    for h, r, t in iter_edges(input_path, delim):
        q = quotas.get(r, 0)
        if q <= 0:
            continue
        seen[r] += 1
        k = len(reservoirs[r])
        if k < q:
            reservoirs[r].append((h, r, t))
        else:
            j = rng.integers(0, seen[r])
            if j < q:
                reservoirs[r][j] = (h, r, t)

    all_edges = []
    for r in reservoirs:
        all_edges.extend(reservoirs[r])

    rng.shuffle(all_edges)

    with open(out_path, "w", encoding="utf-8") as f:
        for h, r, t in all_edges:
            f.write(f"{h}{delim}{r}{delim}{t}\n")

    print("Saved subgraph:", out_path)
    print("Edges:", len(all_edges))
    print("Relations covered:", sum(1 for r in reservoirs if len(reservoirs[r]) > 0), "/", len(reservoirs))
    return out_path

out_file = make_stratified_subgraph(total_edges=120_000, min_per_relation=300, seed=42)
out_file

Input file: data\raw\drkg.tsv
Detected delimiter: TAB
Num relations: 107
Top5: [('DRUGBANK::ddi-interactor-in::Compound:Compound', 1379271), ('Hetionet::GpBP::Gene:Biological Process', 559504), ('Hetionet::AeG::Anatomy:Gene', 526407), ('STRING::REACTION::Gene:Gene', 400426), ('STRING::CATALYSIS::Gene:Gene', 343533)]
Total quota: 120000
Saved subgraph: data\raw\drkg_subgraph_120k.tsv
Edges: 118308
Relations covered: 107 / 107


WindowsPath('data/raw/drkg_subgraph_120k.tsv')