In [None]:
import pandas as pd
from cyvcf2 import VCF

In [None]:
# Directory holding the downloaded input files; the ClinVar VCF paths used below are built from it.
DOWNLOAD_PATH = "./downloads"

In [None]:
# Path of the gzip-compressed ClinVar VCF (.vcf.gz) inside DOWNLOAD_PATH.
vcf_path = f"{DOWNLOAD_PATH}/clinvar.vcf.gz"

In [None]:
from cyvcf2 import VCF

def collect_unique_col_values(vcf_path: str, max_rows: int | None = None):
    """
    Iterate through a ClinVar VCF and collect all unique CLNSIG, CLNREVSTAT
    and CLNVC values.

    Each INFO value is converted to ``str`` and split on ``"|"``; every
    non-empty, whitespace-stripped token is recorded.

    Parameters
    ----------
    vcf_path : str
        Path to the ClinVar VCF file (.vcf or .vcf.gz).
    max_rows : int | None
        Optional limit for testing: stop after exactly this many records.
        ``None`` (or ``0``) means no limit.

    Returns
    -------
    tuple[set[str], set[str], set[str]]
        (unique_clnsig_values, unique_clnrevstat_values, unique_clnvc_values)
    """
    vcf = VCF(vcf_path)
    clnsig_values: set[str] = set()
    clnrevstat_values: set[str] = set()
    clnvc_values: set[str] = set()

    def _add_tokens(raw, bucket: set[str]) -> None:
        # Missing/empty INFO values contribute nothing.
        if not raw:
            return
        for token in str(raw).split("|"):
            token = token.strip()
            if token:
                bucket.add(token)

    for i, v in enumerate(vcf):
        info = v.INFO

        _add_tokens(info.get("CLNSIG"), clnsig_values)
        _add_tokens(info.get("CLNREVSTAT"), clnrevstat_values)
        _add_tokens(info.get("CLNVC"), clnvc_values)

        # `i` is 0-based, so (i + 1) is the number of records consumed so far;
        # comparing `i` itself would read one record more than requested.
        if max_rows and (i + 1) >= max_rows:
            break

    return clnsig_values, clnrevstat_values, clnvc_values


In [None]:
from cyvcf2 import VCF

def collect_unique_clnsig_revstat_clnvc(vcf_path: str, max_rows: int | None = None):
    """
    Scan a ClinVar VCF and gather the distinct CLNSIG, CLNREVSTAT and CLNVC
    tokens found in the INFO field of its records.

    Every INFO value is stringified and split on "|" (CLNVC is usually
    single-valued, but it is split the same way defensively); tokens are
    stripped and empty ones are ignored. When `max_rows` is truthy, reading
    stops once that many records have been processed.

    Returns
    -------
    tuple[set[str], set[str], set[str]]
        (unique_clnsig_values, unique_clnrevstat_values, unique_clnvc_values)
    """
    # One accumulator per INFO key, visited in the order they are returned.
    seen_by_field: dict[str, set[str]] = {
        "CLNSIG": set(),
        "CLNREVSTAT": set(),
        "CLNVC": set(),
    }
    reader = VCF(vcf_path)

    for row_number, record in enumerate(reader, start=1):
        record_info = record.INFO

        for field, seen in seen_by_field.items():
            raw_value = record_info.get(field)
            if not raw_value:
                continue
            pieces = (piece.strip() for piece in str(raw_value).split("|"))
            seen.update(piece for piece in pieces if piece)

        # row_number is 1-based, i.e. the count of records handled so far.
        if max_rows and row_number >= max_rows:
            break

    return (
        seen_by_field["CLNSIG"],
        seen_by_field["CLNREVSTAT"],
        seen_by_field["CLNVC"],
    )


In [None]:
# Scan the ClinVar VCF with no row limit and collect the distinct values of each field.
# NOTE(review): this reads "clinvar.vcf", whereas `vcf_path` defined earlier points at
# "clinvar.vcf.gz" — confirm which file is intended.
clnsigs, revstats, clnvc_values = collect_unique_clnsig_revstat_clnvc(f"{DOWNLOAD_PATH}/clinvar.vcf")

# Print each collected set in sorted order.
print("CLNSIG unique values:", sorted(clnsigs))
print("CLNREVSTAT unique values:", sorted(revstats))
print("CLNVC unique values:", sorted(clnvc_values))

In [None]:
# Reference list of CLNSIG (clinical significance) values.
# Presumably copied from the "CLNSIG unique values" printout above — verify against the current file.
# NOTE(review): ALL_CLINSIG is assigned again later in this file with a shorter list,
# so this value is overwritten once that cell runs.
ALL_CLINSIG = [
  "Affects",
  "Benign",
  "Benign/Likely_benign",
  "Conflicting_classifications_of_pathogenicity",
  "Established_risk_allele",
  "Likely_benign",
  "Likely_pathogenic",
  "Likely_pathogenic,_low_penetrance",
  "Likely_pathogenic/Likely_risk_allele",
  "Likely_risk_allele",
  "Pathogenic",
  "Pathogenic/Likely_pathogenic",
  "Pathogenic/Likely_pathogenic/Likely_risk_allele",
  "Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance",
  "Pathogenic/Likely_risk_allele",
  "Pathogenic/Pathogenic,_low_penetrance",
  "Uncertain_risk_allele",
  "Uncertain_significance",
  "Uncertain_significance/Uncertain_risk_allele",
  "association",
  "association_not_found",
  "confers_sensitivity",
  "drug_response",
  "no_classification_for_the_single_variant",
  "no_classifications_from_unflagged_records",
  "not_provided",
  "other",
  "protective",
  "risk_factor"
]

In [None]:
# Reference list of CLNREVSTAT (review status) values.
# Presumably copied from the "CLNREVSTAT unique values" printout above — verify against the current file.
ALL_CLNREVSTAT = [
  "criteria_provided,_conflicting_classifications",
  "criteria_provided,_multiple_submitters,_no_conflicts",
  "criteria_provided,_single_submitter",
  "no_assertion_criteria_provided",
  "no_classification_for_the_single_variant",
  "no_classification_provided",
  "no_classifications_from_unflagged_records",
  "practice_guideline",
  "reviewed_by_expert_panel"
]

In [None]:
from cyvcf2 import VCF
import pandas as pd

def _rs_ids_with_prefix(rs_raw):
    """Convert a raw RS INFO value into "rs"-prefixed ID(s).

    Returns None when there is no value or no purely numeric token, a single
    string for exactly one ID, and a list of strings for several IDs.
    """
    if not rs_raw:
        return None
    # RS may hold several IDs, e.g. "80357065,12345" (or "|"-separated).
    tokens = (t.strip() for t in str(rs_raw).replace("|", ",").split(","))
    prefixed = [f"rs{t}" for t in tokens if t.isdigit()]
    if not prefixed:
        return None
    return prefixed[0] if len(prefixed) == 1 else prefixed


def clinvar_df_for_gene(vcf_path: str, gene_name: str, max_rows: int | None = None) -> pd.DataFrame:
    """
    Filter a ClinVar VCF for a given gene symbol and return a pandas DataFrame.

    A record is kept when `gene_name` equals (case-insensitively) one of the
    symbols listed in its GENEINFO value, e.g. "BRCA1:672|NBR2:10230".
    Records without GENEINFO are skipped.

    Parameters
    ----------
    vcf_path : str
        Path to ClinVar VCF or .vcf.gz file.
    gene_name : str
        Gene symbol to match (case-insensitive), e.g. "BRCA1".
    max_rows : int | None
        Optional limit for preview/testing: stop after this many matching
        records. ``None`` (or ``0``) means no limit.

    Returns
    -------
    pd.DataFrame
        Columns: CHROM, POS, ID, REF, ALT, RS, RS_prefixed,
        CLNSIG, GENEINFO, CLNVC, CLNREVSTAT, ORIGIN, ALLELEID.
        ALT holds only the first alternate allele. RS_prefixed is None, a
        single "rs…" string, or a list of them. The columns are present even
        when no record matches, so downstream code can rely on them.
    """
    columns = [
        "CHROM", "POS", "ID", "REF", "ALT", "RS", "RS_prefixed",
        "CLNSIG", "GENEINFO", "CLNVC", "CLNREVSTAT", "ORIGIN", "ALLELEID",
    ]

    vcf = VCF(vcf_path)
    gene_name = gene_name.upper()
    records = []

    for v in vcf:
        info = v.INFO
        geneinfo = info.get("GENEINFO")
        if not geneinfo:
            continue

        # Parse e.g. "BRCA1:672|BRCA2:675" -> ["BRCA1", "BRCA2"]
        genes = [g.split(":")[0].upper() for g in geneinfo.split("|")]
        if gene_name not in genes:
            continue

        rs_raw = info.get("RS")

        records.append(
            {
                "CHROM": v.CHROM,
                "POS": v.POS,
                "ID": v.ID,                 # ClinVar internal variant ID
                "REF": v.REF,
                "ALT": v.ALT[0] if v.ALT else None,
                "RS": rs_raw,
                "RS_prefixed": _rs_ids_with_prefix(rs_raw),
                "CLNSIG": info.get("CLNSIG"),
                "GENEINFO": geneinfo,
                "CLNVC": info.get("CLNVC"),
                "CLNREVSTAT": info.get("CLNREVSTAT"),
                "ORIGIN": info.get("ORIGIN"),
                "ALLELEID": info.get("ALLELEID"),
            }
        )

        if max_rows and len(records) >= max_rows:
            break

    # Passing `columns` keeps the schema stable for an empty result
    # (pd.DataFrame([]) alone would have no columns at all).
    return pd.DataFrame(records, columns=columns)


In [None]:
# ClinVar classifications (CLNSIG) listed for this dataset.
# NOTE(review): this rebinds ALL_CLINSIG, replacing the longer list assigned earlier
# in this file — confirm which of the two lists is meant to be kept.
ALL_CLINSIG = [
    'Benign',
    'Pathogenic',
    'Uncertain_significance',
    'Likely_benign',
    'Conflicting_classifications_of_pathogenicity',
    'Benign/Likely_benign',
    'Likely_pathogenic',
    'Pathogenic/Likely_pathogenic',
    'not_provided',
    'no_classification_for_the_single_variant',
    'no_classifications_from_unflagged_records',
]

In [None]:
import pandas as pd

# Default variant classes (SNP-array–friendly).
# CLNVC values used by clinvar_df_for_gene_filtered when `allowed_types` is not given.
DEFAULT_ALLOWED_TYPES = [
    "single_nucleotide_variant",
    "Indel",
    "Deletion",
    "Insertion",
]

# Default clinical significance classes (disease-causing).
# CLNSIG values used by clinvar_df_for_gene_filtered when `allowed_clnsig` is not given.
DEFAULT_CLINSIG = [
    "Pathogenic",
    "Likely_pathogenic",
    "Pathogenic/Likely_pathogenic",
]

# Default high-confidence review statuses.
# CLNREVSTAT values used by clinvar_df_for_gene_filtered when `allowed_revstat` is not given.
DEFAULT_REVSTAT = [
    "practice_guideline",
    "reviewed_by_expert_panel",
]

def clinvar_df_for_gene_filtered(
    vcf_path: str,
    gene_name: str,
    allowed_types: list[str] | None = None,
    allowed_clnsig: list[str] | None = None,
    allowed_revstat: list[str] | None = None,
    max_rows: int | None = None,
    case_insensitive: bool = True,
    dropna_clnvc: bool = True,
    dropna_clnsig: bool = True,
    dropna_revstat: bool = True,
) -> pd.DataFrame:
    """
    Call `clinvar_df_for_gene` and keep only rows whose CLNVC, CLNSIG, and CLNREVSTAT
    match the allowed lists.

    A column that is absent from the DataFrame imposes no restriction. If the
    gene lookup yields a frame without any of the three columns (for example
    because nothing matched), the frame is returned with a reset index
    instead of raising.

    Parameters
    ----------
    vcf_path : str
        Path to ClinVar VCF/.vcf.gz.
    gene_name : str
        Gene symbol to match (e.g., "TP53").
    allowed_types : list[str] | None
        Variant classes to keep (CLNVC values). If None or empty, uses
        DEFAULT_ALLOWED_TYPES.
    allowed_clnsig : list[str] | None
        Clinical significance categories to keep (CLNSIG values).
        If None or empty, uses DEFAULT_CLINSIG.
    allowed_revstat : list[str] | None
        Review statuses to keep (CLNREVSTAT values).
        If None or empty, uses DEFAULT_REVSTAT (practice_guideline,
        reviewed_by_expert_panel).
    max_rows : int | None
        Passed through to clinvar_df_for_gene for preview/testing; it limits
        the rows read before filtering, not the rows returned.
    case_insensitive : bool
        If True, compare values case-insensitively.
    dropna_clnvc : bool
        If True, drop rows where CLNVC is NA before filtering.
    dropna_clnsig : bool
        If True, drop rows where CLNSIG is NA before filtering.
    dropna_revstat : bool
        If True, drop rows where CLNREVSTAT is NA before filtering.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame with a fresh 0..n-1 index.
    """
    allowed_types = allowed_types or DEFAULT_ALLOWED_TYPES
    allowed_clnsig = allowed_clnsig or DEFAULT_CLINSIG
    allowed_revstat = allowed_revstat or DEFAULT_REVSTAT

    # `clinvar_df_for_gene(vcf_path, gene_name, max_rows)` must be defined elsewhere.
    df = clinvar_df_for_gene(vcf_path, gene_name, max_rows=max_rows)

    # (column, allowed values, drop-NA flag) for each filtered field.
    filters = (
        ("CLNVC", allowed_types, dropna_clnvc),
        ("CLNSIG", allowed_clnsig, dropna_clnsig),
        ("CLNREVSTAT", allowed_revstat, dropna_revstat),
    )

    # Drop NAs first (only if those columns exist).
    for column, _, drop_missing in filters:
        if drop_missing and column in df.columns:
            df = df.dropna(subset=[column])

    # Start from an all-True boolean Series aligned to the frame. Combining
    # plain Python `True` placeholders would leave a scalar mask when every
    # column is missing, and `df[True]` raises KeyError.
    keep = pd.Series(True, index=df.index, dtype=bool)
    for column, allowed, _ in filters:
        if column not in df.columns:
            continue
        if case_insensitive:
            wanted = {value.lower() for value in allowed}
            matches = df[column].astype(str).str.lower().isin(wanted)
        else:
            matches = df[column].isin(allowed)
        keep = keep & matches

    return df.loc[keep].reset_index(drop=True)


In [None]:
# BRCA1 variants from the ClinVar VCF at `vcf_path`, restricted by variant class,
# clinical significance and review status.
# The three lists spelled out here hold the same values as DEFAULT_ALLOWED_TYPES,
# DEFAULT_CLINSIG and DEFAULT_REVSTAT.
df_brca1 = clinvar_df_for_gene_filtered(
    vcf_path=vcf_path,
    gene_name="BRCA1",
    allowed_types=["single_nucleotide_variant", "Indel", "Deletion", "Insertion"],
    allowed_clnsig=["Pathogenic", "Likely_pathogenic", "Pathogenic/Likely_pathogenic"],
    allowed_revstat=["practice_guideline", "reviewed_by_expert_panel"]
)

In [None]:
# Bare expression: shows the filtered BRCA1 DataFrame as the cell's output.
df_brca1

In [None]:
import pandas as pd

def count_clnrevstat(df: pd.DataFrame) -> pd.DataFrame:
    """
    Tally how often each CLNREVSTAT value occurs in a ClinVar DataFrame.

    Missing values are counted as well. The result has the two columns
    "CLNREVSTAT" and "count".

    Raises
    ------
    ValueError
        If the DataFrame has no "CLNREVSTAT" column.
    """
    if "CLNREVSTAT" in df.columns:
        tally = df["CLNREVSTAT"].value_counts(dropna=False)
        # reset_index turns the tallied values into an ordinary column;
        # set_axis then pins the two column labels.
        return tally.reset_index().set_axis(["CLNREVSTAT", "count"], axis=1)

    raise ValueError("Column 'CLNREVSTAT' not found in DataFrame")


def count_clnsig(df: pd.DataFrame) -> pd.DataFrame:
    """
    Tally how often each CLNSIG value occurs in a ClinVar DataFrame.

    Missing values are counted as well. The result has the two columns
    "CLNSIG" and "count".

    Raises
    ------
    ValueError
        If the DataFrame has no "CLNSIG" column.
    """
    if "CLNSIG" in df.columns:
        tally = df["CLNSIG"].value_counts(dropna=False)
        # reset_index turns the tallied values into an ordinary column;
        # set_axis then pins the two column labels.
        return tally.reset_index().set_axis(["CLNSIG", "count"], axis=1)

    raise ValueError("Column 'CLNSIG' not found in DataFrame")


def count_clnvc(df: pd.DataFrame) -> pd.DataFrame:
    """
    Tally how often each CLNVC (variant type) value occurs in a ClinVar DataFrame.

    Missing values are counted as well. The result has the two columns
    "CLNVC" and "count".

    Raises
    ------
    ValueError
        If the DataFrame has no "CLNVC" column.
    """
    if "CLNVC" in df.columns:
        tally = df["CLNVC"].value_counts(dropna=False)
        # reset_index turns the tallied values into an ordinary column;
        # set_axis then pins the two column labels.
        return tally.reset_index().set_axis(["CLNVC", "count"], axis=1)

    raise ValueError("Column 'CLNVC' not found in DataFrame")


In [None]:
# Review-status counts for the filtered BRCA1 rows.
count_clnrevstat(df_brca1)

In [None]:
# Clinical-significance counts for the filtered BRCA1 rows.
count_clnsig(df_brca1)

In [None]:
# Variant-class counts for the filtered BRCA1 rows.
count_clnvc(df_brca1)

In [None]:
def summarize_clinvar_counts(df: pd.DataFrame) -> None:
    """
    Print a value-count table for each of the CLNREVSTAT, CLNSIG and CLNVC
    columns of a ClinVar DataFrame.

    Each section is a heading, the table (missing values included), and a
    blank spacer. A column that is absent produces a warning line followed by
    the text form of an empty DataFrame. Nothing is returned.
    """
    def _tally(frame, column):
        # Guard-first variant: count when the column exists, otherwise warn.
        if column in frame.columns:
            tallied = frame[column].value_counts(dropna=False).reset_index()
            tallied.columns = [column, "count"]
            return tallied
        print(f"⚠️ Column '{column}' not found in DataFrame.\n")
        return pd.DataFrame()

    sections = (
        ("=== CLNREVSTAT (Review Status) ===", "CLNREVSTAT"),
        ("=== CLNSIG (Clinical Significance) ===", "CLNSIG"),
        ("=== CLNVC (Variant Class) ===", "CLNVC"),
    )

    for heading, column in sections:
        # The heading goes out before the tally so a missing-column warning
        # lands underneath its own heading.
        print(heading)
        print(_tally(df, column).to_string(index=False))
        print("\n")


In [None]:
# Print the three count tables for the filtered BRCA1 rows.
summarize_clinvar_counts(df_brca1)

In [None]:
import pandas as pd

def export_clinvar_tsv(df: pd.DataFrame, out_path: str = "./clinvar_export.tsv") -> pd.DataFrame:
    """
    Export a ClinVar gene DataFrame to a simplified TSV format similar to genotype tables.

    Output columns:
        rsid, gene, chromosome, position, ref, alt, clnrevstat, clnsig, clnvc

    The gene column keeps only the text before the first ":" of GENEINFO
    (the first listed gene symbol). List-valued rsid cells are written as a
    comma-separated string, and missing values are written as empty strings.
    Missing parent directories of `out_path` are created.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame from `clinvar_df_for_gene` or similar.
    out_path : str
        Output path for the TSV file.

    Returns
    -------
    pd.DataFrame
        The subset DataFrame that was written to file.

    Raises
    ------
    ValueError
        If any of the required source columns is missing from `df`.
    """
    # Local import so the block does not rely on a file-level pathlib import.
    from pathlib import Path

    # Source column -> exported column name (also fixes the output order).
    colmap = {
        "RS_prefixed": "rsid",
        "GENEINFO": "gene",
        "CHROM": "chromosome",
        "POS": "position",
        "REF": "ref",
        "ALT": "alt",
        "CLNREVSTAT": "clnrevstat",
        "CLNSIG": "clnsig",
        "CLNVC": "clnvc",
    }

    # Ensure all columns exist before mapping
    missing = [c for c in colmap if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns for export: {missing}")

    export_df = df[list(colmap)].rename(columns=colmap)

    # A variant can carry several rs IDs as a list; written as-is it would
    # appear as a Python repr ("['rs1', 'rs2']"), so flatten it first.
    export_df["rsid"] = export_df["rsid"].map(
        lambda value: ",".join(str(item) for item in value)
        if isinstance(value, (list, tuple))
        else value
    )

    # Optional cleanup for readability
    export_df["gene"] = export_df["gene"].str.split(":").str[0]  # strip gene ID part
    export_df = export_df.fillna("")  # no NaNs in TSV

    # to_csv does not create directories, so make sure the target one exists.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    export_df.to_csv(out_path, sep="\t", index=False)
    print(f"[clinvar] exported {len(export_df):,} rows to {out_path}")

    return export_df


In [None]:
# Write the filtered BRCA1 rows to a TSV under ./work and keep the exported frame.
export_df = export_clinvar_tsv(df_brca1, "./work/brca1_clinvar.tsv")