In [1]:
import pandas as pd
from cyvcf2 import VCF

In [2]:
DOWNLOAD_PATH = "./downloads"

In [3]:
vcf_path = f"{DOWNLOAD_PATH}/clinvar.vcf.gz"

In [4]:
from cyvcf2 import VCF

def collect_unique_col_values(vcf_path: str, max_rows: int | None = None):
    """
    Iterate through a ClinVar VCF and collect all unique CLNSIG and CLNREVSTAT values.

    Parameters
    ----------
    vcf_path : str
        Path to the ClinVar VCF file (.vcf or .vcf.gz).
    max_rows : int | None
        Optional limit for testing (stop after N records).

    Returns
    -------
    tuple[set[str], set[str]]
        (unique_clnsig_values, unique_clnrevstat_values)
    """
    vcf = VCF(vcf_path)
    clnsig_values = set()
    clnrevstat_values = set()
    clnvc_values = set()

    for i, v in enumerate(vcf):
        info = v.INFO

        # Extract CLNSIG and CLNREVSTAT (may be pipe-delimited)
        clnsig_raw = info.get("CLNSIG")
        clnrevstat_raw = info.get("CLNREVSTAT")
        clnvc_raw = info.get("CLNVC")

        if clnsig_raw:
            for sig in str(clnsig_raw).split("|"):
                if sig.strip():
                    clnsig_values.add(sig.strip())

        if clnrevstat_raw:
            for rev in str(clnrevstat_raw).split("|"):
                if rev.strip():
                    clnrevstat_values.add(rev.strip())
        if clnvc_raw:
            for rev in str(clnvc_raw).split("|"):
                if rev.strip():
                    clnvc_values.add(rev.strip())

        if max_rows and i >= max_rows:
            break

    return clnsig_values, clnrevstat_values, clnvc_values


In [5]:
from cyvcf2 import VCF

def collect_unique_clnsig_revstat_clnvc(vcf_path: str, max_rows: int | None = None):
    """
    Iterate through a ClinVar VCF and collect unique values for
    CLNSIG, CLNREVSTAT, and CLNVC.

    Returns
    -------
    tuple[set[str], set[str], set[str]]
        (unique_clnsig_values, unique_clnrevstat_values, unique_clnvc_values)
    """
    vcf = VCF(vcf_path)
    clnsig_values: set[str] = set()
    clnrevstat_values: set[str] = set()
    clnvc_values: set[str] = set()

    for i, v in enumerate(vcf):
        info = v.INFO

        clnsig_raw = info.get("CLNSIG")
        if clnsig_raw:
            for sig in str(clnsig_raw).split("|"):
                sig = sig.strip()
                if sig:
                    clnsig_values.add(sig)

        clnrevstat_raw = info.get("CLNREVSTAT")
        if clnrevstat_raw:
            for rev in str(clnrevstat_raw).split("|"):
                rev = rev.strip()
                if rev:
                    clnrevstat_values.add(rev)

        clnvc_raw = info.get("CLNVC")
        if clnvc_raw:
            # usually single-valued, but split defensively
            for vt in str(clnvc_raw).split("|"):
                vt = vt.strip()
                if vt:
                    clnvc_values.add(vt)

        if max_rows and (i + 1) >= max_rows:
            break

    return clnsig_values, clnrevstat_values, clnvc_values


In [6]:
clnsigs, revstats, clnvc_values = collect_unique_clnsig_revstat_clnvc(f"{DOWNLOAD_PATH}/clinvar.vcf")

print("CLNSIG unique values:", sorted(clnsigs))
print("CLNREVSTAT unique values:", sorted(revstats))
print("CLNVC unique values:", sorted(clnvc_values))

[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '7' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaroun

CLNSIG unique values: ['Affects', 'Benign', 'Benign/Likely_benign', 'Conflicting_classifications_of_pathogenicity', 'Established_risk_allele', 'Likely_benign', 'Likely_pathogenic', 'Likely_pathogenic,_low_penetrance', 'Likely_pathogenic/Likely_pathogenic,_low_penetrance', 'Likely_pathogenic/Likely_risk_allele', 'Likely_pathogenic/Pathogenic,_low_penetrance', 'Likely_risk_allele', 'Pathogenic', 'Pathogenic,_low_penetrance', 'Pathogenic/Likely_pathogenic', 'Pathogenic/Likely_pathogenic,_low_penetrance', 'Pathogenic/Likely_pathogenic/Likely_risk_allele', 'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance', 'Pathogenic/Likely_risk_allele', 'Pathogenic/Pathogenic,_low_penetrance', 'Uncertain_risk_allele', 'Uncertain_significance', 'Uncertain_significance/Uncertain_risk_allele', 'association', 'association_not_found', 'confers_sensitivity', 'drug_response', 'no_classification_for_the_single_variant', 'no_classifications_from_unflagged_records', 'not_provided', 'other', 'protective', 'r

[W::vcf_parse] Contig 'Y' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'MT' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_113889.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187633.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187661.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187693.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NW_009646201.1' is not defined in the header. (Quick workaround: index the file with tabix.)


In [7]:
ALL_CLINSIG = [
  "Affects",
  "Benign",
  "Benign/Likely_benign",
  "Conflicting_classifications_of_pathogenicity",
  "Established_risk_allele",
  "Likely_benign",
  "Likely_pathogenic",
  "Likely_pathogenic,_low_penetrance",
  "Likely_pathogenic/Likely_risk_allele",
  "Likely_risk_allele",
  "Pathogenic",
  "Pathogenic/Likely_pathogenic",
  "Pathogenic/Likely_pathogenic/Likely_risk_allele",
  "Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance",
  "Pathogenic/Likely_risk_allele",
  "Pathogenic/Pathogenic,_low_penetrance",
  "Uncertain_risk_allele",
  "Uncertain_significance",
  "Uncertain_significance/Uncertain_risk_allele",
  "association",
  "association_not_found",
  "confers_sensitivity",
  "drug_response",
  "no_classification_for_the_single_variant",
  "no_classifications_from_unflagged_records",
  "not_provided",
  "other",
  "protective",
  "risk_factor"
]

In [8]:
ALL_CLNREVSTAT = [
  "criteria_provided,_conflicting_classifications",
  "criteria_provided,_multiple_submitters,_no_conflicts",
  "criteria_provided,_single_submitter",
  "no_assertion_criteria_provided",
  "no_classification_for_the_single_variant",
  "no_classification_provided",
  "no_classifications_from_unflagged_records",
  "practice_guideline",
  "reviewed_by_expert_panel"
]

In [9]:
from cyvcf2 import VCF
import pandas as pd

def clinvar_df_for_gene(vcf_path: str, gene_name: str, max_rows: int | None = None) -> pd.DataFrame:
    """
    Filter a ClinVar VCF for a given gene symbol and return a pandas DataFrame.

    Parameters
    ----------
    vcf_path : str
        Path to ClinVar VCF or .vcf.gz file.
    gene_name : str
        Gene symbol to match (case-insensitive), e.g. "BRCA1".
    max_rows : int | None
        Optional limit for preview/testing.

    Returns
    -------
    pd.DataFrame
        Columns include: CHROM, POS, ID, REF, ALT, RS, RS_prefixed,
        CLNSIG, GENEINFO, CLNVC, CLNREVSTAT, ORIGIN, ALLELEID
    """
    vcf = VCF(vcf_path)
    gene_name = gene_name.upper()
    records = []

    for v in vcf:
        info = v.INFO
        geneinfo = info.get("GENEINFO")
        if not geneinfo:
            continue

        # Parse e.g. "BRCA1:672|BRCA2:675" -> ["BRCA1", "BRCA2"]
        genes = [g.split(":")[0].upper() for g in geneinfo.split("|")]
        if gene_name not in genes:
            continue

        rs_raw = info.get("RS")
        rs_prefixed = None
        if rs_raw:
            # Sometimes RS may be a list-like string: "80357065,12345"
            tokens = [t.strip() for t in str(rs_raw).replace("|", ",").split(",") if t.strip()]
            rs_prefixed = [f"rs{t}" for t in tokens if t.isdigit()]
            if len(rs_prefixed) == 1:
                rs_prefixed = rs_prefixed[0]

        rec = {
            "CHROM": v.CHROM,
            "POS": v.POS,
            "ID": v.ID,                 # ClinVar internal variant ID
            "REF": v.REF,
            "ALT": v.ALT[0] if v.ALT else None,
            "RS": rs_raw,
            "RS_prefixed": rs_prefixed,
            "CLNSIG": info.get("CLNSIG"),
            "GENEINFO": geneinfo,
            "CLNVC": info.get("CLNVC"),
            "CLNREVSTAT": info.get("CLNREVSTAT"),
            "ORIGIN": info.get("ORIGIN"),
            "ALLELEID": info.get("ALLELEID"),
        }
        records.append(rec)

        if max_rows and len(records) >= max_rows:
            break

    return pd.DataFrame(records)


In [10]:
# All observed ClinVar classifications (CLNSIG) in your dataset
ALL_CLINSIG = [
    'Benign',
    'Pathogenic',
    'Uncertain_significance',
    'Likely_benign',
    'Conflicting_classifications_of_pathogenicity',
    'Benign/Likely_benign',
    'Likely_pathogenic',
    'Pathogenic/Likely_pathogenic',
    'not_provided',
    'no_classification_for_the_single_variant',
    'no_classifications_from_unflagged_records',
]

In [11]:
import pandas as pd

# Default variant classes (SNP-array–friendly)
# UPDATED: Added "Duplication" since it can appear as DD in genotyping data
DEFAULT_ALLOWED_TYPES = [
    "single_nucleotide_variant",
    "Indel",
    "Deletion",
    "Insertion",
    "Duplication",  # Added - can show as DD in SNP arrays
]

# Default clinical significance classes (disease-causing)
DEFAULT_CLINSIG = [
    "Pathogenic",
    "Likely_pathogenic",
    "Pathogenic/Likely_pathogenic",
]

# Default high-confidence review statuses
DEFAULT_REVSTAT = [
    "practice_guideline",
    "reviewed_by_expert_panel",
]

def clinvar_df_for_gene_filtered(
    vcf_path: str,
    gene_name: str,
    allowed_types: list[str] | None = None,
    allowed_clnsig: list[str] | None = None,
    allowed_revstat: list[str] | None = None,
    max_rows: int | None = None,
    case_insensitive: bool = True,
    dropna_clnvc: bool = True,
    dropna_clnsig: bool = True,
    dropna_revstat: bool = True,
) -> pd.DataFrame:
    """
    Call `clinvar_df_for_gene` and keep only rows whose CLNVC, CLNSIG, and CLNREVSTAT
    match the allowed lists.

    Parameters
    ----------
    vcf_path : str
        Path to ClinVar VCF/.vcf.gz.
    gene_name : str
        Gene symbol to match (e.g., "TP53").
    allowed_types : list[str] | None
        Variant classes to keep (CLNVC values). If None, uses DEFAULT_ALLOWED_TYPES.
    allowed_clnsig : list[str] | None
        Clinical significance categories to keep (CLNSIG values).
        If None, uses DEFAULT_CLINSIG.
    allowed_revstat : list[str] | None
        Review statuses to keep (CLNREVSTAT values).
        If None, uses DEFAULT_REVSTAT (practice_guideline, reviewed_by_expert_panel).
    max_rows : int | None
        Passed through to clinvar_df_for_gene for preview/testing.
    case_insensitive : bool
        If True, compare values case-insensitively.
    dropna_clnvc : bool
        If True, drop rows where CLNVC is NA before filtering.
    dropna_clnsig : bool
        If True, drop rows where CLNSIG is NA before filtering.
    dropna_revstat : bool
        If True, drop rows where CLNREVSTAT is NA before filtering.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame.
    """
    allowed_types = allowed_types or DEFAULT_ALLOWED_TYPES
    allowed_clnsig = allowed_clnsig or DEFAULT_CLINSIG
    allowed_revstat = allowed_revstat or DEFAULT_REVSTAT

    # You must have defined `clinvar_df_for_gene(vcf_path, gene_name, max_rows)` elsewhere.
    df = clinvar_df_for_gene(vcf_path, gene_name, max_rows=max_rows)

    # Drop NAs first (only if those columns exist)
    if dropna_clnvc and "CLNVC" in df.columns:
        df = df.dropna(subset=["CLNVC"])
    if dropna_clnsig and "CLNSIG" in df.columns:
        df = df.dropna(subset=["CLNSIG"])
    if dropna_revstat and "CLNREVSTAT" in df.columns:
        df = df.dropna(subset=["CLNREVSTAT"])

    # Build masks
    if case_insensitive:
        # Normalize allowed lists to lowercase sets
        type_norm = {t.lower() for t in allowed_types}
        sig_norm = {s.lower() for s in allowed_clnsig}
        rev_norm = {r.lower() for r in allowed_revstat}

        mask_type = df["CLNVC"].astype(str).str.lower().isin(type_norm) if "CLNVC" in df.columns else True
        mask_sig  = df["CLNSIG"].astype(str).str.lower().isin(sig_norm) if "CLNSIG" in df.columns else True
        mask_rev  = df["CLNREVSTAT"].astype(str).str.lower().isin(rev_norm) if "CLNREVSTAT" in df.columns else True
    else:
        mask_type = df["CLNVC"].isin(allowed_types) if "CLNVC" in df.columns else True
        mask_sig  = df["CLNSIG"].isin(allowed_clnsig) if "CLNSIG" in df.columns else True
        mask_rev  = df["CLNREVSTAT"].isin(allowed_revstat) if "CLNREVSTAT" in df.columns else True

    df_filtered = df[mask_type & mask_sig & mask_rev].reset_index(drop=True)
    return df_filtered

In [12]:
df_brca1 = clinvar_df_for_gene_filtered(
    vcf_path=vcf_path,
    gene_name="BRCA1",
    allowed_types=["single_nucleotide_variant"],
    allowed_clnsig=["Pathogenic"],
    allowed_revstat=["practice_guideline", "reviewed_by_expert_panel"]
)
print(f"Found {len(df_brca1)} BRCA1 variants")

Found 503 BRCA1 variants


In [13]:
df_brca1

Unnamed: 0,CHROM,POS,ID,REF,ALT,RS,RS_prefixed,CLNSIG,GENEINFO,CLNVC,CLNREVSTAT,ORIGIN,ALLELEID
0,17,43045711,55630,G,C,80357336,rs80357336,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70297
1,17,43045711,55629,G,T,80357336,rs80357336,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70296
2,17,43045728,266562,G,A,886040303,rs886040303,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,261566
3,17,43045729,55622,G,T,397509295,rs397509295,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70289
4,17,43045734,55620,G,A,80356873,rs80356873,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70287
...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,17,43124042,55638,G,A,397509299,rs397509299,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70305
499,17,43124044,37664,A,G,80356929,rs80356929,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,46220
500,17,43124063,54902,G,A,80357134,rs80357134,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,69569
501,17,43124089,55746,A,C,397509332,rs397509332,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70413


In [14]:
import pandas as pd

def count_clnrevstat(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count every unique value in the CLNREVSTAT column of a ClinVar DataFrame.
    """
    if "CLNREVSTAT" not in df.columns:
        raise ValueError("Column 'CLNREVSTAT' not found in DataFrame")

    counts = df["CLNREVSTAT"].value_counts(dropna=False).reset_index()
    counts.columns = ["CLNREVSTAT", "count"]
    return counts


def count_clnsig(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count every unique value in the CLNSIG column of a ClinVar DataFrame.
    """
    if "CLNSIG" not in df.columns:
        raise ValueError("Column 'CLNSIG' not found in DataFrame")

    counts = df["CLNSIG"].value_counts(dropna=False).reset_index()
    counts.columns = ["CLNSIG", "count"]
    return counts


def count_clnvc(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count every unique value in the CLNVC (variant type) column of a ClinVar DataFrame.
    """
    if "CLNVC" not in df.columns:
        raise ValueError("Column 'CLNVC' not found in DataFrame")

    counts = df["CLNVC"].value_counts(dropna=False).reset_index()
    counts.columns = ["CLNVC", "count"]
    return counts


In [15]:
count_clnrevstat(df_brca1)

Unnamed: 0,CLNREVSTAT,count
0,reviewed_by_expert_panel,503


In [16]:
count_clnsig(df_brca1)

Unnamed: 0,CLNSIG,count
0,Pathogenic,503


In [17]:
count_clnvc(df_brca1)

Unnamed: 0,CLNVC,count
0,single_nucleotide_variant,503


In [18]:
def summarize_clinvar_counts(df: pd.DataFrame) -> None:
    """
    Print counts for CLNREVSTAT, CLNSIG, and CLNVC columns
    from a ClinVar DataFrame.
    """
    def _count_column(df, colname):
        if colname not in df.columns:
            print(f"⚠️ Column '{colname}' not found in DataFrame.\n")
            return pd.DataFrame()
        counts = df[colname].value_counts(dropna=False).reset_index()
        counts.columns = [colname, "count"]
        return counts

    print("=== CLNREVSTAT (Review Status) ===")
    print(_count_column(df, "CLNREVSTAT").to_string(index=False))
    print("\n")

    print("=== CLNSIG (Clinical Significance) ===")
    print(_count_column(df, "CLNSIG").to_string(index=False))
    print("\n")

    print("=== CLNVC (Variant Class) ===")
    print(_count_column(df, "CLNVC").to_string(index=False))
    print("\n")


In [19]:
summarize_clinvar_counts(df_brca1)

=== CLNREVSTAT (Review Status) ===
              CLNREVSTAT  count
reviewed_by_expert_panel    503


=== CLNSIG (Clinical Significance) ===
    CLNSIG  count
Pathogenic    503


=== CLNVC (Variant Class) ===
                    CLNVC  count
single_nucleotide_variant    503




In [20]:
import pandas as pd

def export_clinvar_tsv(df: pd.DataFrame, out_path: str = "./clinvar_export.tsv") -> pd.DataFrame:
    """
    Export a ClinVar gene DataFrame to a simplified TSV format similar to genotype tables.

    Output columns:
        rsid, gene, chromosome, position, ref, alt, clnrevstat, clnsig, clnvc

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame from `clinvar_df_for_gene` or similar.
    out_path : str
        Output path for the TSV file.

    Returns
    -------
    pd.DataFrame
        The subset DataFrame that was written to file.
    """
    # Flexible field mapping
    colmap = {
        "RS_prefixed": "rsid",
        "GENEINFO": "gene",
        "CHROM": "chromosome",
        "POS": "position",
        "REF": "ref",
        "ALT": "alt",
        "CLNREVSTAT": "clnrevstat",
        "CLNSIG": "clnsig",
        "CLNVC": "clnvc",
    }

    # Ensure all columns exist before mapping
    missing = [c for c in colmap if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns for export: {missing}")

    export_df = df[list(colmap.keys())].rename(columns=colmap)

    # Optional cleanup for readability
    export_df["gene"] = export_df["gene"].str.split(":").str[0]  # strip gene ID part
    export_df = export_df.fillna("")  # no NaNs in TSV

    export_df.to_csv(out_path, sep="\t", index=False)
    print(f"[clinvar] exported {len(export_df):,} rows to {out_path}")

    return export_df


In [21]:
df_brca1 = clinvar_df_for_gene_filtered(
    vcf_path=vcf_path,
    gene_name="BRCA1",
    allowed_types=["single_nucleotide_variant"],
    allowed_clnsig=["Pathogenic"],
    allowed_revstat=["practice_guideline", "reviewed_by_expert_panel"]
)
print(f"Found {len(df_brca1)} BRCA1 variants (now including Duplications)")

Found 503 BRCA1 variants (now including Duplications)


In [22]:
df_brca1

Unnamed: 0,CHROM,POS,ID,REF,ALT,RS,RS_prefixed,CLNSIG,GENEINFO,CLNVC,CLNREVSTAT,ORIGIN,ALLELEID
0,17,43045711,55630,G,C,80357336,rs80357336,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70297
1,17,43045711,55629,G,T,80357336,rs80357336,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70296
2,17,43045728,266562,G,A,886040303,rs886040303,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,261566
3,17,43045729,55622,G,T,397509295,rs397509295,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70289
4,17,43045734,55620,G,A,80356873,rs80356873,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70287
...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,17,43124042,55638,G,A,397509299,rs397509299,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70305
499,17,43124044,37664,A,G,80356929,rs80356929,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,46220
500,17,43124063,54902,G,A,80357134,rs80357134,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,69569
501,17,43124089,55746,A,C,397509332,rs397509332,Pathogenic,BRCA1:672,single_nucleotide_variant,reviewed_by_expert_panel,1,70413


In [23]:
export_df = export_clinvar_tsv(df_brca1, "./work/brca1_clinvar.tsv")

[clinvar] exported 503 rows to ./work/brca1_clinvar.tsv


In [24]:
summarize_clinvar_counts(df_brca1)

=== CLNREVSTAT (Review Status) ===
              CLNREVSTAT  count
reviewed_by_expert_panel    503


=== CLNSIG (Clinical Significance) ===
    CLNSIG  count
Pathogenic    503


=== CLNVC (Variant Class) ===
                    CLNVC  count
single_nucleotide_variant    503




In [25]:
# 1) Either REF or ALT has length > 1
df_brca1[(df_brca1['REF'].str.len().gt(1).fillna(False)) | (df_brca1['ALT'].str.len().gt(1).fillna(False))]

Unnamed: 0,CHROM,POS,ID,REF,ALT,RS,RS_prefixed,CLNSIG,GENEINFO,CLNVC,CLNREVSTAT,ORIGIN,ALLELEID


In [26]:
# 2) Both > 1 AND the same length
df_brca1[
    (df_brca1['REF'].str.len().gt(1).fillna(False)) &
    (df_brca1['ALT'].str.len().gt(1).fillna(False)) &
    (df_brca1['REF'].str.len() == df_brca1['ALT'].str.len())
]

Unnamed: 0,CHROM,POS,ID,REF,ALT,RS,RS_prefixed,CLNSIG,GENEINFO,CLNVC,CLNREVSTAT,ORIGIN,ALLELEID


In [28]:
df_brca2 = clinvar_df_for_gene_filtered(
    vcf_path=vcf_path,
    gene_name="BRCA2",
    allowed_types=["single_nucleotide_variant"],
    allowed_clnsig=["Pathogenic"],
    allowed_revstat=["practice_guideline", "reviewed_by_expert_panel"]
)
print(f"Found {len(df_brca1)} BRCA1 variants (now including Duplications)")

Found 503 BRCA1 variants (now including Duplications)


In [29]:
df_brca2

Unnamed: 0,CHROM,POS,ID,REF,ALT,RS,RS_prefixed,CLNSIG,GENEINFO,CLNVC,CLNREVSTAT,ORIGIN,ALLELEID
0,13,32316463,51579,G,A,80358650,rs80358650,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,1,66247
1,13,32316470,51063,G,T,397507571,rs397507571,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,1,65731
2,13,32316497,51527,G,T,80358622,rs80358622,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,1,66195
3,13,32316528,52161,G,T,81002796,rs81002796,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,1,66829
4,13,32319080,52285,T,A,397507902,rs397507902,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,1,66953
...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,13,32398252,267170,C,T,886040849,rs886040849,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,1,261555
620,13,32398349,267174,T,A,886040852,rs886040852,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,1,261558
621,13,32398396,52911,C,T,80359247,rs80359247,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,3,67579
622,13,32398437,267177,C,A,4987049,rs4987049,Pathogenic,BRCA2:675,single_nucleotide_variant,reviewed_by_expert_panel,1,261561


In [30]:
# 2) Both > 1 AND the same length
df_brca2[
    (df_brca2['REF'].str.len().gt(1).fillna(False)) &
    (df_brca2['ALT'].str.len().gt(1).fillna(False)) &
    (df_brca2['REF'].str.len() == df_brca2['ALT'].str.len())
]

Unnamed: 0,CHROM,POS,ID,REF,ALT,RS,RS_prefixed,CLNSIG,GENEINFO,CLNVC,CLNREVSTAT,ORIGIN,ALLELEID


In [31]:
summarize_clinvar_counts(df_brca2)

=== CLNREVSTAT (Review Status) ===
              CLNREVSTAT  count
reviewed_by_expert_panel    624


=== CLNSIG (Clinical Significance) ===
    CLNSIG  count
Pathogenic    624


=== CLNVC (Variant Class) ===
                    CLNVC  count
single_nucleotide_variant    624




In [32]:
export_df = export_clinvar_tsv(df_brca2, "./work/brca2_clinvar.tsv")

[clinvar] exported 624 rows to ./work/brca2_clinvar.tsv


In [None]:
def search_rsid_in_clinvar(vcf_path: str, rsid: str) -> pd.DataFrame:
    """
    Search for all variants with a specific rsid in ClinVar VCF.
    
    Parameters
    ----------
    vcf_path : str
        Path to ClinVar VCF file
    rsid : str
        rsid to search for (with or without 'rs' prefix)
        
    Returns
    -------
    pd.DataFrame
        All matching records from ClinVar
    """
    # Remove 'rs' prefix if present for the RS field comparison
    rsid_num = rsid.replace('rs', '') if rsid.startswith('rs') else rsid
    
    vcf = VCF(vcf_path)
    records = []
    
    for v in vcf:
        info = v.INFO
        rs_raw = info.get("RS")
        
        if rs_raw:
            # RS can be comma-separated list
            rs_list = [t.strip() for t in str(rs_raw).replace("|", ",").split(",") if t.strip()]
            if rsid_num in rs_list:
                rec = {
                    "CHROM": v.CHROM,
                    "POS": v.POS,
                    "ID": v.ID,
                    "REF": v.REF,
                    "ALT": v.ALT[0] if v.ALT else None,
                    "RS": rs_raw,
                    "CLNSIG": info.get("CLNSIG"),
                    "GENEINFO": info.get("GENEINFO"),
                    "CLNVC": info.get("CLNVC"),
                    "CLNREVSTAT": info.get("CLNREVSTAT"),
                    "ORIGIN": info.get("ORIGIN"),
                    "ALLELEID": info.get("ALLELEID"),
                }
                records.append(rec)
    
    df = pd.DataFrame(records)
    if not df.empty:
        print(f"Found {len(df)} variant(s) for {rsid}:")
        for _, row in df.iterrows():
            print(f"  - chr{row['CHROM']}:{row['POS']} {row['REF']}->{row['ALT']} "
                  f"[{row['CLNSIG']}] [{row['CLNVC']}]")
    else:
        print(f"No variants found for {rsid}")
    
    return df

In [None]:
# Example: Search for rs80357336 in ClinVar
# This shows there are two different alt alleles (G->C and G->T) at the same position
df_rs80357336 = search_rsid_in_clinvar(vcf_path, "rs80357336")
df_rs80357336

In [None]:
# rs80357629
df_rs80357629 = search_rsid_in_clinvar(vcf_path, "rs80357629")
df_rs80357629

In [None]:
def search_position_in_clinvar(vcf_path: str, chromosome: str, position: int, show_all_fields: bool = False) -> pd.DataFrame:
    """
    Search for all variants at a specific chromosome:position in ClinVar VCF.
    This includes variants WITH and WITHOUT rsids.
    
    Parameters
    ----------
    vcf_path : str
        Path to ClinVar VCF file
    chromosome : str
        Chromosome (e.g., "17" or "chr17")
    position : int
        Genomic position
    show_all_fields : bool
        If True, show all INFO fields (useful for debugging)
        
    Returns
    -------
    pd.DataFrame
        All matching records from ClinVar
    """
    # Normalize chromosome (remove 'chr' prefix if present)
    chrom = chromosome.replace('chr', '') if chromosome.startswith('chr') else chromosome
    
    vcf = VCF(vcf_path)
    records = []
    
    for v in vcf:
        if v.CHROM == chrom and v.POS == position:
            info = v.INFO
            rs_raw = info.get("RS")
            
            # Add 'rs' prefix to RS field
            rs_prefixed = None
            if rs_raw:
                tokens = [t.strip() for t in str(rs_raw).replace("|", ",").split(",") if t.strip()]
                rs_prefixed = [f"rs{t}" for t in tokens if t.isdigit()]
                if len(rs_prefixed) == 1:
                    rs_prefixed = rs_prefixed[0]
                elif len(rs_prefixed) > 1:
                    rs_prefixed = ",".join(rs_prefixed)
                    
            rec = {
                "CHROM": v.CHROM,
                "POS": v.POS,
                "ID": v.ID,  # ClinVar internal ID
                "REF": v.REF,
                "ALT": v.ALT[0] if v.ALT else None,
                "RS": rs_raw,
                "RS_prefixed": rs_prefixed,
                "CLNSIG": info.get("CLNSIG"),
                "GENEINFO": info.get("GENEINFO"),
                "CLNVC": info.get("CLNVC"),
                "CLNREVSTAT": info.get("CLNREVSTAT"),
                "ORIGIN": info.get("ORIGIN"),
                "ALLELEID": info.get("ALLELEID"),
                "HAS_RSID": rs_raw is not None and rs_raw != "",
            }
            
            # Add all INFO fields if requested
            if show_all_fields:
                for key in info:
                    if key not in rec:
                        rec[f"INFO_{key}"] = info.get(key)
                        
            records.append(rec)
    
    df = pd.DataFrame(records)
    if not df.empty:
        print(f"Found {len(df)} variant(s) at chr{chrom}:{position}:")
        
        # Separate variants with and without rsids
        with_rsid = df[df['HAS_RSID'] == True]
        without_rsid = df[df['HAS_RSID'] == False]
        
        if len(with_rsid) > 0:
            print(f"\nVariants WITH rsid ({len(with_rsid)}):")
            for _, row in with_rsid.iterrows():
                rsid_str = row['RS_prefixed'] if row['RS_prefixed'] else f"RS:{row['RS']}"
                print(f"  - {rsid_str} {row['REF']}->{row['ALT']} "
                      f"[{row['CLNSIG']}] [{row['CLNVC']}] [{row['CLNREVSTAT']}]")
        
        if len(without_rsid) > 0:
            print(f"\nVariants WITHOUT rsid ({len(without_rsid)}):")
            for _, row in without_rsid.iterrows():
                print(f"  - ClinVarID:{row['ID']} {row['REF']}->{row['ALT']} "
                      f"[{row['CLNSIG']}] [{row['CLNVC']}] [{row['CLNREVSTAT']}]")
                
        # Show summary
        print(f"\nSummary:")
        print(f"  Total variants: {len(df)}")
        print(f"  With rsid: {len(with_rsid)}")
        print(f"  Without rsid: {len(without_rsid)}")
        print(f"  Unique variant types: {df['CLNVC'].dropna().unique().tolist()}")
        print(f"  Unique clinical significance: {df['CLNSIG'].dropna().unique().tolist()}")
    else:
        print(f"No variants found at chr{chrom}:{position}")
    
    return df

In [None]:
# Example 1: Search for all variants at position 17:43045711
# This will show ALL variants at this position, separating those with and without rsids
df_pos = search_position_in_clinvar(vcf_path, "17", 43045711)

# Show the dataframe
print("\nDataFrame contents:")
df_pos

In [None]:
# Example 3: Check if there are deletion variants at position 43045711
# Since carika.txt shows DD (deletion) at this position
df_deletions = df_pos[df_pos['CLNVC'].str.contains('Deletion|deletion', na=False, case=False)]
if len(df_deletions) > 0:
    print(f"Found {len(df_deletions)} deletion variant(s) at position 43045711:")
    for _, row in df_deletions.iterrows():
        rsid_str = row['RS_prefixed'] if row['RS_prefixed'] else f"ClinVarID:{row['ID']}"
        print(f"  {rsid_str}: {row['REF']}->{row['ALT']} [{row['CLNSIG']}]")
else:
    print("No deletion variants found at position 43045711 in ClinVar")
    print("This explains why the DD genotype from carika.txt doesn't match any variants")

In [None]:
def find_variants_by_type_in_gene(
    vcf_path: str,
    gene_name: str,
    variant_type: str,
    max_rows: int | None = None
) -> pd.DataFrame:
    """
    Find all variants of a specific type in a gene.
    
    Parameters
    ----------
    vcf_path : str
        Path to ClinVar VCF file
    gene_name : str
        Gene name (e.g., "BRCA1")
    variant_type : str
        Variant type to search for (e.g., "Deletion", "Insertion", "Duplication")
    max_rows : int | None
        Optional limit for testing
        
    Returns
    -------
    pd.DataFrame
        Matching variants
    """
    vcf = VCF(vcf_path)
    gene_name = gene_name.upper()
    records = []
    
    for v in vcf:
        info = v.INFO
        
        # Check gene
        geneinfo = info.get("GENEINFO", "")
        if gene_name not in geneinfo.upper():
            continue
            
        # Check variant type
        clnvc = info.get("CLNVC", "")
        if variant_type.lower() not in clnvc.lower():
            continue
            
        rs_raw = info.get("RS")
        rs_prefixed = None
        if rs_raw:
            tokens = [t.strip() for t in str(rs_raw).replace("|", ",").split(",") if t.strip()]
            rs_prefixed = [f"rs{t}" for t in tokens if t.isdigit()]
            if len(rs_prefixed) == 1:
                rs_prefixed = rs_prefixed[0]
            elif len(rs_prefixed) > 1:
                rs_prefixed = ",".join(rs_prefixed)
                
        rec = {
            "CHROM": v.CHROM,
            "POS": v.POS,
            "ID": v.ID,
            "REF": v.REF,
            "ALT": v.ALT[0] if v.ALT else None,
            "REF_LEN": len(v.REF),
            "ALT_LEN": len(v.ALT[0]) if v.ALT and v.ALT[0] else 0,
            "RS": rs_raw,
            "RS_prefixed": rs_prefixed,
            "CLNSIG": info.get("CLNSIG"),
            "CLNVC": clnvc,
            "CLNREVSTAT": info.get("CLNREVSTAT"),
            "HAS_RSID": rs_raw is not None and rs_raw != "",
        }
        records.append(rec)
        
        if max_rows and len(records) >= max_rows:
            break
            
    df = pd.DataFrame(records)
    
    if not df.empty:
        print(f"Found {len(df)} {variant_type} variant(s) in {gene_name}")
        print(f"  With rsid: {df['HAS_RSID'].sum()}")
        print(f"  Without rsid: {(~df['HAS_RSID']).sum()}")
        
        # Show size distribution for deletions
        if "deletion" in variant_type.lower():
            df['deletion_size'] = df['REF_LEN'] - df['ALT_LEN']
            size_dist = df['deletion_size'].value_counts().head(10)
            print(f"\nDeletion sizes (top 10):")
            for size, count in size_dist.items():
                print(f"  {size} bp: {count} variant(s)")
                
        # Show clinical significance breakdown
        clnsig_counts = df['CLNSIG'].value_counts()
        print(f"\nClinical significance:")
        for sig, count in clnsig_counts.items():
            print(f"  {sig}: {count}")
    else:
        print(f"No {variant_type} variants found in {gene_name}")
        
    return df

In [None]:
# Example 4: Find all deletion variants in BRCA1 to see what might be missing
df_brca1_deletions = find_variants_by_type_in_gene(
    vcf_path,
    gene_name="BRCA1",
    variant_type="Deletion",
    max_rows=100  # Limit for demo
)

# Check if any are at or near position 43045711
near_position = df_brca1_deletions[
    (df_brca1_deletions['POS'] >= 43045700) & 
    (df_brca1_deletions['POS'] <= 43045720)
]
if len(near_position) > 0:
    print(f"\nDeletions near position 43045711:")
    for _, row in near_position.iterrows():
        rsid_str = row['RS_prefixed'] if row['RS_prefixed'] else f"ClinVarID:{row['ID']}"
        print(f"  {rsid_str} at {row['POS']}: {row['REF'][:10]}... -> {row['ALT'][:10]}... [{row['CLNSIG']}]")
else:
    print(f"\nNo deletions found near position 43045711 in ClinVar BRCA1 data")

In [33]:
df_atm = clinvar_df_for_gene_filtered(
    vcf_path=vcf_path,
    gene_name="ATM",
    allowed_types=["single_nucleotide_variant"],
    allowed_clnsig=["Pathogenic"],
    allowed_revstat=["practice_guideline", "reviewed_by_expert_panel"]
)
print(f"Found {len(df_brca1)} ATM variants (now including Duplications)")

Found 503 ATM variants (now including Duplications)


In [36]:
df_atm

Unnamed: 0,CHROM,POS,ID,REF,ALT,RS,RS_prefixed,CLNSIG,GENEINFO,CLNVC,CLNREVSTAT,ORIGIN,ALLELEID
0,11,108227626,187275,T,C,786203606.0,rs786203606,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,1,183068
1,11,108227691,232248,C,T,746235533.0,rs746235533,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,3,233887
2,11,108235669,231535,G,A,747855862.0,rs747855862,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,1,233910
3,11,108235805,634428,G,A,876658159.0,rs876658159,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,5,622397
4,11,108244873,216024,C,T,772821016.0,rs772821016,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,1,212837
5,11,108250861,233553,C,T,876660485.0,rs876660485,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,1,233978
6,11,108250907,453367,T,G,1555070980.0,rs1555070980,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,1,461289
7,11,108251073,220555,G,T,772926890.0,rs772926890,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,1,222042
8,11,108257479,4056347,A,T,,,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,1,4170285
9,11,108259022,216021,C,T,780619951.0,rs780619951,Pathogenic,ATM:472,single_nucleotide_variant,reviewed_by_expert_panel,19,212851


In [37]:
export_df = export_clinvar_tsv(df_atm, "./work/atm_clinvar.tsv")

[clinvar] exported 34 rows to ./work/atm_clinvar.tsv
