In [1]:
#!pip install cyvcf2


In [2]:
from cyvcf2 import VCF
import pandas as pd


In [3]:
# Column names given to the "|"-separated pieces of one snpEff ANN entry.
# The order is significant: parse_snpEff_ann pairs these names with the
# split pieces by position.
ann_fields = [
    "Allele",
    "Annotation",
    "Annotation_Impact",
    "Gene_Name",
    "Gene_ID",
    "Feature_Type",
    "Feature_ID",
    "Transcript_Biotype",
    "Rank",
    "HGVS.c",
    "HGVS.p",
    "cDNA.pos",
    "CDS.pos",
    "AA.pos",
    "Distance",
    "Errors_Warnings_Info",
]


In [None]:
def parse_snpEff_ann(vcf_path):
    """Flatten the snpEff ``ANN`` INFO field of a VCF into a DataFrame.

    Every record that carries an ``ANN`` value contributes one row per
    comma-separated annotation entry. Each entry is split on ``|`` and its
    pieces are paired, by position, with the names in the module-level
    ``ann_fields`` list; entries with fewer pieces are padded with ``""``.
    Records without ``ANN`` are skipped.

    Parameters
    ----------
    vcf_path : str
        Path passed to ``cyvcf2.VCF``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CHROM, POS, REF, ALT, QUAL, DP`` followed by the names in
        ``ann_fields``. The columns are present even when no annotation was
        found, so downstream column look-ups work on an empty result.

        ``ALT`` holds the allele the annotation refers to when the entry's
        ``Allele`` sub-field matches one of the record's ALT alleles;
        otherwise it falls back to the first ALT allele, or ``None`` when
        the record has no ALT allele at all.
    """
    site_columns = ["CHROM", "POS", "REF", "ALT", "QUAL", "DP"]
    rows = []

    vcf = VCF(vcf_path)
    try:
        for var in vcf:
            anns = var.INFO.get("ANN")
            if not anns:
                # No (or empty) ANN: nothing to report for this record.
                continue

            # ALT is empty for records without an alternate allele; guard
            # against that instead of indexing blindly.
            alts = list(var.ALT)
            first_alt = alts[0] if alts else None

            site_values = {
                "CHROM": var.CHROM,
                "POS": var.POS,
                "REF": var.REF,
                "ALT": first_alt,
                "QUAL": var.QUAL,
                "DP": var.INFO.get("DP"),
            }

            # ANN can have multiple comma-separated annotations
            for ann in anns.split(","):
                fields = ann.split("|")
                fields += [""] * (len(ann_fields) - len(fields))

                row = dict(site_values)
                row.update(zip(ann_fields, fields))

                # At multi-allelic sites each annotation names its own
                # allele; report that one rather than always the first ALT.
                allele = row.get("Allele")
                if allele in alts:
                    row["ALT"] = allele

                rows.append(row)
    finally:
        # Release the underlying file handle even if parsing fails midway.
        vcf.close()

    return pd.DataFrame(rows, columns=site_columns + list(ann_fields))


In [None]:
# Parse the snpEff-annotated VCF into a flat table and preview the first rows.
df = parse_snpEff_ann(vcf_path="all_annotated_sample.vcf")
df.head(n=5)


In [None]:
# Keep only the annotations whose Annotation_Impact is MODERATE or HIGH.
impact_df = df.loc[df["Annotation_Impact"].isin(["MODERATE", "HIGH"])]
impact_df.head(n=5)

In [None]:
# Count mutations per gene.
# df holds one row per snpEff annotation, so a single variant shows up once
# for every annotation it received (e.g. once per affected transcript).
# Collapse to one row per (variant, gene) first so that a gene is not
# over-counted just because one variant has several annotations on it.
df.drop_duplicates(subset=["CHROM", "POS", "REF", "ALT", "Gene_ID"])["Gene_ID"].value_counts()


In [None]:
# Allele = the allele this annotation refers to (identifies which ALT is meant at multi-allelic sites).
# Annotation = functional effect (e.g. missense_variant, synonymous_variant, stop_gained, intron_variant, upstream_gene_variant).
# Annotation_Impact = impact category (HIGH, MODERATE, LOW, MODIFIER).
# Gene_Name = gene name; may match the GFF attributes (e.g. Bma002878.1).
# Gene_ID = gene identifier from the GFF attributes.
# Feature_Type = type of sequence feature affected (transcript, mRNA, CDS, exon).
# Feature_ID = ID of the transcript/feature affected (e.g. Bma002878.1).
# Transcript_Biotype = transcript class from the GFF (protein_coding, lncRNA, pseudogene).
# Rank = exon or intron rank over the total (<exon_number>/<total_exons>; e.g. 3/7 means the variant occurs in exon 3 of 7).
# HGVS.c = HGVS notation at the coding DNA level.
# HGVS.p = HGVS notation at the protein level.
# cDNA.pos = position in the cDNA sequence (including UTRs). Format: <pos>/<length>, e.g. 678/1450.
# CDS.pos = position within the coding sequence only. If the mutation is outside the CDS (e.g. UTR, intron), this is empty.
# AA.pos = position within the protein sequence. Empty for noncoding variants.
# Distance = distance to the nearest feature if the variant is not inside a gene/transcript.
# Errors_Warnings_Info = any warnings, errors, or special flags snpEff adds (WARNING_TRANSCRIPT_NO_START_CODON, WARNING_TRANSCRIPT_INCOMPLETE, ERROR_CHROMOSOME_NOT_FOUND, INFO_REALIGN_3_PRIME).

