In [1]:
#!pip install cyvcf2


In [13]:
from cyvcf2 import VCF
import pandas as pd


In [14]:
ann_fields = [
    "Allele", "Annotation", "Annotation_Impact", "Gene_Name",
    "Gene_ID", "Feature_Type", "Feature_ID", "Transcript_Biotype",
    "Rank", "HGVS.c", "HGVS.p", "cDNA.pos", "CDS.pos", "AA.pos",
    "Distance", "Errors_Warnings_Info"
]


In [15]:
def parse_snpEff_ann(vcf_path):
    vcf = VCF(vcf_path)
    rows = []

    for var in vcf:
        chrom = var.CHROM
        pos = var.POS
        ref = var.REF
        alt = var.ALT[0]
        qual = var.QUAL

        info_dp = var.INFO.get("DP")

        anns = var.INFO.get("ANN")
        if anns is None:
            continue

        # ANN can have multiple comma-separated annotations
        for ann in anns.split(","):
            fields = ann.split("|")
            fields = fields + [""] * (len(ann_fields) - len(fields))

            row = {
                "CHROM": chrom,
                "POS": pos,
                "REF": ref,
                "ALT": alt,
                "QUAL": qual,
                "DP": info_dp,
            }

            for k, v in zip(ann_fields, fields):
                row[k] = v

            rows.append(row)

    return pd.DataFrame(rows)


In [16]:
df = parse_snpEff_ann("all_annotated_sample.vcf")
df.head()


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,DP,Allele,Annotation,Annotation_Impact,Gene_Name,...,Feature_ID,Transcript_Biotype,Rank,HGVS.c,HGVS.p,cDNA.pos,CDS.pos,AA.pos,Distance,Errors_Warnings_Info
0,Contig1453,127,T,TGG,170.490005,13,TGG,frameshift_variant,HIGH,Bma000001.1,...,Bma000001.1,protein_coding,1/1,c.127_128dupGG,p.Asn44fs,129/954,129/954,43/317,,WARNING_TRANSCRIPT_NO_START_CODON&INFO_REALIGN...
1,Contig1453,127,T,TGG,170.490005,13,TGG,downstream_gene_variant,MODIFIER,Bma000002.1,...,Bma000002.1,protein_coding,,c.*1976_*1977dupCC,,,,,1977.0,
2,Contig1453,127,T,TGG,170.490005,13,TGG,downstream_gene_variant,MODIFIER,Bma000003.1,...,Bma000003.1,protein_coding,,c.*3531_*3532dupCC,,,,,3532.0,
3,Contig1453,143,A,C,218.990005,15,C,synonymous_variant,LOW,Bma000001.1,...,Bma000001.1,protein_coding,1/1,c.141A>C,p.Thr47Thr,141/954,141/954,47/317,,WARNING_TRANSCRIPT_NO_START_CODON
4,Contig1453,143,A,C,218.990005,15,C,downstream_gene_variant,MODIFIER,Bma000002.1,...,Bma000002.1,protein_coding,,c.*1962T>G,,,,,1962.0,


In [17]:
# Filter impactful variants
impact_df = df[df["Annotation_Impact"].isin(["MODERATE", "HIGH"])]
impact_df.head()

Unnamed: 0,CHROM,POS,REF,ALT,QUAL,DP,Allele,Annotation,Annotation_Impact,Gene_Name,...,Feature_ID,Transcript_Biotype,Rank,HGVS.c,HGVS.p,cDNA.pos,CDS.pos,AA.pos,Distance,Errors_Warnings_Info
0,Contig1453,127,T,TGG,170.490005,13,TGG,frameshift_variant,HIGH,Bma000001.1,...,Bma000001.1,protein_coding,1/1,c.127_128dupGG,p.Asn44fs,129/954,129/954,43/317,,WARNING_TRANSCRIPT_NO_START_CODON&INFO_REALIGN...
21,Contig1453,217,C,A,288.549988,39,A,missense_variant,MODERATE,Bma000001.1,...,Bma000001.1,protein_coding,1/1,c.215C>A,p.Ala72Glu,215/954,215/954,72/317,,WARNING_TRANSCRIPT_NO_START_CODON
33,Contig1453,231,G,T,151.899994,60,T,stop_gained,HIGH,Bma000001.1,...,Bma000001.1,protein_coding,1/1,c.229G>T,p.Glu77*,229/954,229/954,77/317,,WARNING_TRANSCRIPT_NO_START_CODON
36,Contig1453,232,A,C,151.580002,61,C,missense_variant,MODERATE,Bma000001.1,...,Bma000001.1,protein_coding,1/1,c.230A>C,p.Glu77Ala,230/954,230/954,77/317,,WARNING_TRANSCRIPT_NO_START_CODON
39,Contig1453,236,A,C,305.779999,62,C,missense_variant,MODERATE,Bma000001.1,...,Bma000001.1,protein_coding,1/1,c.234A>C,p.Leu78Phe,234/954,234/954,78/317,,WARNING_TRANSCRIPT_NO_START_CODON


In [18]:
# Count mutations per gene
df["Gene_ID"].value_counts()


Gene_ID
null           75
null.2         75
null.3         75
null-null.2    18
Name: count, dtype: int64

In [None]:
# Annotation = functional effect (missense_variant, synonymous_variant, stop_gained, intron_variant, upstream_gene_variant).
# Annotation_Impact= impact category (HIGH, MODERATE, LOW, MODIFIER)
# Gene_name= might match GFF attributes (ex: Bma002878.1).
# Gene_ID= gene identifier from GFF attributes.
# Feature_Type= type of sequence affected (transcript, mRNA, CDS, exon)
# Feature_ID=ID of transcript/feature affected (Bma002878.1).
# Transcript_Biotype=class transcript GFF (protein_coding, lncRNA, pseudogene). 
# Rank=exons or transcripts (<exon_number>/<total_exons>, ex: the variant occurs in exon 3 of 7).
# HGVS.c=HGVS notation at the coding DNA level.
# HGVS.p=HGVS notation at the protein level.
# cDNA.pos=Position in the cDNA sequence (including UTRs). Format: <pos>/<length>, Example: 678/1450
# CDS.pos=Position within the coding sequence only.If the mutation is outside the CDS (e.g., UTR, intron), this is empty.
# AA.pos=Position within the protein sequence. Empty for noncoding variants.
# Distance=Distance to the nearest feature if the variant is not inside a gene/transcript.
# Errors_Warnings_Info=Any warnings, errors, or special flags snpEff adds (WARNING_TRANSCRIPT_NO_START_CODON, WARNING_TRANSCRIPT_INCOMPLETE, ERROR_CHROMOSOME_NOT_FOUND, INFO_REALIGN_3_PRIME).

