In [None]:
# Ch06-1 Variant Annotation

In [None]:
# Copy over variants file and set up directories
! mkdir -p input
! mkdir -p output
! mkdir -p data
! cp ../Ch05/output/variants.vcf input/ 

In [None]:
# Install cyvcf2
! pip install cyvcf2

In [None]:
# 1. Filter variants based on quality using cyvcf2
from cyvcf2 import VCF, Writer

In [None]:
# Function to filter a vcf based on quality and/or chromosome 
def filter_vcf(input_vcf, output_vcf, min_quality=30, chrom_filter=None):
    """
    Filter variants in a VCF file based on quality and, optionally, chromosome
    Parameters:
        input_vcf (str): Path to the input VCF file
        output_vcf (str): Path to the output filtered VCF file
        min_quality (float): Minimum quality score to retain a variant.
            Variants whose QUAL is missing (None) are kept.
        chrom_filter (list): List of chromosomes to include (e.g., ['chr1', 'chr2']).
            A single chromosome name given as a string is also accepted.
            None or an empty list disables the chromosome filter.
    """
    # Normalise the chromosome filter to a set for exact, constant-time membership tests.
    # A bare string is wrapped so it is matched as one name, not as a substring test.
    if not chrom_filter:
        allowed_chroms = None
    elif isinstance(chrom_filter, str):
        allowed_chroms = {chrom_filter}
    else:
        allowed_chroms = set(chrom_filter)

    # Open the input VCF file
    vcf = VCF(input_vcf)
    try:
        # Create a writer for the output VCF, using the input as the header template
        writer = Writer(output_vcf, vcf)
        try:
            for variant in vcf:
                # Filter based on quality
                if variant.QUAL is not None and variant.QUAL < min_quality:
                    continue
                # Filter based on chromosome
                if allowed_chroms is not None and variant.CHROM not in allowed_chroms:
                    continue
                # Write the variant to the output VCF
                writer.write_record(variant)
        finally:
            # Always close the writer, even if reading or writing failed part-way
            writer.close()
    finally:
        # Always release the reader, even if the writer could not be created
        vcf.close()
    print(f"Filtered VCF written to: {output_vcf}")

In [None]:
# Main function to call our filtering function on an input VCF file and set the filtering criteria
def main():
    """Run filter_vcf on the input VCF with this notebook's filtering criteria."""
    filter_vcf(
        input_vcf="input/variants.vcf",  # Path to the input VCF file
        output_vcf="output/filtered_variants.vcf",  # Path to the output filtered VCF file
        min_quality=30,  # Minimum quality score
        chrom_filter=["NC_000913.3"],  # E. coli chromosome
    )
if __name__ == "__main__":
    main()

In [None]:
## 2. Graph Allele Frequencies using cyvcf2 ##

In [None]:
# Import Libraries #
from cyvcf2 import VCF
import matplotlib.pyplot as plt

In [None]:
# Function to plot allele frequency
def plot_allele_frequency(vcf_file, output_file=None):
    """
    Plot the allele frequency of variants across the genome.

    Allele frequencies are read from the INFO "AF" field; variants without it
    are skipped. Chromosomes are laid end to end along the x-axis in sorted
    name order, each one starting where the previous one's largest observed
    position ends, so points from different chromosomes never overlap.
    Parameters:
        vcf_file (str): Path to the VCF file.
        output_file (str): Path to save the plot (optional). If omitted, the
            plot is shown instead of saved.
    """
    # Initialize lists to store positions and allele frequencies (kept in lockstep)
    chrom_positions = []
    allele_frequencies = []
    # Open the VCF file
    vcf = VCF(vcf_file)
    try:
        # Iterate over variants in the VCF
        for variant in vcf:
            # Extract allele frequency from INFO field
            af = variant.INFO.get("AF")
            if af is None:
                continue
            # Multi-allelic sites carry one frequency per ALT allele; plot each of them
            freqs = af if isinstance(af, (list, tuple)) else (af,)
            for freq in freqs:
                allele_frequencies.append(float(freq))
                chrom_positions.append((variant.CHROM, variant.POS))
    finally:
        # Close the VCF file even if parsing failed part-way
        vcf.close()
    # Largest observed position per chromosome, used as that chromosome's width
    chrom_spans = {}
    for chrom, pos in chrom_positions:
        chrom_spans[chrom] = max(chrom_spans.get(chrom, 0), pos)
    chrom_names = sorted(chrom_spans)
    # Cumulative offsets: a fixed per-chromosome offset would make chromosomes
    # longer than that offset overlap their neighbours on the x-axis
    chrom_offsets = {}
    running_offset = 0
    for chrom in chrom_names:
        chrom_offsets[chrom] = running_offset
        running_offset += chrom_spans[chrom]
    plot_positions = [chrom_offsets[chrom] + pos for chrom, pos in chrom_positions]
    # Ensure x and y have the same length
    assert len(plot_positions) == len(allele_frequencies), "Mismatch between positions and frequencies!"
    # Create the plot
    plt.figure(figsize=(12, 6))
    plt.scatter(plot_positions, allele_frequencies, alpha=0.5, s=10, label="Allele Frequency")
    plt.xlabel("Genomic Position (Chromosomes)")
    plt.ylabel("Allele Frequency")
    plt.title("Allele Frequency Across the Genome")
    # One tick per chromosome, placed at the chromosome's start
    plt.xticks(
        [chrom_offsets[chrom] for chrom in chrom_names],
        labels=chrom_names,
        rotation=45
    )
    plt.grid(True)
    plt.legend()
    # Save or show the plot
    if output_file:
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Plot saved to: {output_file}")
    else:
        plt.show()

In [None]:
# Main function to plot allele frequencies for the input VCF file
def main():
    """Plot allele frequencies for the input VCF and save the figure to the output folder."""
    # Plot allele frequency
    plot_allele_frequency(
        vcf_file="input/variants.vcf",  # Replace with your VCF file
        output_file="output/allele_frequency_plot.png",  # Path to save the plot (optional)
    )
if __name__ == "__main__":
    main()

In [None]:
## 3. Plot variant types with cyvcf2 ##

In [None]:
# Import Libraries #
from cyvcf2 import VCF
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
def categorize_variant(variant):
    """
    Categorize a variant as SNP, insertion, or deletion.

    The classification compares the length of REF with the length of the
    longest ALT allele, so a multi-allelic record is labelled by its longest
    alternate allele only.
    Parameters:
        variant: A cyvcf2.Variant object (anything with REF and ALT attributes).
    Returns:
        str: The variant type: "SNP", "Insertion", "Deletion", or "Other".
            "Other" covers equal-length substitutions longer than one base
            and records that carry no ALT allele.
    """
    # A record with no alternate allele has nothing to compare against;
    # max() over an empty ALT would raise ValueError, so classify it explicitly
    if not variant.ALT:
        return "Other"
    ref_len = len(variant.REF)
    alt_len = max(len(alt) for alt in variant.ALT)
    if ref_len == 1 and alt_len == 1:
        return "SNP"
    elif ref_len < alt_len:
        return "Insertion"
    elif ref_len > alt_len:
        return "Deletion"
    else:
        return "Other"

In [None]:
# Plotting function - build a pie chart showing the percentage of different variant types
def plot_variant_types(vcf_file, output_file=None):
    """
    Draw a pie chart of how the variants in a VCF file split across variant types.
    Parameters:
        vcf_file (str): Path to the input VCF file.
        output_file (str): Path to save the plot (optional). When omitted,
            the chart is shown instead of saved.
    """
    print(f"Processing VCF file: {vcf_file}")
    reader = VCF(vcf_file)
    # Tally one count per record, keyed by the label categorize_variant assigns
    type_tally = Counter(categorize_variant(record) for record in reader)
    reader.close()
    # Labels and wedge sizes, in the order the types were first encountered
    labels = list(type_tally)
    sizes = [type_tally[label] for label in labels]
    # Pie chart with percentage annotations on each wedge
    plt.figure(figsize=(10, 8))
    wedge_style = {"edgecolor": "black", "linewidth": 1.5}
    wedges, _texts, _pct_texts = plt.pie(
        sizes,
        autopct="%1.1f%%",
        startangle=140,
        colors=plt.cm.tab10.colors,
        wedgeprops=wedge_style,
    )
    # Legend sits outside the pie, to the right, mapping wedges to type names
    plt.legend(
        wedges,
        labels,
        title="Variant Types",
        loc="center left",
        bbox_to_anchor=(1, 0.5),
        frameon=False,
    )
    plt.title("Variant Type Distribution", fontsize=14, fontweight="bold")
    # Without an output path, just display the chart
    if not output_file:
        plt.show()
        return
    plt.savefig(output_file, dpi=300, bbox_inches="tight")
    print(f"Plot saved to: {output_file}")

In [None]:
# Main function to classify variants and plot the results
def main():
    """Classify the variants in the input VCF and save the variant-type pie chart."""
    # Input VCF (replace with your VCF file) and optional output image path
    paths = {
        "vcf_file": "input/variants.vcf",
        "output_file": "output/variant_type_distribution.png",
    }
    # Plot the variant type distribution
    plot_variant_types(paths["vcf_file"], paths["output_file"])
if __name__ == "__main__":
    main()

In [None]:
## 4. Introducing a Missense mutation in a Gene ## 

In [None]:
# Import libraries
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [None]:
# Function to introduce a change in the sequence
def introduce_variant(sequence, position, ref, alt):
    """
    Return a copy of a DNA sequence with one variant applied.
    Parameters:
        sequence (str): The original DNA sequence.
        position (int): The 0-based index where the variant starts.
        ref (str): The reference base(s) at the position. Only its length is
            used, to decide how many bases are replaced; it is not compared
            against the sequence.
        alt (str): The alternate base(s) inserted in place of the reference.
    Returns:
        str: The modified DNA sequence.
    """
    # Everything before the variant is kept untouched
    upstream = sequence[:position]
    # Resume right after the stretch the reference allele occupies
    downstream = sequence[position + len(ref):]
    return upstream + alt + downstream

In [None]:
# Demonstrate the functional change
# Define the original gene sequence
gene_sequence = Seq("ATGGCATTTGACTGGTAA")  # Example gene (encodes "MAFDW*" in protein)
# Simulate a missense variant: change the third codon, TTT (Phe), to TGT (Cys).
# The codons are ATG|GCA|TTT|GAC|TGG|TAA, so TTT starts at 0-based index 6
# (index 3 is the start of GCA/Ala, not TTT).
variant_position = 6  # 0-based index of the first base of the TTT codon
reference_codon = "TTT"
alternate_codon = "TGT"
# Guard against coordinate mistakes: the reference codon must be what is actually there
observed_codon = str(gene_sequence)[variant_position:variant_position + len(reference_codon)]
assert observed_codon == reference_codon, (
    f"Expected {reference_codon} at index {variant_position}, found {observed_codon}"
)
print("Original Gene Sequence:")
print(gene_sequence)
# Introduce the variant
mutated_sequence = introduce_variant(str(gene_sequence), variant_position, reference_codon, alternate_codon)
print("\nMutated Gene Sequence:")
print(mutated_sequence)
# Translate the original and mutated sequences
original_protein = gene_sequence.translate()
mutated_protein = Seq(mutated_sequence).translate()
print("\nOriginal Protein Sequence:")
print(original_protein)
# The mutated protein should read "MACDW*": Phe (F) replaced by Cys (C)
print("\nMutated Protein Sequence:")
print(mutated_protein)

In [None]:
## End of Notebook ##