In [None]:
# Ch06-3 - Genes & Variant impacts

In [None]:
# Intersect the variants in variants.vcf with the CDS features of the GenBank file
#  and determine each variant's effect on the coding frame.
#  Also check whether each variant changes the encoded amino acid sequence.
#  Finally, plot the variants along the genome by position and type of change.

In [None]:
# Import libraries
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq
import matplotlib.pyplot as plt

In [None]:
# Define file paths (all relative to the notebook's working directory)
genbank_file = "output/ecoli_prodigal_combined.gb"  # annotated genome, parsed below as GenBank
vcf_file = "input/variants.vcf"  # variant calls to intersect with the CDS features
output_log_file = "output/variant_analysis.log"  # per-variant report and error messages

In [None]:
# Function to parse VCF file and extract variants
def parse_vcf(vcf_file):
    """Read a VCF file and return its variants as a list of tuples.

    Parameters
    ----------
    vcf_file : str or path-like
        Path to a tab-delimited VCF file.

    Returns
    -------
    list of tuple
        One ``(chrom, pos, ref, alt)`` tuple per alternate allele, where
        ``pos`` is converted from the 1-based VCF coordinate to a 0-based
        index. A multi-allelic record (comma-separated ALT column) yields one
        tuple per allele so that downstream length and translation checks see
        a single allele at a time.

    Raises
    ------
    ValueError
        If a data line has fewer than the five columns needed
        (CHROM, POS, ID, REF, ALT) or POS is not an integer.
    """
    variants = []
    with open(vcf_file, "r") as vcf:
        for line_number, line in enumerate(vcf, start=1):
            # Header/meta lines start with '#'; blank lines (often a trailing
            # newline at end of file) carry no record either.
            if line.startswith("#") or not line.strip():
                continue
            fields = line.strip().split("\t")
            if len(fields) < 5:
                raise ValueError(
                    f"Malformed VCF record on line {line_number}: "
                    f"expected at least 5 tab-separated columns, got {len(fields)}."
                )
            chrom = fields[0]
            pos = int(fields[1]) - 1  # Convert to 0-based index
            ref = fields[3]
            # ALT may list several alleles separated by commas.
            for alt in fields[4].split(","):
                variants.append((chrom, pos, ref, alt))
    return variants

In [None]:
# Function to determine if a variant changes the coding frame
def changes_coding_frame(ref, alt):
    """Return True when swapping ``ref`` for ``alt`` shifts the reading frame.

    The frame shifts whenever the difference in allele lengths is not a
    multiple of three; equal-length substitutions never shift it.
    """
    length_delta = len(alt) - len(ref)
    # A non-zero remainder means the net insertion/deletion is not whole codons.
    return bool(length_delta % 3)

In [None]:
# Function to determine if a variant introduces an amino acid change
def introduces_amino_acid_change(cds_sequence, ref, alt, position_in_cds):
    """Report whether replacing ``ref`` with ``alt`` alters the translated protein.

    Parameters
    ----------
    cds_sequence : str
        Coding sequence to translate.
    ref, alt : str
        Reference and alternate alleles, expressed on the same strand as
        ``cds_sequence``.
    position_in_cds : int
        0-based offset of the first reference base within ``cds_sequence``.

    Returns
    -------
    bool
        True if the proteins translated up to the first stop codon differ.
        False if they are identical, or if any error occurs (including a
        reference allele that does not match the CDS); the error is appended
        to ``output_log_file``.
    """
    try:
        ref_stop = position_in_cds + len(ref)
        observed_ref = cds_sequence[position_in_cds:ref_stop]
        # Refuse to build a mutant from a reference that is not actually there.
        if observed_ref != ref:
            raise ValueError("Reference allele does not match CDS at the specified position.")

        upstream = cds_sequence[:position_in_cds]
        downstream = cds_sequence[ref_stop:]
        variant_cds = upstream + alt + downstream

        # Translate wild-type and variant sequences, each stopping at its first stop codon.
        wild_type_protein = Seq(cds_sequence).translate(to_stop=True)
        variant_protein = Seq(variant_cds).translate(to_stop=True)
        proteins_differ = wild_type_protein != variant_protein
        return proteins_differ
    except Exception as e:
        # Best effort: record the problem and treat the variant as "no change".
        with open(output_log_file, "a") as log:
            log.write(f"Error processing variant at position {position_in_cds + 1}: {e}\n")
        return False

In [None]:
# Plotting function
def plot_variant_changes(variant_data):
    """Scatter-plot the change category of each variant against its genome position.

    Parameters
    ----------
    variant_data : sequence
        Items whose first element is the genome position (x axis) and whose
        second element is the change code (y axis); the y ticks label
        0 as "No Change", 1 as "Frame Change" and 2 as "AA Change".
    """
    x_positions = [entry[0] for entry in variant_data]
    y_categories = [entry[1] for entry in variant_data]

    _, ax = plt.subplots(figsize=(12, 6))
    ax.scatter(x_positions, y_categories, alpha=0.7, edgecolors="k")
    ax.set_xlabel("Position in Genome", fontsize=12)
    ax.set_ylabel("Type of Change", fontsize=12)
    ax.set_title("Variant Type vs. Position in Genome", fontsize=14, fontweight="bold")
    # Replace the numeric change codes on the y axis with readable names.
    ax.set_yticks([0, 1, 2])
    ax.set_yticklabels(["No Change", "Frame Change", "AA Change"])
    ax.grid(alpha=0.5)
    plt.show()

In [None]:
# Parse the VCF file into a list of (chrom, 0-based pos, ref, alt) tuples
variants = parse_vcf(vcf_file)

In [None]:
# Parse the GenBank file and check for intersecting variants
variant_data = []

# Start with an empty log, then reopen it in append mode.
# introduces_amino_acid_change() appends its own error lines to this same file
# through a separate handle; if this handle were in "w" mode its later writes
# would land at a stale offset and overwrite those error lines.
open(output_log_file, "w").close()

with open(output_log_file, "a") as log, open(genbank_file, "r") as gb_file:
    for record in SeqIO.parse(gb_file, "genbank"):
        for feature in record.features:
            if feature.type != "CDS":
                continue
            cds_start = int(feature.location.start)
            cds_end = int(feature.location.end)
            # extract() returns the coding strand, i.e. the reverse complement
            # for features annotated on the minus strand.
            on_reverse_strand = feature.location.strand == -1
            cds_sequence = str(feature.extract(record.seq))
            # NOTE(review): the offset arithmetic below assumes a single
            # contiguous location; CDS features with joined (compound)
            # locations would need per-part mapping - confirm none are present.
            for chrom, pos, ref, alt in variants:
                if chrom != record.id or not (cds_start <= pos < cds_end):
                    continue
                frame_change = changes_coding_frame(ref, alt)
                if on_reverse_strand:
                    # Express the variant on the coding strand: the offset is
                    # counted from the CDS 3' genomic end and both alleles are
                    # reverse-complemented. If REF runs past the CDS boundary
                    # the offset goes negative and the reference check in
                    # introduces_amino_acid_change() rejects the variant.
                    position_in_cds = cds_end - (pos + len(ref))
                    cds_ref = str(Seq(ref).reverse_complement())
                    cds_alt = str(Seq(alt).reverse_complement())
                else:
                    position_in_cds = pos - cds_start
                    cds_ref, cds_alt = ref, alt
                # Flush first so any error line appended by the helper appears
                # after the entries already written for earlier variants.
                log.flush()
                amino_acid_change = introduces_amino_acid_change(cds_sequence, cds_ref, cds_alt, position_in_cds)
                # An amino acid change outranks a frame change when picking the plotted category.
                change_type = 2 if amino_acid_change else (1 if frame_change else 0)
                variant_data.append((pos + 1, change_type))  # Store position and change type
                log.write(f"Variant at position {pos + 1} (Ref: {ref}, Alt: {alt}) intersects CDS ({cds_start + 1}-{cds_end}).\n")
                if frame_change:
                    log.write("\tThis variant changes the coding frame.\n")
                else:
                    log.write("\tThis variant does not change the coding frame.\n")
                if amino_acid_change:
                    log.write("\tThis variant introduces an amino acid change.\n")
                else:
                    log.write("\tThis variant does not introduce an amino acid change.\n")

In [None]:
# Plot the variant changes: genome position (x) against type of change (y)
plot_variant_changes(variant_data)

In [None]:
## Optional Exercise - SnpEff ##

In [None]:
# Install SnpEff #
# Download latest version (-nc: skip the download when the archive is already present,
# so re-running the cell does not create snpEff_latest_core.zip.1)
! wget -nc https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip
# Unzip file (-n: never overwrite existing files, so a re-run does not prompt)
! unzip -n snpEff_latest_core.zip

In [None]:
## SnpEff example on Human ##
# Download the GRCh38.99 annotation database (same jar path as the annotation step below)
! java -Xmx4g -jar snpEff/snpEff.jar download GRCh38.99
# Annotate the example VCF shipped with SnpEff; to annotate your own calls,
# put a VCF (e.g. human_variants.vcf) in input/ and use its path instead
! java -Xmx4g -jar snpEff/snpEff.jar GRCh38.99 snpEff/examples/test.vcf > output/human_annotated_variants.vcf

In [None]:
## End of Notebook ##