# Align CARD Resistance Genes to Escherichia Coli Genomes

### Data Preparation

### Imports

In [1]:
!pip install pysam biopython

Collecting pysam
  Downloading pysam-0.22.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pysam-0.22.1-cp310-cp310-manylinux_2_28_x86_64.whl (22.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.0/22.0 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pysam, biopython
Successfully installed biopython-1.84 pysam-0.22.1


In [2]:
import os
import subprocess

import pandas as pd
import pysam
from Bio import SeqIO

### 1. Use Subprocess to build a Bowtie Index for some Sample Genomes

- Bowtie2 is an alignment tool installed through Biopython
- It allows you to efficiently search for small genomic sequences within larger sequences
- To do this Bowtie creates an INDEX of the sequences to search within (in our case the genome assemblies)
- You will use bowtie to search for CARD Resistance sequences in a few example Genomes

Documentation for Bowtie2 building an Index: https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer

In [10]:
extraction_path = '/content/genomes_subset'
print(os.listdir(extraction_path))

['genomes_subset', '__MACOSX']


In [3]:
!apt-get install -y bowtie

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Suggested packages:
  bowtie-examples
The following NEW packages will be installed:
  bowtie
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 1,597 kB of archives.
After this operation, 5,967 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 bowtie amd64 1.3.1-1 [1,597 kB]
Fetched 1,597 kB in 1s (2,182 kB/s)
Selecting previously unselected package bowtie.
(Reading database ... 123599 files and directories currently installed.)
Preparing to unpack .../bowtie_1.3.1-1_amd64.deb ...
Unpacking bowtie (1.3.1-1) ...
Setting up bowtie (1.3.1-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [17]:
import zipfile


# Step 1: Unzip the file
zip_file_path = '/content/genomes_subset.zip'
extraction_path = '/content/genomes_subset'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

# Step 2: List contents of the unzipped genomes_subset directory
genomes_path = os.path.join(extraction_path, 'genomes_subset')
# Set up filepaths to access genomes and make empty directories for outputs
all_genome_fastas = [x for x in os.listdir(genomes_path) if x.endswith('.fna')]
os.makedirs('../genome_index/', exist_ok=True)
os.makedirs('../content/alignments/', exist_ok=True)

# For 5 genomes - build a bowtie index for each one to allow efficient searching
for fasta in all_genome_fastas[0:5]:

    # Get just the sequencing ID from the FASTA filepath
    seq_id = fasta.replace('.fna','')

    # Set up filepaths (absolute to account for different locations)
    genome_path = '../genomes_subset/' + fasta
    genome_abspath = os.path.abspath(genome_path)
    output_path = '../genome_index/' + seq_id
    output_abspath = os.path.abspath(output_path)

    # Use subprocess to build an index per sample genome
    # Hint: the subprocess will be [bowtie command (see documentation), path_to_genomes, path_to_output]
    subprocess.run(
        ['bowtie-build', genome_abspath, output_abspath],
        capture_output=False,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )

##### Take a look in data/genome_index

- Each of the 5 Genome assemblies has 6 files associated
- These are Bowtie index files and are what allow for fast searching of sequences
- Without this the process of alignment would take far too long

### 2. Use Subprocess to Run Bowtie2 Alignment from CARD Genes to each of the 5 Assemblies

Documentation for Bowtie2 alignment methods (tip: check out the command line section for hints): https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-aligner

In [18]:
# Step 1: Unzip the file
zip_file_path = '/content/card_data.zip'
extraction_path = '/content/card_data'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

# Step 2: List contents of the unzipped genomes_subset directory
genomes_path = os.path.join(extraction_path, 'nucleotide_combined_model.fasta')
# Set up filepaths for CARD data

card_genes_path = '../card_data/nucleotide_combined_model.fasta'
card_genes_abspath = os.path.abspath(card_genes_path)

# Loop through our 5 test genomes
for fasta in card_genes_abspath[0:5]:

    # Access the bowtie index for the current genome
    seq_id = fasta.replace('.fna','')
    genome_index_path = '../genome_index/' + seq_id
    genome_index_abspath = os.path.abspath(genome_index_path)

    # Set up a path to save out the alignment results
    output_path = '/content/alignments' + seq_id + '.sam'
    output_abspath = os.path.abspath(output_path)

    # Run the bowtie alignment (check the documentation for values)
    subprocess.run(['bowtie-build',
        '-x', genome_index_abspath,
        '-f', card_genes_abspath,
        '-S',output_abspath ,
        '-N', '1'
    ])

In [24]:
zip_file_path = '/content/card_data.zip'
extraction_path = '/content/card_data'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)
#step-2
card_genes_path = os.path.join(extraction_path, '/content/card_data/card_data/nucleotide_combined_model.fasta')
card_genes_abspath = os.path.abspath(card_genes_path)
#step-3
for fasta in all_genome_fastas[0:5]:

    # Access the Bowtie index for the current genome
    seq_id = fasta.replace('.fna', '')
    genome_index_path = os.path.join('/content/genome_index', seq_id)

    # Set up a path to save out the alignment results
    output_path = os.path.join('/content/alignments', f'{seq_id}.sam')

    # Run the Bowtie alignment
    result = subprocess.run([
        'bowtie',
        '-x', genome_index_path,
        '-f', card_genes_abspath,
        '-S', output_path,
        '-v', '1'
    ], capture_output=True, text=True)
print(result.stdout)
print(result.stderr)





### 3. Process the Alignment files to Extract the Genomes & Metadata

- Bowtie alignment produces a set of SAM files
- BAM/SAM (BAM = binary SAM) files are a common data format in bioinformatics and contain a lot of useful data
- Here you'll extract the sequence information for genes matches and save into a DataFrame

In [32]:
# Lets take a look at a single BAM file for reference
fasta_0 = all_genome_fastas[0]
fasta_0_sam_path = '/content/alignments/' + fasta_0.replace('.fna','') + '.sam'
sam_abspath = os.path.abspath(fasta_0_sam_path)
samfile = pysam.AlignmentFile(sam_abspath, "rb")

samfile_contents = samfile.fetch()

# This will loop through the first SAMFILE and stop once the first match is found
for read in samfile_contents:
    match = read.get_blocks()
    if len(match) > 0:
        break

# Take a look at the first match:
print('\nReference Sample Name (Assembly Genome):   ', read.reference_name)
print('\nCARD Gene name:  ', read.query_name)
print('\nMatch Quality:  ', read.cigarstring)
print('\nMatch Location:  ', match[0][0], 'to', match[0][1])
print('\nGenomic Sequence for Gene:  ', read.query_sequence)
print('\nGenomic Sequence for Assembly:  ', read.get_reference_sequence())


Reference Sample Name (Assembly Genome):    CYGC01000002

CARD Gene name:   gb|U00096.3|+|891183-891906|ARO:3003751|Ecol_nfsA_NIT

Match Quality:   723M

Match Location:   143807 to 144530

Genomic Sequence for Gene:   TTAGCGCGTCGCCCAACCCTGTTTGTGCAAATAATCCAGAATAAATGGGCGGCTTTCTTTAATGATTGTTCGGCGGATATGATCGCTCCAGGTATCCCGGCGATTATTGCTGCCACGGGTGAGGTAATATTCCGCCAGTTGCTCGTCATACTGCGCCAGTGCGCCTTTATCCAGCGGTTGATAGCTGTTTTCATGCACCAAAATGGAGGCCGGTAAACGCGGCTTAAGATCCGGATTATCCGCAGGCCAGCCAAGGCACAGCCCAAACAGCGGCAGAACATGCTGCGGTAATTTAAGCAGTTTCGTCACCGCTTCAATATTATTGCGCAGGCCGCCGATATATACCCCGCCCAATCCCAGCGATTCCGCTGCGATTAATGCATTCTGCGCCATCATTGCCGTATCAACGACACCGAGCAACAGTTGTTCCGCCAGGCCGAGCTGAGCATCCGGACAGATCTGTAAATGGCGGTTAAAGTCGGCACAGAACACCCAGAACTCCGCCGCTTGCGCTACGTGTTTTTGCCCGCCGGTCAGCGTCACCAGTTCTTCACGTAACGCTTTGTCGGTAATGCGAATAATGCTACTGCACTGCAAAAAACTGGAACTGGACGTCGCACGGGCGCTGTTAATAATCGCCTCACGCTGCGCTTCGGAAATGGGTTCATCAGTGAAATGGCGAATGGAGCGATGGCCACAAATAAGTTCAATGGTTGGCGTCAT

Genomic Sequence for Assembly:   TTAGCGCGTCGCCCAACCCTGT

##### Summary of the Above
- The SAM file contains a record for each gene (CARD) you tried to align to the reference genome (assembly)
- Most of these are empty as the gene was not found in the assembly
- Above is the first gene that was found in the assembly
- You can see the name of the sample we're alinging against, the name of the resistance gene, the quality of the match, where in the assembly genome the gene was found (nucleotide position) and finally the sequences themselves
- If you look at the assembly sequence carefully you'll see lower case letters, these are places where the reference genome differs from the gene we were searching for but it was close enough to be considered a match

#### Now it's your turn!

In [40]:
# Loop through your 5 sample genomes
for fasta in all_genome_fastas[0:5]:
    seq_id = fasta.replace('.fna', '')

    # Load alignment statistics using SAM
    fasta_sam_path = '/content/alignments/' + seq_id + '.sam'
    sam_abspath = os.path.abspath(fasta_sam_path)

    # Use a try-except block to handle the OSError
    try:
        samfile = pysam.AlignmentFile(sam_abspath, "rb")

        # Iterate through all the potential match genes in the alignment output: if match then append to the res_genes dictionary
        res_genes = {
            'ref_name':[],
            'contig':[],
            'res_gene':[],
            'match_start':[],
            'match_end':[],
            'match_qual':[],
            'query_str':[],
            'ref_gene_str':[]
        }
        for read in samfile.fetch():
                match = read.get_blocks()
                if len(match) > 0:
                    # When a match is found append the relevant information into the res_genes dictionary
                   res_genes['ref_name'].append(read.reference_name)
                   res_genes['contig'].append(read.query_name)
                   res_genes['res_gene'].append(seq_id)
                   res_genes['match_start'].append(match[0][0])
                   res_genes['match_end'].append(match[-1][1])
                   res_genes['match_qual'].append(read.mapping_quality)
                   res_genes['query_str'].append(read.query_sequence)
                   res_genes['ref_gene_str'].append(read.get_reference_sequence())

        # Generate a final dataframe from the res_genes dictionary and display
        output_dataframe = pd.DataFrame(res_genes)
        display(output_dataframe)

    except OSError as e:
        print(f"Error reading file {sam_abspath}: {e}")
        # Handle the error, e.g., skip the file or try to repair it

Unnamed: 0,ref_name,contig,res_gene,match_start,match_end,match_qual,query_str,ref_gene_str
0,CYGC01000002,gb|U00096.3|+|891183-891906|ARO:3003751|Ecol_n...,562.7619,143807,144530,255,TTAGCGCGTCGCCCAACCCTGTTTGTGCAAATAATCCAGAATAAAT...,TTAGCGCGTCGCCCAACCCTGTTTGTGCAAATAATCCAGAATAAAT...
1,CYGC01000002,gb|U00096.3|-|985893-986982|ARO:3003390|Ecol_o...,562.7619,48563,49652,255,ATGATGAAGCGCAATATTCTGGCAGTGATCGTCCCTGCTCTGTTAG...,ATGATGAAGCGCAATATTCTGGCAGTGATCGTCCCTGCTCTGTTAG...
2,CYGC01000021,gb|U00096.1|+|4277468-4277933|ARO:3003381|Ecol...,562.7619,17391,17856,255,ATGGAAAAGAAATTACCCCGCATTAAAGCGCTGCTAACCCCCGGCG...,ATGGAAAAGAAATTACCCCGCATTAAAGCGCTGCTAACCCCCGGCG...
3,CYGC01000007,gb|U00096.3|+|2810769-2811300|ARO:3000516|emrR,562.7619,34881,35412,255,ATGGATAGTTCGTTTACGCCCATTGAACAAATGCTAAAATTTCGCG...,ATGGATAGTTCGTTTACGCCCATTGAACAAATGCTAAAATTTCGCG...


Unnamed: 0,ref_name,contig,res_gene,match_start,match_end,match_qual,query_str,ref_gene_str
0,CYBB01000006,gb|U00096.3|+|891183-891906|ARO:3003751|Ecol_n...,562.7623,201640,202363,255,TTAGCGCGTCGCCCAACCCTGTTTGTGCAAATAATCCAGAATAAAT...,TTAGCGCGTCGCCCAACCCTGTTTGTGCAAATAATCCAGAATAAAT...
1,CYBB01000006,gb|U00096.3|-|985893-986982|ARO:3003390|Ecol_o...,562.7623,106511,107600,255,ATGATGAAGCGCAATATTCTGGCAGTGATCGTCCCTGCTCTGTTAG...,ATGATGAAGCGCAATATTCTGGCAGTGATCGTCCCTGCTCTGTTAG...
2,CYBB01000009,gb|U00096.1|+|4277468-4277933|ARO:3003381|Ecol...,562.7623,81554,82019,255,TTAGTTTTGTTCATCTTCCAGCAAGCGTGCGCCGGTACCTTCTTCT...,TTAGTTTTGTTCATCTTCCAGCAAGCGTGCGCCGGTACCTTCTTCT...
3,CYBB01000009,gb|U00096.3|-|4277059-4277383|ARO:3003511|Ecol...,562.7623,82104,82428,255,ATGTCCCATCAGAAAATTATTCAGGATCTTATCGCATGGATTGACG...,ATGTCCCATCAGAAAATTATTCAGGATCTTATCGCATGGATTGACG...
4,CYBB01000002,gb|U00096.3|+|2810769-2811300|ARO:3000516|emrR,562.7623,327982,328513,255,TTAGCTCATCGCTTCGAGAACCACACCGTCTTGTTCCATCTGGTCG...,TTAGCTCATCGCTTCGAGAACCACACCGTCTTGTTCCATCTGGTCG...


Error reading file /content/alignments/562.7624.sam: truncated file


Unnamed: 0,ref_name,contig,res_gene,match_start,match_end,match_qual,query_str,ref_gene_str
0,CYBO01000005,gb|U00096.3|+|1150669-1151404|ARO:3004049|Ecol...,562.7581,16168,16903,255,TCAGACCATGTACATCCCGCCGTTCACATGCAAAGTTTCACCCGTG...,TCAGACCATGTACATCCCGCCGTTCACATGCAAAGTTTCACCCGTG...
1,CYBO01000063,gb|KC292503.1|+|4334-5195|ARO:3000904|TEM-34,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...
2,CYBO01000063,gb|AB700703.1|+|161-1022|ARO:3001057|TEM-198,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...
3,CYBO01000063,gb|KC783461.1|+|0-861|ARO:3001383|TEM-206,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...
4,CYBO01000063,gb|KC818234.1|+|0-861|ARO:3001384|TEM-207,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...
5,CYBO01000063,gb|GU550123.1|+|144-1005|ARO:3001041|TEM-176,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...
6,CYBO01000063,gb|AM087454.1|+|208-1069|ARO:3001015|TEM-148,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...
7,CYBO01000063,gb|FJ197316.1|+|0-861|ARO:3001032|TEM-166,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...
8,CYBO01000063,gb|AL513383.1|+|161910-162771|ARO:3000873|TEM-1,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...
9,CYBO01000063,gb|KP050491.1|+|0-861|ARO:3001391|TEM-214,562.7581,5818,6679,255,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...,TTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTA...


Error reading file /content/alignments/562.7625.sam: truncated file
