# BF550 Lab 2 - Working with a `.fasta` file

Fall 2022

In this lab, you will work with a FASTA file of the *E. coli* genome. A fasta file contains a metadata header as the first line and then a nucleotide sequence split over the subsequent lines. Here, you will ultimately create a Python class that will be able to read this file and report some basic statistics.

# 1. Read a fasta file

In [60]:
def readFasta(filepath):
    """
    Reads a FASTA file from `filepath` and returns the nucleotide sequence as a single string
    
    Input:
    | filepath <str>: File path to FASTA file
    
    Output:
    | seq <str>: Sequence from `filepath`
    """
    seq = ''
    
    # WRITE YOUR CODE FOR READING IN THE FASTA FILE HERE
    seq = ''
    with open(filepath) as fh:
        for line in fh.readlines():
            # Only keep lines that start with A, T, G, or C (to handle the "end" at the last row)
            if any([line.startswith(x) for x in 'ATGC']):
                line = line.strip()
                seq += line
    
    return seq

In [71]:
"Use `readFasta` to read in the E. coli genome"
# Read in fasta file
filepath = 'ecoli_genome.fasta'
ecoli_genome = readFasta(filepath)

# DISPLAY THE FIRST 10 NUCLEOTIDES OF THE GENOME
ecoli_genome[:10]

'AGCTTTTCAT'

# 2. Nucleotide frequency

In [62]:
def nucleotideFrequency(seq):
    """
    Compute the nucleotide frequency of `seq`
    
    Input:
    | seq <str>: A nucleotide sequence containing A, T, G, and C
    
    Output:
    | counts <dict>: Counts of each nucleotide
    """
    counts = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
    
    # WRITE YOUR CODE FOR UPDATING the `counts` dictionary here
    # YOU SHOULD BE ABLE TO WRITE THIS WITHOUT ANY `if` statements (use dictionary indexing)
    
    for s in seq:
        counts[s] += 1
    
    return counts

In [63]:
"Compute the nucleotide frequency of the ecoli genome"
nuc_freq = nucleotideFrequency(ecoli_genome)

# DISPLAY THE NUCLEOTIDE FREQUENCY OF THE GENOME
nuc_freq

{'A': 1142742, 'T': 1141382, 'G': 1177437, 'C': 1180091}

# 3. GC content

In [19]:
def gcContent(nucleotide_frequency):
    """
    Compute the GC content given nucleotide frequencies
    
    The GC content is the fraction of `seq` that is either G or C
    
    Input:
    | nucleotide_frequencies <dict>: Frequency of each nucleotide
    
    Output:
    | gc <float>: The GC content of `seq`
    """
    # Compute GC content
    gc = (nuc_freq['G'] + nuc_freq['C']) / sum(nuc_freq.values())

    return gc

In [20]:
"Calculate the GC content of the genome"
gc = gcContent(nuc_freq)

# DISPLAY THE GC CONTENT OF THE GENOME
gc

0.5079070985933456

# 4. Dinucleotide content

In [30]:
def dnFrequency(seq):
    """
    Compute the dinucleotide frequency of `seq`
    
    Input:
    | seq <str>: A nucleotide sequence containing A, T, G, and C
    
    Output:
    | dinuc_freq <dict>: Counts of each dinucleotide {'AA': <int>, 'AT': <int>, ...}

    # Example
    seq = 'AATC'

    dinuq_freq = {'AA': 1, 'AT': 1, 'TC': 1, (rest zeros)}
    """
    # INITIALIZE A DICTIONARY WITH ALL DINUCLEOTIDE PAIRS AND SET EACH VALUE TO ZERO
    nucs = 'ATGC'
    # Initialize 
    dinuc_freq = {x+y:0 for x in nucs for y in nucs}
    
    # Count each dinucleotide
    for i in range(len(seq) - 1):
        dinuc = seq[i: i+2]
        dinuc_freq[dinuc] += 1
    
    return dinuc_freq

In [31]:
"Compute the dinucleotide frequency of the genome"
dinuc_freq = dnFrequency(ecoli_genome)
dinuc_freq

{'AA': 338006,
 'AT': 309950,
 'AG': 238013,
 'AC': 256773,
 'TA': 212024,
 'TT': 339584,
 'TG': 322379,
 'TC': 267395,
 'GA': 267384,
 'GT': 255699,
 'GG': 270252,
 'GC': 384102,
 'CA': 325327,
 'CT': 236149,
 'CG': 346793,
 'CC': 271821}

# 5. A `Genome` class
Create a class that contains all of the above functions

In [74]:
class Genome:
    """
    Read, store, and analyze the contents of a genome from a FASTA file
    """
    def __init__(self, filepath):
        self.filepath = filepath
        self.genome = None
        self.nucleotide_frequency = None
        self.gc_content = None
        self.dinucleotide_frequency = None
    
    # UPDATE THE FOLLOWING FUNCTIONS TO POPULATE THE ABOVE PROPERTIES
    def readFasta(self):
        """
        Reads a FASTA file from `filepath` and returns the nucleotide sequence as a single string

        Input:
        | filepath <str>: File path to FASTA file

        Output:
        | seq <str>: Sequence from `filepath`
        """
        filepath = self.filepath
        seq = ''

        # WRITE YOUR CODE FOR READING IN THE FASTA FILE HERE
        seq = ''
        with open(filepath) as fh:
            for line in fh.readlines():
                if any([line.startswith(x) for x in 'ATGC']):
                    line = line.strip()
                    seq += line
    
        self.genome = seq

    def nucleotideFrequency(self):
        """
        Compute the nucleotide frequency of `seq`

        Input:
        | seq <str>: A nucleotide sequence containing A, T, G, and C

        Output:
        | counts <dict>: Counts of each nucleotide
        """
        counts = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        for s in seq:
            counts[s] += 1
        self.nucleotide_frequency = counts

    def gcContent(self):
        """
        Compute the GC content given nucleotide frequencies

        The GC content is the fraction of `seq` that is either G or C

        Input:
        | nucleotide_frequencies <dict>: Frequency of each nucleotide

        Output:
        | gc <float>: The GC content of `seq`
        """
        # Compute GC content
        gc = (nuc_freq['G'] + nuc_freq['C']) / sum(nuc_freq.values())

        self.gc_content = gc
    
    def dnFrequency(self):
        """
        Compute the dinucleotide frequency of `seq`

        Input:
        | seq <str>: A nucleotide sequence containing A, T, G, and C

        Output:
        | dinuc_freq <dict>: Counts of each dinucleotide {'AA': <int>, 'AT': <int>, ...}

        # Example
        seq = 'AATC'

        dinuq_freq = {'AA': 1, 'AT': 1, 'TC': 1, (rest zeros)}
        """
        # INITIALIZE A DICTIONARY WITH ALL DINUCLEOTIDE PAIRS AND SET EACH VALUE TO ZERO
        nucs = 'ATGC'
        # Initialize 
        dinuc_freq = {x+y:0 for x in nucs for y in nucs}

        # Count each dinucleotide
        seq = self.genome
        for i in range(len(seq) - 1):
            dinuc = seq[i: i+2]
            dinuc_freq[dinuc] += 1

        self.dinucleotide_frequency = dinuc_freq

In [75]:
"Using the `Genome` class, print the nucleotide frequency, gc content and dinucleotide frequency"
# Create `Genome` object
ecoli = Genome(filepath)

ecoli.readFasta()

# PRINT nuc frequency with object
ecoli.nucleotideFrequency()
print(ecoli.nucleotide_frequency)

# PRINT gc content with object
ecoli.gcContent()
print(ecoli.gc_content)

# PRINT dinuc frequency with object
ecoli.dnFrequency()
print(ecoli.dinucleotide_frequency)


{'A': 1, 'T': 2, 'G': 0, 'C': 1}
0.5079070985933456
{'AA': 338006, 'AT': 309950, 'AG': 238013, 'AC': 256773, 'TA': 212024, 'TT': 339584, 'TG': 322379, 'TC': 267395, 'GA': 267384, 'GT': 255699, 'GG': 270252, 'GC': 384102, 'CA': 325327, 'CT': 236149, 'CG': 346793, 'CC': 271821}


# 6. Challenge: k-mer frequency
Write a function to count not just the dinucleotide frequency but the k-mer (k-nucleotide) frequency for any `k` 1-5. Plot the distribution of each k-mer.

# 7. Challenge: Translate this DNA sequence into an amino acid sequence
