# Lab 1

In [1]:
# Imports (should be alphabetized)

# Characterizing the number of nucleotides (non-standard import name)
import Bio

# File I/O
import os

# Regular expressions
import re

# Getting a sequence from Ensembl (HTTP requests to its REST API)
import requests

In [2]:




# --- !!! IMPORTANT !!! --- #




# Set the working directory.

# os.chdir() does not expand '~' on its own, so expand it explicitly.
# This still yields an absolute path (e.g. '/home/aeros/analyses'),
# but without hard-coding one particular user's home directory.
os.chdir(os.path.expanduser('~/analyses'))





## Part A

In [3]:
# Get the current working directory.
# The bare expression on the last line of a cell is what the notebook
# displays as the cell's output.
os.getcwd()

'/home/aeros/analyses'

In [4]:
# See what is in the current working directory.
# Called without an argument, os.listdir() lists the current working
# directory; the entries come back in arbitrary (unsorted) order.
os.listdir()

['blastdbs',
 'lab_1',
 'lecture_3',
 'lab_4',
 'lab_2',
 'lab_5',
 'git_repos',
 'genomes']

In [5]:
# Make a directory for the lab.

# Note the simple naming convention - all lowercase, underscores
# instead of spaces.
# os.mkdir() raises FileExistsError when the directory already exists,
# which is what the recorded output of this cell shows.
os.mkdir('lab_1')

FileExistsError: [Errno 17] File exists: 'lab_1'

In [6]:
# Rename the lab directory.
# Both paths are relative, so they are resolved against the current
# working directory.
os.rename('lab_1', 'lab_1_delete_me')

In [7]:
# Change to the new lab directory.
# From here on, relative paths are resolved inside 'lab_1_delete_me'.
os.chdir('./lab_1_delete_me')

In [8]:
# Attempt to remove the new lab directory.
# This fails with FileNotFoundError: the working directory is already
# 'lab_1_delete_me', so the relative path './lab_1_delete_me' is looked
# up INSIDE that directory, where no such entry exists.
os.rmdir('./lab_1_delete_me')

FileNotFoundError: [Errno 2] No such file or directory: './lab_1_delete_me'

## Part B

In [11]:
# We will use Ensembl to retrieve the sequence for our
# given gene.

# See https://rest.ensembl.org/documentation/info/sequence_id
# for more details about the API call that we will make.

# Source: https://pypi.org/project/requests/
# Source: https://rest.ensembl.org/documentation/info/sequence_id#python3basic

In [13]:




# --- SOLUTION 1 (Procedural, NOT abstracted) --- #




# Make the GET request. A GET sends no body; per the Ensembl
# REST documentation the Content-Type header is how the desired
# response format is selected, so this asks for plain text.
r = requests.get(
    'https://rest.ensembl.org/sequence/id/ENSG00000157764',
    headers = {
        'Content-Type' : 'text/plain'
    }
)

# Did the response come back correctly?
if not r.ok:

    # Report the failure BEFORE raising: raise_for_status()
    # always raises when r.ok is False, so anything placed
    # after it in this branch would never run.
    print('Request to Ensembl failed...')
    r.raise_for_status()

# From here on the request is known to have succeeded.

# What kind of headers did we get back?

# Note that requests decodes the body for us, i.e. a
# response with a content-type of "text/plain" is
# available as a string through r.text.
print(r.headers['content-type'])

# What kind of encoding is the response?
print(r.encoding)

# Store the response as our sequence.

# Good practice to strip leading/trailing whitespace
# (e.g. a trailing newline) from the response.
sequence = r.text.strip()

# Now create a dictionary to hold the position of
# all the subsequences that we are searching for.
subsequences = {
    'GC': [],
    'CG': [],
    'AT': []
}

# Iterate over the gene sequence and find the
# subsequences.

# Note: re.finditer() yields NON-overlapping matches and
# tries the alternatives left to right, so in 'GCG' only
# 'GC' at (0, 2) is reported, not the 'CG' at (1, 3).
for m in re.finditer('GC|CG|AT', sequence):

    # For each match, store the start and end
    # indices as a tuple.

    # Use m.group() to get the actual regex
    # match.
    subsequences[m.group()].append((m.start(), m.end()))

# (Optional) Print the matches.
# print(subsequences)

# Print the % of each subsequence, computed as
# (number of matches / length of the sequence) * 100.

# f-strings are nice, see https://stackoverflow.com/a/28343785
# and https://stackoverflow.com/a/59591771

# Show precision to 9 places as the human genome is on the order
# of billions of base pairs.

# A plain loop is used rather than a list comprehension because
# we only want the printing, not a list of None values.
for k, positions in subsequences.items():
    print(f'%{k} in sequence: {(len(positions)/len(sequence))*100: .9f}%')

text/plain; charset=UTF-8
UTF-8
%GC in sequence:  3.547613605%
%CG in sequence:  0.644445850%
%AT in sequence:  8.190055593%


In [14]:




# --- SOLUTION 2 (OOP, abstracted) --- #




# Q: How could we eliminate waiting for the server request?

# Define the class.
class substringContent():


    """
    Simple class for calculating percentage of a sequence
    that consists of given subsequences.

    Typical use: request_to_ensembl() loads a sequence,
    get_subsequence_positions() locates the subsequences and
    print_percentage_subsequence() reports the results.

    Attributes:
        sequence - False until a sequence has been loaded,
            afterwards the sequence as a str.
        subsequences - dict mapping each searched subsequence
            to a list of (start, end) index tuples; empty until
            get_subsequence_positions() has been called.
    """


    # Initialize with the state that we do not have
    # a sequence to search.
    def __init__(
        self
    ):

        # These must be set on self - a bare local variable
        # would be thrown away as soon as __init__ returns.
        self.sequence = False
        self.subsequences = {}


    # Get subsequence positions
    def get_subsequence_positions(
        self,
        sbsqncs
    ):


        """
        Get the positions of given subsequences,
        then set the attribute.

        sbsqncs (list of str) - The subsequences to look for
        in the loaded sequence. They are matched literally.

        Sets self.subsequences to a dict that maps every
        subsequence to a list of (start, end) tuples.

        Raises ValueError if no sequence has been loaded yet.

        Note: matches are non-overlapping and the alternatives
        are tried in the order given, so in 'GCG' a search for
        ['GC', 'CG'] only reports 'GC' at (0, 2).
        """

        # Note: no strong error checking here to enforce type
        # on subsequences...

        # Fail early with a clear message instead of an
        # obscure error from the regex engine.
        if not self.sequence:
            raise ValueError(
                'No sequence loaded - call request_to_ensembl() first.'
            )

        # Create a dictionary to hold the position of
        # all the subsequences that we are searching for.
        subsequences = {subseq: [] for subseq in sbsqncs}

        # With nothing to search for the pattern would be empty,
        # and an empty pattern matches at every position - skip
        # the search entirely in that case.
        if subsequences:

            # Escape each subsequence so that characters such as
            # '.' or '[' are matched literally; otherwise m.group()
            # could return text that is not a key of the dictionary.
            pattern = '|'.join(re.escape(subseq) for subseq in subsequences)

            # Iterate over the gene sequence and find the
            # subsequences.
            for m in re.finditer(pattern, self.sequence):

                # For each match, store the start and end
                # indices as a tuple.

                # Use m.group() to get the actual regex
                # match.
                subsequences[m.group()].append((m.start(), m.end()))

        # Create the attribute.
        self.subsequences = subsequences


    # Making a request to Ensembl
    def request_to_ensembl(
        self,
        idntfr
    ):


        """
        Given an identifier, ask Ensembl for the sequence,
        if it exists.

        idntfr (str) - Ensembl ID

        On success the sequence is stored in self.sequence.
        On failure self.sequence is left untouched and a
        message is printed.
        """

        # Make the GET request. A GET sends no body; per the
        # Ensembl REST documentation the Content-Type header is
        # how the desired response format is selected, so this
        # asks for plain text.
        r = requests.get(
            'https://rest.ensembl.org/sequence/id/' + idntfr,
            headers = {
                'Content-Type' : 'text/plain'
            }
        )

        # Note that it is better to not break the block below into
        # its own method because typically server responses are
        # handled right away in logic to help identify faulty,
        # useless, or otherwise problematic responses.

        # Only set the attribute if the request was successful.
        if r.ok:

            # What kind of headers did we get back?

            # Note that requests decodes the body for us, i.e. a
            # response with a content-type of "text/plain" is
            # available as a string through r.text.
            print(r.headers['content-type'])

            # What kind of encoding is the response?
            print(r.encoding)

            # Store the response as our sequence.

            # Good practice to strip leading/trailing whitespace
            # (e.g. a trailing newline) from the response.
            self.sequence = r.text.strip()

        else:

            # Do not fail silently - say why no sequence was stored.
            print(f'Request to Ensembl failed with status {r.status_code}; sequence not updated.')


    # Print the % of each subsequence.
    def print_percentage_subsequence(
        self
    ):


        """
        Take the subsequences and see what percentage of
        the overall sequence they comprise.

        For every subsequence this prints
        (number of matches / length of the sequence) * 100.

        Raises ValueError if no sequence has been loaded yet.
        """

        # An unloaded (or empty) sequence has no length to
        # divide by, so refuse to continue.
        if not self.sequence:
            raise ValueError(
                'No sequence loaded - call request_to_ensembl() first.'
            )

        # f-strings are nice, see https://stackoverflow.com/a/28343785
        # and https://stackoverflow.com/a/59591771

        # Show precision to 9 places as the human genome is on the order
        # of billions of base pairs.

        # A plain loop is used rather than a list comprehension because
        # we only want the printing, not a list of None values.
        for k, positions in self.subsequences.items():
            print(f'%{k} in sequence: {(len(positions)/len(self.sequence))*100: .9f}%')


# Build an instance to work with.
sc = substringContent()

# Fetch the sequence for the gene from Ensembl.
sc.request_to_ensembl(idntfr='ENSG00000157764')

# (Optional) Inspect what came back.
# print(sc.sequence)

# Locate every occurrence of the subsequences
# of interest.
sc.get_subsequence_positions(sbsqncs=['GC', 'CG', 'AT'])

# (Optional) Inspect the positions that were found.
# print(sc.subsequences)

# Report the percentage for each subsequence.
sc.print_percentage_subsequence()

text/plain; charset=UTF-8
UTF-8
%GC in sequence:  3.547613605%
%CG in sequence:  0.644445850%
%AT in sequence:  8.190055593%
