## Import statements

In [1]:
pip install ensembl_rest

Collecting ensembl_rest
  Downloading ensembl_rest-0.3.4-py3-none-any.whl (30 kB)
Collecting simplejson (from ensembl_rest)
  Downloading simplejson-3.19.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.9/137.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting intervaltree (from ensembl_rest)
  Downloading intervaltree-3.1.0.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: intervaltree
  Building wheel for intervaltree (setup.py) ... [?25l[?25hdone
  Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26095 sha256=a7d3cd13dcb1aeab629ab2f8d65fc4c39c087247a4a3fe654e104c5a737e7064
  Stored in directory: /root/.cache/pip/wheels/fa/80/8c/43488a924a046b733b64de3fac99252674c892a4c3801c0a61
Successfully built intervaltree
Installing collected packages: simplejso

## Functions

In [4]:
import ensembl_rest
import requests
import csv
import re
import sys
import os


def get_cds(transcript_id):
    """
    Retrieves the coding sequence (CDS) for a given Ensembl transcript ID.

    Args:
        transcript_id (str): Ensembl transcript ID for the target gene.

    Returns:
        str: The nucleotide sequence of the coding sequence (CDS).
    """
    # Construct the REST API URL for retrieving CDS
    address = f"https://rest.ensembl.org/sequence/id/{transcript_id}?multiple_sequences=1;type=cds"

    # Make a GET request to the Ensembl REST API
    r = requests.get(address, headers={"Content-Type": "text/x-fasta"})

    # Ensure that there are no issues with the sequence request
    if not r.ok:
        r.raise_for_status()
        sys.exit()

    # Extract only the nucleotide sequence and format into a single string
    raw_output = r.text
    pattern = re.compile('(?:^|\n)[ATGC]+')
    matches = pattern.findall(raw_output)
    cds_sequence = ''.join(matches).replace('\n', '')

    return cds_sequence

def get_promoter_terminator(transcript_id, promoter_length=1000, terminator_length=500):
    """
    Retrieves the promoter and terminator sequences for a given Ensembl transcript ID.

    Args:
        transcript_id (str): Ensembl transcript ID for the target gene.
        promoter_length (int, optional): Length of the promoter sequence (default is 1000).
        terminator_length (int, optional): Length of the terminator sequence (default is 500).

    Returns:
        tuple: A tuple containing the promoter and terminator sequences as strings.
    """
    # Use Ensembl REST API to retrieve cDNA sequence with specified 5' and 3' expansions
    sequence = ensembl_rest.sequence_id(id=transcript_id, type="cdna", expand_5prime=promoter_length, expand_3prime=terminator_length)["seq"]

    # Extract promoter and terminator sequences from the cDNA sequence
    promoter_sequence = sequence[:promoter_length]
    terminator_sequence = sequence[-terminator_length:]

    return promoter_sequence, terminator_sequence

def extract_utr_information(data):
    """
    Extracts information about 5' and 3' UTRs (Untranslated Regions) from the provided data.

    Args:
        data (dict): A dictionary containing information about UTRs.

    Returns:
        tuple: A tuple containing lists of 5' UTRs, 3' UTRs, chromosome, and strand information.
    """
    # Retrieve UTR data from the input dictionary
    utr_data = data.get('UTR', [])

    # Initialize lists to store 5' UTR and 3' UTR information
    utr5_list = []
    utr3_list = []

    # Iterate through UTR entries in the data
    for utr_entry in utr_data:
        utr_type = utr_entry.get('type', '')
        utr_start = utr_entry.get('start', None)
        utr_end = utr_entry.get('end', None)

        # Check if the UTR entry has valid type, start, and end information
        if utr_type and utr_start is not None and utr_end is not None:
            # Categorize UTRs into 5' and 3' UTR lists
            if utr_type == 'five_prime_utr':
                utr5_list.append((utr_start, utr_end))
            elif utr_type == 'three_prime_utr':
                utr3_list.append((utr_start, utr_end))

    # Extract chromosome and strand information
    chromosome = utr_entry['seq_region_name'] if utr_data else None
    strand = data.get('strand', None)

    return utr5_list, utr3_list, chromosome, strand


def get_utr_sequence(chromosome, strand, start, end, species):
    """
    Retrieves the nucleotide sequence of a UTR (Untranslated Region) from the Ensembl database.

    Args:
        chromosome (str): Chromosome name or identifier.
        strand (int): Strand information (1 for forward strand, -1 for reverse strand).
        start (int): Start position of the UTR on the chromosome.
        end (int): End position of the UTR on the chromosome.
        species (str, optional): Species for which the UTR sequence is requested.

    Returns:
        str: The nucleotide sequence of the specified UTR.
    """
    # Use Ensembl REST API to retrieve UTR sequence for the specified region
    region = f"{chromosome}:{start}..{end}:{strand}"
    utr_sequence = ensembl_rest.sequence_region(region=region, species=species)["seq"]

    return utr_sequence

def get_full_utr_sequence(list_utr_coordinates, chromosome, strand, species):
    """
    Retrieves the concatenated nucleotide sequence of multiple UTRs from the Ensembl database.

    Args:
        list_utr_coordinates (list): A list of tuples representing UTR start and end coordinates.
        chromosome (str): Chromosome name or identifier.
        strand (int): Strand information (1 for forward strand, -1 for reverse strand).

    Returns:
        str: The concatenated nucleotide sequence of the specified UTRs.
    """
    # Initialize an empty string to store the concatenated UTR sequence
    concatenated_sequence = ""

    # Iterate through UTR coordinates and retrieve individual UTR sequences
    for start, end in list_utr_coordinates:
        sequence = get_utr_sequence(chromosome, strand, start, end, species)
        concatenated_sequence += sequence

    return concatenated_sequence




## Coding sequence

Those 2 outputs don't match... Which one is correct?

In [6]:
get_cds("ENST00000373020")

'ATGGCGTCCCCGTCTCGGAGACTGCAGACTAAACCAGTCATTACTTGTTTCAAGAGCGTTCTGCTAATCTACACTTTTATTTTCTGGATCACTGGCGTTATCCTTCTTGCAGTTGGCATTTGGGGCAAGGTGAGCCTGGAGAATTACTTTTCTCTTTTAAATGAGAAGGCCACCAATGTCCCCTTCGTGCTCATTGCTACTGGTACCGTCATTATTCTTTTGGGCACCTTTGGTTGTTTTGCTACCTGCCGAGCTTCTGCATGGATGCTAAAACTGTATGCAATGTTTCTGACTCTCGTTTTTTTGGTCGAACTGGTCGCTGCCATCGTAGGATTTGTTTTCAGACATGAGATTAAGAACAGCTTTAAGAATAATTATGAGAAGGCTTTGAAGCAGTATAACTCTACAGGAGATTATAGAAGCCATGCAGTAGACAAGATCCAAAATACGTTGCATTGTTGTGGTGTCACCGATTATAGAGATTGGACAGATACTAATTATTACTCAGAAAAAGGATTTCCTAAGAGTTGCTGTAAACTTGAAGATTGTACTCCACAGAGAGATGCAGACAAAGTAAACAATGAAGGTTGTTTTATAAAGGTGATGACCATTATAGAGTCAGAAATGGGAGTCGTTGCAGGAATTTCCTTTGGAGTTGCTTGCTTCCAACTGATTGGAATCTTTCTCGCCTACTGCCTCTCTCGTGCCATAACAAATAACCAGTATGAGATAGTGTAA'

In [7]:
ensembl_rest.sequence_id(id="ENST00000373020", type="cds")["seq"]

'AGTTGTGGACGCTCGTAAGTTTTCGGCAGTTTCCGGGGAGACTCGGGGACTCCGCGTCTCGCTCTCTGTGTTCCAATCGCCCGGTGCGGTGGTGCAGGGTCTCGGGCTAGTCATGGCGTCCCCGTCTCGGAGACTGCAGACTAAACCAGTCATTACTTGTTTCAAGAGCGTTCTGCTAATCTACACTTTTATTTTCTGGGTGAGAGACGAAGGCGCCTGGGGCCGGCAGGGGATCCCGGGCTTTTAGTTGTGGGGGGTGTGACCCTGAGCGGCGGGAGCTCAGGTCGGGAACGGTGTGGGGTTTGGGCGGCCATCGCGCCTGGGACCCCGACGCCGGCGACCAGTGACTGGGCCCCGAACAAAGAGCTCAGGCATCTCGCCGGCGCTGGGGTCGGAGTGCGGGACGAAGGCAGCCGAGAGGCTGAGCTGGGACCCAGACGTACCAGCGTTCGAGTTCGATACGGGAGGCGAGGTGGGGCCTTAAGCCCCCAGAGCACGCAGACGGTGACCCTGGACCTTTGCAAAGGCAAGCAAGGCTTGCCCGGCCCCTTTGAGGGCTCACTTGCATAATAGTGTCTTTCTTTTCACTGCTTCGTAAGAGAGGAGTGTTTCGCAGGTAAGTATGGTATGCCTCCACTCCCGCCAGTCTATCCTTGGGCTTGCTTTTGTGTACCAGTCCTTTGTACCCTGTGCCCATCCCTACCTGGAGAGGACGGGGGAAGTTTCTGGAACTCATAAAATCACGTTTTTTTTTTCCCCTTCAAGGTTTTCTTCCTAAGGTTGGAGTAAGATAGTTAAGGAAGTTTTGATCCCTAGGGCAAATATGCCATGGGCTTGTGTTTACACACAAAAAGATGAATTTTAAGAATTTTGATAAGATACCTGGTCCACAGCCTGCTCTTGTGTGCAGATCCCCCCCTAACCCCTTCTACTCTTCCCCCTTGTCTGCCGTATCCCAGCATATTACACCTATTGTGCCTTAAATACCTCATGCGTTGT

## Promoter & Terminator

In [5]:
promoter, terminator = get_promoter_terminator("ENST00000373020")
print(promoter)
print(terminator)

AGTTGTGGACGCTCGTAAGTTTTCGGCAGTTTCCGGGGAGACTCGGGGACTCCGCGTCTCGCTCTCTGTGTTCCAATCGCCCGGTGCGGTGGTGCAGGGTCTCGGGCTAGTCATGGCGTCCCCGTCTCGGAGACTGCAGACTAAACCAGTCATTACTTGTTTCAAGAGCGTTCTGCTAATCTACACTTTTATTTTCTGGGTGAGAGACGAAGGCGCCTGGGGCCGGCAGGGGATCCCGGGCTTTTAGTTGTGGGGGGTGTGACCCTGAGCGGCGGGAGCTCAGGTCGGGAACGGTGTGGGGTTTGGGCGGCCATCGCGCCTGGGACCCCGACGCCGGCGACCAGTGACTGGGCCCCGAACAAAGAGCTCAGGCATCTCGCCGGCGCTGGGGTCGGAGTGCGGGACGAAGGCAGCCGAGAGGCTGAGCTGGGACCCAGACGTACCAGCGTTCGAGTTCGATACGGGAGGCGAGGTGGGGCCTTAAGCCCCCAGAGCACGCAGACGGTGACCCTGGACCTTTGCAAAGGCAAGCAAGGCTTGCCCGGCCCCTTTGAGGGCTCACTTGCATAATAGTGTCTTTCTTTTCACTGCTTCGTAAGAGAGGAGTGTTTCGCAGGTAAGTATGGTATGCCTCCACTCCCGCCAGTCTATCCTTGGGCTTGCTTTTGTGTACCAGTCCTTTGTACCCTGTGCCCATCCCTACCTGGAGAGGACGGGGGAAGTTTCTGGAACTCATAAAATCACGTTTTTTTTTTCCCCTTCAAGGTTTTCTTCCTAAGGTTGGAGTAAGATAGTTAAGGAAGTTTTGATCCCTAGGGCAAATATGCCATGGGCTTGTGTTTACACACAAAAAGATGAATTTTAAGAATTTTGATAAGATACCTGGTCCACAGCCTGCTCTTGTGTGCAGATCCCCCCCTAACCCCTTCTACTCTTCCCCCTTGTCTGCCGTATCCCAGCATATTACACCTATTGTGCCTTAAATACCTCATGCGTTGTG

## UTRs

In [9]:
transcript_data = ensembl_rest.lookup(id="ENST00000373020", params={'expand':True,'utr':True})
utr5_coord_list, utr3_coord_list, chromosome, strand = extract_utr_information(transcript_data)
utr5_sequence = get_full_utr_sequence(utr5_coord_list, chromosome, strand, species="homo_sapiens")
utr3_sequence = get_full_utr_sequence(utr3_coord_list, chromosome, strand, species="homo_sapiens")

In [10]:
print(utr5_sequence)

AGTTGTGGACGCTCGTAAGTTTTCGGCAGTTTCCGGGGAGACTCGGGGACTCCGCGTCTCGCTCTCTGTGTTCCAATCGCCCGGTGCGGTGGTGCAGGGTCTCGGGCTAGTC


In [11]:
print(utr3_sequence)

CCCAATGTATCTGTGGGCCTATTCCTCTCTACCTTTAAGGACATTTAGGGTCCCCCCTGTGAATTAGAAAGTTGCTTGGCTGGAGAACTGACAACACTACTTACTGATAGACCAAAAAACTACACCAGTAGGTTGATTCAATCAAGATGTATGTAGACCTAAAACTACACCAATAGGCTGATTCAATCAAGATCCGTGCTCGCAGTGGGCTGATTCAATCAAGATGTATGTTTGCTATGTTCTAAGTCCACCTTCTATCCCATTCATGTTAGATCGTTGAAACCCTGTATCCCTCTGAAACACTGGAAGAGCTAGTAAATTGTAAATGAAGTAATACTGTGTTCCTCTTGACTGTTATTTTTCTTAGTAGGGGGCCTTTGGAAGGCACTGTGAATTTGCTATTTTGATGTAGTGTTACAAGATGGAAAATTGATTCCTCTGACTTTGCTATTGATGTAGTGTGATAGAAAATTCACCCCTCTGAACTGGCTCCTTCCCAGTCAAGGTTATCTGGTTTGATTGTATAATTTGCACCAAGAAGTTAAAATGTTTTATGACTCTCTGTTCTGCTGACAGGCAGAGAGTCACATTGTGTAATTTAATTTCAGTCAGTCAATAGATGGCATCCCTCATCAGGGTTGCCAGATGGTGATAACAGTGTAAGGCCTTGGGTCTAAGGCATCCACGACTGGAAGGGACTACTGATGTTCTGTGATACATCAGGTTTCAGCACACAACTTACATTTCTTTGCCTCCAAATTGAGGCATTTATTATGATGTTCATACTTTCCCTCTTGTTTGAAAGTTTCTAATTATTAAATGGTGTCGGAATTGTTGTATTTTCCTTAGGAATTCAGTGGAACTTATCTTCATTAAATTTAGCTGGTACCAGGTTGATATGACTTGTCAATATTATGGTCAACTTTAAGTCTTAGTTTTCGTTTGTGCCTTTGATTAATAAGTATAACTCTTATACAATAAATACTGCTTTCCTCTAAAA