## abstract
- Goal of this notebook is to develop an importer that can read our FASTA files into a pandas DataFrame
- FASTA should be a standardized format, but as always everybody makes up their own flavour

In [1]:
from IPython.core.magic import register_cell_magic, register_line_magic

@register_cell_magic
def run_and_writefile(line, cell): # adapted from andrei-iatsuk copied from https://stackoverflow.com/questions/33358611/ipython-notebook-writefile-and-execute-cell-at-the-same-time
    argz = line.split()
    file = argz[-1]
    mode = 'a'
    if len(argz) == 2 and argz[0] == '-w':
        mode = 'w'
    content = cell[:cell.find("%stop_write")] + "\n\n"
    with open(file, mode) as f:
        f.write(content)
    get_ipython().run_cell(cell)

@register_line_magic
def stop_write(line):
    pass

In [2]:
%%run_and_writefile -w sequence_reader.py
import pandas as pd
from time import time

## Load ensembl transcriptome data
 - FASTA data from https://ftp.ensembl.org/pub/

In [3]:
file_path = "sequence_data/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa"

i = 0
with open(file_path) as file:
    buffer = ""
    for line in file:
        i += 1
        if i > 20: 
            break; # can't and don't want to print all at once
        buffer += line.rstrip()
        z = buffer.rfind(">")
        if z>0:
            entry = buffer[:z].replace("\n", "")
            buffer = buffer[z:]
            print(entry, end = "\n\n")


>ETS2-2_rRNA ncrna chromosome:R64-1-1:XII:460712:460922:-1 gene:ETS2-2 gene_biotype:rRNA transcript_biotype:rRNA description:Non-coding region located adjacent to RDN25; transcribed as part of the 35S rRNA precursor transcript; contains the primary rRNA transcription termination site at +93 and a secondary termination site between +211 and +250 [Source:SGD;Acc:S000029713]TTTTTATTTCTTTCTAAGTGGGTACTGGCAGGAGCCGGGGCCTAGTTTAGAGAGAAGTAGACTGAACAAGTCTCTATAAATTTTATTTGTCTTAAGAATTCTATGATCCGGGTAAAAACATGTATTGTATATATCTATTATAATATACGATGAGGATGATAGTGTGTAAGAGTGTACCATTTACTAATGTATGTAAGTTACTATTTACTAT

>ITS2-2_rRNA ncrna chromosome:R64-1-1:XII:464319:464550:-1 gene:ITS2-2 gene_biotype:rRNA transcript_biotype:rRNA description:Non-coding region between RDN58 and RDN25; transcribed as part of the 35S rRNA precursor transcript; forms a stem-loop structure required for processing of the precursor transcript [Source:SGD;Acc:S000029712]CCTTCTCAAACATTCTGTTTGGTAGTGAGTGATACTCTTTGGAGTTAACTTGAAATTGCTGGCCTTTTCATTGGATGTTT

In [4]:
%%run_and_writefile sequence_reader.py
def _ensembl_fasta_to_dict(fasta_entry, debug = False):
    '''internal function to encode one line of fasta string into a dictionary'''
    data = {}
    m1 = fasta_entry.find(" ")
    m2 = fasta_entry.find(" chromosome:",         m1)
    m3 = fasta_entry.find(" gene:",               m2)
    m4 = fasta_entry.find(" gene_biotype:",       m3)
    m5 = fasta_entry.find(" transcript_biotype:", m4)
    m6 = fasta_entry.find(" description:",        m5)
    m7 = fasta_entry.find(" [Source:",            m6)
    m8 = fasta_entry.find("]",                    m7)

    if debug: print(f"m1:{m1}, m2:{m2}, m3:{m3}, m4:{m4}, m5:{m5}, m6:{m6}, m7:{m7}, m8:{m8}", end = "\n\n")
    
    data["SEQ_NAME"]             = fasta_entry[   1 :m1]
    data["SEQ_TYPE"]             = fasta_entry[m1+1 :m2]
    data['CHROMOSOME']           = fasta_entry[m2+12:m3]
    data['GENE_ID']              = fasta_entry[m3+6 :m4]
    data['GENE_BIOTYPE']         = fasta_entry[m4+14:m5]
    data['TRANSCRIPT_BIOTYPE']   = fasta_entry[m5+20:m6]
    data['DESCRIPTION']          = fasta_entry[m6+13:m7]
    data['SEQUENCE']             = fasta_entry[m7+1:]
    
    return data
%stop_write

i = 0
with open(file_path) as file:
    buffer = ""
    for line in file:
        i += 1
        if i > 20: 
            break;
        buffer += line.rstrip()
        z = buffer.rfind(">")
        if z>0:
            entry = buffer[:z].replace("\n", "")
            buffer = buffer[z:]
            print(entry, end = "\n\n")
            print(_ensembl_fasta_to_dict(entry, debug = True), end = "\n\n\n\n")


>ETS2-2_rRNA ncrna chromosome:R64-1-1:XII:460712:460922:-1 gene:ETS2-2 gene_biotype:rRNA transcript_biotype:rRNA description:Non-coding region located adjacent to RDN25; transcribed as part of the 35S rRNA precursor transcript; contains the primary rRNA transcription termination site at +93 and a secondary termination site between +211 and +250 [Source:SGD;Acc:S000029713]TTTTTATTTCTTTCTAAGTGGGTACTGGCAGGAGCCGGGGCCTAGTTTAGAGAGAAGTAGACTGAACAAGTCTCTATAAATTTTATTTGTCTTAAGAATTCTATGATCCGGGTAAAAACATGTATTGTATATATCTATTATAATATACGATGAGGATGATAGTGTGTAAGAGTGTACCATTTACTAATGTATGTAAGTTACTATTTACTAT

m1:12, m2:18, m3:58, m4:70, m5:88, m6:112, m7:346, m8:373

{'SEQ_NAME': 'ETS2-2_rRNA', 'SEQ_TYPE': 'ncrna', 'CHROMOSOME': 'R64-1-1:XII:460712:460922:-1', 'GENE_ID': 'ETS2-2', 'GENE_BIOTYPE': 'rRNA', 'TRANSCRIPT_BIOTYPE': 'rRNA', 'DESCRIPTION': 'Non-coding region located adjacent to RDN25; transcribed as part of the 35S rRNA precursor transcript; contains the primary rRNA transcription termination site at +93 a

In [5]:
%%run_and_writefile sequence_reader.py
def read_ensembl_fasta(file_path, timeit=False):
    '''reads a fasta file from in the format of ensembl.org and returns as pandas.DataFrane'''
    if timeit: s = time()
    df = pd.DataFrame(columns = ['SEQ_NAME', 'SEQ_TYPE', 'CHROMOSOME', 'GENE_ID', 'GENE_BIOTYPE', 'TRANSCRIPT_BIOTYPE', 'DESCRIPTION', 'SEQUENCE'])
    buffer = ""
    with open(file_path) as file:
        buffer = ""
        for line in file:
            buffer += line.rstrip()
            z = buffer.rfind(">")
            if z>0:
                entry = buffer[:z].replace("\n", "")
                buffer = buffer[z:]
                df = pd.concat([df, pd.DataFrame([_ensembl_fasta_to_dict(entry)])], ignore_index=True)
                
    if timeit: print(f"{time()-s:.2g}s")
    return df
%stop_write


ncrna = read_ensembl_fasta("sequence_data/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa", True)
cdna = read_ensembl_fasta("sequence_data/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa", True)
transcriptom = pd.concat([cdna, ncrna], ignore_index=True)
transcriptom

0.14s
2.8s


Unnamed: 0,SEQ_NAME,SEQ_TYPE,CHROMOSOME,GENE_ID,GENE_BIOTYPE,TRANSCRIPT_BIOTYPE,DESCRIPTION,SEQUENCE
0,YPL071C_mRNA,cdna,R64-1-1:XVI:420048:420518:-1,YPL071C,protein_coding,protein_coding,Putative protein of unknown function; green fl...,[Source:SGD;Acc:S000005992]ATGAGTTCCCGGTTTGCAA...
1,YLL050C_mRNA,cdna,R64-1-1:XII:39804:40414:-1,YLL050C,protein_coding,protein_coding gene_symbol:COF1,"Cofilin, involved in pH-dependent actin filame...",[Source:SGD;Acc:S000003973]ATGTCTAGATCTGGTGTTG...
2,YMR172W_mRNA,cdna,R64-1-1:XIII:605981:608140:1,YMR172W,protein_coding,protein_coding gene_symbol:HOT1,Transcription factor for glycerol biosynthetic...,[Source:SGD;Acc:S000004783]ATGTCTGGAATGGGTATTG...
3,YOR185C_mRNA,cdna,R64-1-1:XV:681444:682106:-1,YOR185C,protein_coding,protein_coding gene_symbol:GSP2,GTP binding protein (mammalian Ranp homolog); ...,[Source:SGD;Acc:S000005711]ATGTCAGCACCTGCTCAAA...
4,YLL032C_mRNA,cdna,R64-1-1:XII:74270:76747:-1,YLL032C,protein_coding,protein_coding,Protein of unknown function; may interact with...,[Source:SGD;Acc:S000003955]ATGGATAACTTCAAAATTT...
...,...,...,...,...,...,...,...,...
7029,tI(AAU)E2_tRNA,ncrna,R64-1-1:V:551285:551358:-1,tI(AAU)E2,tRNA,tRNA,"Isoleucine tRNA (tRNA-Ile), predicted by tRNAs...",[Source:SGD;Acc:S000006603]GGTCTCTTGGCCCAGTTGG...
7030,tS(GCU)O_tRNA,ncrna,R64-1-1:XV:274673:274773:1,tS(GCU)O,tRNA,tRNA,"Serine tRNA (tRNA-Ser), predicted by tRNAscan-...",[Source:SGD;Acc:S000006733]GTCCCAGTGGCCGAGTGGT...
7031,tY(GUA)F1_tRNA,ncrna,R64-1-1:VI:167437:167525:1,tY(GUA)F1,tRNA,tRNA gene_symbol:SUP11,"Tyrosine tRNA (tRNA-Tyr), predicted by tRNAsca...",[Source:SGD;Acc:S000006779]CTCTCGGTAGCCAAGTTGG...
7032,tG(UCC)O_tRNA,ncrna,R64-1-1:XV:110962:111033:-1,tG(UCC)O,tRNA,tRNA gene_symbol:SUF1,"Glycine tRNA (tRNA-Gly), predicted by tRNAscan...",[Source:SGD;Acc:S000006592]GGGCGGTTAGTGTAGTGGT...


### Load mCherry sequencing data

In [6]:
with open("sequence_data/iGem_mCherry-forward_M13.fasta") as file: content = [line for line in file]
print(content)
{"SEQ_NAME": content[0][1:].rstrip(), "SEQUENCE": content[1]}

['>iGem_mCherry-forward_M13\n', 'GATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAACTTTGTACAAAAAAGCAGGCTACAAAATGGTGAGCAAGGGCGAGGAGGATAACATGGCCATCATCAAGGAGTTCATGCGCTTCAAGGTGCACATGGAGGGCTCCGTGAACGGCCACGAGTTCGAGATCGAGGGCGAGGGCGAGGGCCGCCCCTACGAGGGCACCCAGACCGCCAAGCTGAAGGTGACCAAGGGTGGCCCCCTGCCCTTCGCCTGGGACATCCTGTCCCCTCAGTTCATGTACGGCTCCAAGGCCTACGTGAAGCACCCCGCCGACATCCCCGACTACTTGAAGCTGTCCTTCCCCGAGGGCTTCAAGTGGGAGCGCGTGATGAACTTCGAGGACGGCGGCGTGGTGACCGTGACCCAGGACTCCTCCCTGCAGGACGGCGAGTTCATCTACAAGGTGAAGCTGCGCGGCACCAACTTCCCCTCCGACGGCCCCGTAATGCAGAAGAAGACCATGGGCTGGGAGGCCTCCTCCGAGCGGATGTACCCCGAGGACGGCGCCCTGAAGGGCGAGATCAAGCAGAGGCTGAAGCTGAAGGACGGCGGCCACTACGACGCTGAGGTCAAGACCACCTACAAGGCCAAGAAGCCCGTGCAGCTGCCCGGCGCCTACAACGTCAACATCAAGTTGGACATCACCTCCCACAACGAGGACTACACCATCGTGGAACAGTACGAACGCGCCGAGGGCCGCCACTCCACCGGCGGCATGGACGAGCTGTACAAGTAAGACCCAGCTTTCTTGTACAAAGTTGGCATTATAAGAAAGCATTGCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGCCATCCAGCTGATATCCCCTATAGTGAGTCGTATTACATGGTCATAGCTGTTTCCTGG

{'SEQ_NAME': 'iGem_mCherry-forward_M13',
 'SEQUENCE': 'GATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAACTTTGTACAAAAAAGCAGGCTACAAAATGGTGAGCAAGGGCGAGGAGGATAACATGGCCATCATCAAGGAGTTCATGCGCTTCAAGGTGCACATGGAGGGCTCCGTGAACGGCCACGAGTTCGAGATCGAGGGCGAGGGCGAGGGCCGCCCCTACGAGGGCACCCAGACCGCCAAGCTGAAGGTGACCAAGGGTGGCCCCCTGCCCTTCGCCTGGGACATCCTGTCCCCTCAGTTCATGTACGGCTCCAAGGCCTACGTGAAGCACCCCGCCGACATCCCCGACTACTTGAAGCTGTCCTTCCCCGAGGGCTTCAAGTGGGAGCGCGTGATGAACTTCGAGGACGGCGGCGTGGTGACCGTGACCCAGGACTCCTCCCTGCAGGACGGCGAGTTCATCTACAAGGTGAAGCTGCGCGGCACCAACTTCCCCTCCGACGGCCCCGTAATGCAGAAGAAGACCATGGGCTGGGAGGCCTCCTCCGAGCGGATGTACCCCGAGGACGGCGCCCTGAAGGGCGAGATCAAGCAGAGGCTGAAGCTGAAGGACGGCGGCCACTACGACGCTGAGGTCAAGACCACCTACAAGGCCAAGAAGCCCGTGCAGCTGCCCGGCGCCTACAACGTCAACATCAAGTTGGACATCACCTCCCACAACGAGGACTACACCATCGTGGAACAGTACGAACGCGCCGAGGGCCGCCACTCCACCGGCGGCATGGACGAGCTGTACAAGTAAGACCCAGCTTTCTTGTACAAAGTTGGCATTATAAGAAAGCATTGCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGCCATCCAGCTGATATCCCCTATAGTGAGTCGTATTA

In [7]:
%%run_and_writefile sequence_reader.py
def read_microsynth_fasta(file_path):
    """reads the name and sequence from fasta file for a sample sent to microsynth seqlab gmbh for sequencing, returns as pandas.Series"""
    with open(file_path) as file: content = [line for line in file]
    return pd.Series({"NAME": content[0][1:].rstrip(), "SEQUENCE": content[1]})
%stop_write
read_microsynth_fasta("sequence_data/iGem_mCherry-forward_M13.fasta")

NAME                                 iGem_mCherry-forward_M13
SEQUENCE    GATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGAT...
dtype: object