## abstract
- Goal of this notebook was to develope an importer that can read FASTA files into a pandas DataFrame
- FASTA data from https://ftp.ensembl.org/pub/

In [1]:
import pandas as pd
from time import time

In [2]:
def ensemble_fasta_to_dict(fasta_entry):
    data = {}
    m1 = fasta_entry.find(" ")
    m2 = fasta_entry.find(" ", m1+1)
    
    data["TRANSCRIPT_ID"]      = fasta_entry[   1:m1]
    data["SEQTYPE"]            = fasta_entry[m1+1:m2]
    
    m3 = fasta_entry.find(" chromosome:",         m2+1)
    m4 = fasta_entry.find(" gene:",               m3+1)
    m5 = fasta_entry.find(" gene_biotype:",       m4+1)
    m6 = fasta_entry.find(" transcript_biotype:", m5+1)
    m7 = fasta_entry.find(" description:",        m6+1)
    m8 = fasta_entry.find(" [Source:",            m7+1)
    m9 = fasta_entry.find("]",                    m8+1)
    
    data['LOCATION']             = fasta_entry[m3+12:m4]
    data['GENE_ID']              = fasta_entry[m4+6 :m5]
    data['GENE_BIOTYPE']         = fasta_entry[m5+14:m6]
    data['TRANSCRIPT_BIOTYPE']   = fasta_entry[m6+20:m7]
    data['DESCRIPTION']          = fasta_entry[m7+13:m8]
    data['SEQUENCE']             = fasta_entry[m9+1:]
    
    return data

def read_ensemble_fasta(file_path, timeit=True):
    if timeit: s = time()
    df = pd.DataFrame(columns = ['TRANSCRIPT_ID', 'SEQTYPE', 'LOCATION', 'GENE_ID', 'GENE_BIOTYPE', 'TRANSCRIPT_BIOTYPE', 'DESCRIPTION', 'SEQUENCE'])
    buffer = ""
    i = 0
    with open(file_path) as file:
        buffer = ""
        for line in file:
            i += 1
            buffer += line.rstrip()
            z = buffer.rfind(">")
            if z>0:
                entry = buffer[:z].replace("\n", "")
                buffer = buffer[z:]
                df = pd.concat([df, pd.DataFrame([ensemble_fasta_to_dict(entry)])], ignore_index=True)
                
    if timeit: print(f"{time()-s:.2g}s")
    return df


## transcriptome data

In [3]:
ncrna = read_ensemble_fasta("sequence_data/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa")
cdna = read_ensemble_fasta("sequence_data/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa")
transcriptom = pd.concat([cdna, ncrna], ignore_index=True)
transcriptom

0.13s
2.4s


Unnamed: 0,TRANSCRIPT_ID,SEQTYPE,LOCATION,GENE_ID,GENE_BIOTYPE,TRANSCRIPT_BIOTYPE,DESCRIPTION,SEQUENCE
0,YPL071C_mRNA,cdna,NA cdna chromosome:R64-1-1:XVI:420048:420518:-1,YPL071C,protein_coding,protein_coding,Putative protein of unknown function; green fl...,ATGAGTTCCCGGTTTGCAAGAAGTAATGGCAATCCCAACCACATTA...
1,YLL050C_mRNA,cdna,NA cdna chromosome:R64-1-1:XII:39804:40414:-1,YLL050C,protein_coding,protein_coding gene_symbol:COF1,"Cofilin, involved in pH-dependent actin filame...",ATGTCTAGATCTGGTGTTGCTGTTGCTGATGAATCCCTTACCGCTT...
2,YMR172W_mRNA,cdna,NA cdna chromosome:R64-1-1:XIII:605981:608140:1,YMR172W,protein_coding,protein_coding gene_symbol:HOT1,Transcription factor for glycerol biosynthetic...,ATGTCTGGAATGGGTATTGCGATTCTTTGCATCGTACGTACAAAGA...
3,YOR185C_mRNA,cdna,NA cdna chromosome:R64-1-1:XV:681444:682106:-1,YOR185C,protein_coding,protein_coding gene_symbol:GSP2,GTP binding protein (mammalian Ranp homolog); ...,ATGTCAGCACCTGCTCAAAACAATGCCGAGGTTCCCACTTTCAAGT...
4,YLL032C_mRNA,cdna,NA cdna chromosome:R64-1-1:XII:74270:76747:-1,YLL032C,protein_coding,protein_coding,Protein of unknown function; may interact with...,ATGGATAACTTCAAAATTTACAGTACAGTTATCACAACTGCTTTTT...
...,...,...,...,...,...,...,...,...
7029,tI(AAU)E2_tRNA,ncrna,tRNA ncrna chromosome:R64-1-1:V:551285:551358:-1,tI(AAU)E2,tRNA,tRNA,"Isoleucine tRNA (tRNA-Ile), predicted by tRNAs...",GGTCTCTTGGCCCAGTTGGTTAAGGCACCGTGCTAATAACGCGGGG...
7030,tS(GCU)O_tRNA,ncrna,RNA ncrna chromosome:R64-1-1:XV:274673:274773:1,tS(GCU)O,tRNA,tRNA,"Serine tRNA (tRNA-Ser), predicted by tRNAscan-...",GTCCCAGTGGCCGAGTGGTTAAGGCGATGCCCTGCTAAGGCATTGG...
7031,tY(GUA)F1_tRNA,ncrna,tRNA ncrna chromosome:R64-1-1:VI:167437:167525:1,tY(GUA)F1,tRNA,tRNA gene_symbol:SUP11,"Tyrosine tRNA (tRNA-Tyr), predicted by tRNAsca...",CTCTCGGTAGCCAAGTTGGTTTAAGGCGCAAGACTGTAAATCTTGA...
7032,tG(UCC)O_tRNA,ncrna,RNA ncrna chromosome:R64-1-1:XV:110962:111033:-1,tG(UCC)O,tRNA,tRNA gene_symbol:SUF1,"Glycine tRNA (tRNA-Gly), predicted by tRNAscan...",GGGCGGTTAGTGTAGTGGTTATCATCCCACCCTTCCAAGGTGGGGA...


In [4]:
len(transcriptom.SEQUENCE.sum()), len(transcriptom), len(transcriptom.SEQUENCE.sum())/len(transcriptom)

(8855347, 7034, 1258.9347455217514)

## mCherry sequencing data

In [5]:
with open("sequence_data/iGem_mCherry-forward_M13.fasta") as file: content = [line for line in file]
target = {"NAME": content[0][1:], "SEQUENCE": content[1]}
target

{'NAME': 'iGem_mCherry-forward_M13\n',
 'SEQUENCE': 'GATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAACTTTGTACAAAAAAGCAGGCTACAAAATGGTGAGCAAGGGCGAGGAGGATAACATGGCCATCATCAAGGAGTTCATGCGCTTCAAGGTGCACATGGAGGGCTCCGTGAACGGCCACGAGTTCGAGATCGAGGGCGAGGGCGAGGGCCGCCCCTACGAGGGCACCCAGACCGCCAAGCTGAAGGTGACCAAGGGTGGCCCCCTGCCCTTCGCCTGGGACATCCTGTCCCCTCAGTTCATGTACGGCTCCAAGGCCTACGTGAAGCACCCCGCCGACATCCCCGACTACTTGAAGCTGTCCTTCCCCGAGGGCTTCAAGTGGGAGCGCGTGATGAACTTCGAGGACGGCGGCGTGGTGACCGTGACCCAGGACTCCTCCCTGCAGGACGGCGAGTTCATCTACAAGGTGAAGCTGCGCGGCACCAACTTCCCCTCCGACGGCCCCGTAATGCAGAAGAAGACCATGGGCTGGGAGGCCTCCTCCGAGCGGATGTACCCCGAGGACGGCGCCCTGAAGGGCGAGATCAAGCAGAGGCTGAAGCTGAAGGACGGCGGCCACTACGACGCTGAGGTCAAGACCACCTACAAGGCCAAGAAGCCCGTGCAGCTGCCCGGCGCCTACAACGTCAACATCAAGTTGGACATCACCTCCCACAACGAGGACTACACCATCGTGGAACAGTACGAACGCGCCGAGGGCCGCCACTCCACCGGCGGCATGGACGAGCTGTACAAGTAAGACCCAGCTTTCTTGTACAAAGTTGGCATTATAAGAAAGCATTGCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGCCATCCAGCTGATATCCCCTATAGTGAGTCGTATTACA