<a href="https://colab.research.google.com/github/Will-Raymond/human_riboswitch_hits/blob/main/add_nupack_dots_to_db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating the UTR data csv's

This notebook makes the UTR csv's used for machine learning, adds gene names, matches ccds's to UTRs, and generates dot structures with NUPACK

## Install NUPACK

To install nupack you must include its zip file in your google drive and locally install it after obtaining a liscence, the following code will install it to this colab session. For the manuscript nupack-4.0.0.23 was used.

https://www.nupack.org/download/overview

In [1]:
#@title Downgrade colab to python 3.8
%%capture
!wget -O mini.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_4.8.2-Linux-x86_64.sh
!chmod +x mini.sh
!bash ./mini.sh -b -f -p /usr/local
!conda install -q -y jupyter
!conda install -q -y google-colab -c conda-forge
!python -m ipykernel install --name "py38" --user

import sys
sys.path.append("/usr/local/lib/python3.8/site-packages/")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%capture
!cp /content/drive/MyDrive/nupack-4.0.0.23.zip .
!unzip /content/nupack-4.0.0.23.zip

!cp /content/drive/MyDrive/utr_negative_aln_fa_top100.zip .
!unzip /content/utr_negative_aln_fa_top100.zip

!python -m pip install -U nupack -f /content/nupack-4.0.0.23/package
!python -m pip show nupack

In [4]:
!pip3 install biopython==1.79

Collecting biopython==1.79
  Downloading biopython-1.79-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.7 MB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.9/2.7 MB[0m [31m12.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.7/2.7 MB[0m [31m28.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.79


In [5]:
from nupack import *
import pandas as pd
from Bio import SeqIO
import os
os.getcwd()

'/content'

## Convert UTRdb 1.0 (2010) fasta file into a kmer csv

In [6]:
#@title Data processor
import numpy as np
import itertools as it
import re
import os

import pandas as pd
import matplotlib.pyplot as plt

import itertools
import warnings
import time


import joblib
import pickle



class kmer_DataProcessor():

    def __init__(self):
        self.unique_hits = set()
        self.data_file = ''
        self.all_fastas = []
        self.letter_dict = ['a','c','u','g']
        #preallocate small kmers
        self.kmer_4 = self.kmer_list(4)
        self.kmer_2 = self.kmer_list(2)
        self.kmer_3 = self.kmer_list(3)
        self.kmer_5 = self.kmer_list(5)
        self.kmer_1 = self.kmer_list(1)

        self.ns_dict = {'m':'a','w':'a','r':'g','y':'t','k':'g','s':'g','w':'a','h':'a','n':'a','x':'a'}

        self.test_seq = 'aucuguacguacguaucgaucguguacuggcaaaacguaguagcugagcaucaucuaugh'






    def create_database(self,path, u_thresh = 1, disp = False):
        '''
        Pull all sequences and throw out everything with a given threshold, if
        u_thresh is 100% match it is fast
        otherwise this is a very slow function that needs alignment each iteration
        '''

        self.data_file = path
        self.all_fastas = list(SeqIO.parse(path,'fasta'))
        print('processing sequences')
        n = 0
        m = len(self.all_fastas)
        for f in self.all_fastas:
            n+= 1
            if n% 100 == 0:
                if disp:
                    print('processed: %d out of %d'%(n,m))
            if u_thresh == 1:
                newstr = self.clean_seq(str(f.seq))

                if newstr not in self.unique_hits:
                    self.unique_hits.add((f.id + '==='+newstr))

            else:
                newstr = self.clean_seq(str(f.seq))
                best_match = self.check_percentage_alignments(newstr)

                if best_match < u_thresh:

                    self.unique_hits.add((f.id + '==='+newstr))

        self.unique_hits = list(self.unique_hits)
        self.unique_ids = []
        for entry in self.unique_hits:
            self.unique_ids.append(entry.split('===')[0])

        self.get_unique_seqs()


    def reset_db(self,  u_thresh = 1,disp=False):
        print('resetting sequences')
        self.unique_hits = set()
        n = 0
        m = len(self.all_fastas)
        for f in self.all_fastas:
            n+= 1
            if n% 100 == 0:
                if disp:
                    print('processed: %d out of %d'%(n,m))
            if u_thresh == 1:
                newstr = self.clean_seq(str(f.seq))

                if newstr not in self.unique_hits:
                    self.unique_hits.add((f.id + '==='+newstr))

            else:
                newstr = self.clean_seq(str(f.seq))
                best_match = self.check_percentage_alignments(newstr)

                if best_match < u_thresh:

                    self.unique_hits.add((f.id + '==='+newstr))

        self.unique_hits = list(self.unique_hits)
        self.unique_ids = []
        for entry in self.unique_hits:
            self.unique_ids.append(entry.split('===')[0])

        self.get_unique_seqs()


    def check_debruin_degeneracy():
        x=1

    def clean_seq(self,seq):
        '''
        clean the sequences to lowercase only a, u, g, c
        '''
        seq = seq.lower()

        for key in self.ns_dict.keys():
            seq = seq.replace(key,self.ns_dict[key])

        seq = seq.replace('t','u')
        return seq


    def get_all_kmers(self,k):
        '''
        build the full array of kmers for the current database
        '''
        self.kmer_database = []
        for entry in self.unique_hits:
            kf = self.kmer_freq(entry.split('===')[1],k)
            self.kmer_database.append(kf)

        self.kmer_array = np.array(self.kmer_database)


    def get_all_kmers_length_dist(self,k,density,bins):
        '''
        build the full array of kmers for the current database
        '''
        self.kmer_database = []

        cum_density = np.cumsum(density)

        self.normalized_sizes = []
        self.normalized_seqs = []
        for entry in self.unique_hits:


            r = np.random.uniform()

            while len(np.where(r < cum_density)[0]) ==0:
                r = np.random.uniform()

            index = np.where(r < cum_density)[0][0]

            length = int(bins[index])
            seq = entry.split('===')[1]

            if len(seq) > length:
                offset = int(np.random.uniform(0,len(seq)- length-1 ))
            else:
                length = len(seq)
                offset = 0

            #print(len(seq[offset:(offset+ length)]))
            kf = self.kmer_freq(seq[offset:(offset+ length)],k)
            self.kmer_database.append(kf)
            self.normalized_sizes.append( len( seq[offset:(offset+ length)] ) )
            self.normalized_seqs.append(seq[offset:(offset+ length)])



        self.kmer_array = np.array(self.kmer_database)


    def get_all_kmers_norm(self,k):
        '''
        build the full array of kmers for the current database
        '''
        self.kmer_database_norm = []
        for entry in self.unique_hits:
            kf = self.kmer_freq(entry.split('===')[1],k)
            seq_length = len(entry.split('===')[1])
            self.kmer_database_norm.append(kf/seq_length)

        self.kmer_array_norm = np.array(self.kmer_database_norm)


    def remove_entries(self,id_list):
        to_remove_indexes = []
        new_all_fastas = []

        for i in range(len(self.all_fastas)):
            if self.all_fastas[i].id not in id_list:
                new_all_fastas.append(self.all_fastas[i])

        self.all_fastas = new_all_fastas



    def export_to_csv(self,filename,normalized=False):

        if normalized:
            k_arr = self.kmer_array_norm
            k_db = self.kmer_database_norm
        else:
            k_arr = self.kmer_array
            k_db = self.kmer_database

        k = k_arr.shape[1]
        n=1
        while k !=4:
            n+=1
            k= k/4
        k=n
        print(k)
        if k > 5:
            kmer_ind = self.kmer_list(k)
        else:
            if k == 1:
                kmer_ind = self.kmer_1
            if k == 2:
                kmer_ind = self.kmer_2
            if k == 3:
                kmer_ind = self.kmer_3
            if k == 4:
                kmer_ind = self.kmer_4
            if k == 5:
                kmer_ind = self.kmer_5

        df = pd.DataFrame(k_db, columns = kmer_ind )
        df.insert(0,"ID",self.unique_ids,True)

        unique_seqs = []
        for entry in self.unique_hits:
            unique_seqs.append(entry.split('===')[1])


        df.insert(1,"SEQ",unique_seqs,True)
        df.to_csv((filename + '.csv'))

    def get_unique_seqs(self):
        unique_seqs = []
        for entry in self.unique_hits:
            unique_seqs.append(entry.split('===')[1])
        self.unique_seqs = unique_seqs
        self.get_sizes()

    def get_sizes(self):
        sizes = []
        for seq in self.unique_seqs:
            sizes.append(len(seq))
        self.all_sizes = sizes

    def check_percentage_alignments(self,seq):
        '''
        use global alignment to get percentage match to database

        EXTREMELY SLOW
        '''
        max_align = 0
        lenseq = len(seq)
        for seq2 in self.unique_hits:
            aligns = pairwise2.align.globalxx(seq, seq2)
            best_match = 0
            for align in aligns:
                if align[2] > best_match:
                    best_match = align[2]

            percentage_match = best_match/lenseq
            if percentage_match > max_align:
                max_align = percentage_match

        return max_align


    def check_perecentage_via_freq(self,freq,thresh):
        '''
        check percentage of similarities of given kmer freq to the current database

        fast but inaccurate if the sequences have different sizes

        '''
        similarities = []
        for kmer in self.kmer_database:
            similarities.append(np.mean( freq != kmer ))

        match_inds = np.where(np.array(similarities) < thresh)[0]
        return similarities, match_inds


    def kmer_list(self,k):
        combos =[x for x in it.product(self.letter_dict, repeat=k)]
        kmer = [''.join(y) for y in combos]
        return kmer

    def kmer_freq(self,seq,k):
        '''
        calculate the kmer frequences of k size for seq
        '''

        if k > 5:
            kmer_ind = self.kmer_list(k)
        else:
            if k == 1:
                kmer_ind = self.kmer_1
            if k == 2:
                kmer_ind = self.kmer_2
            if k == 3:
                kmer_ind = self.kmer_3
            if k == 4:
                kmer_ind = self.kmer_4
            if k == 5:
                kmer_ind = self.kmer_5

        kmer_freq_vec = np.zeros((4**k)).astype(int)
        for i in range(len(seq)-k):
            kmer_freq_vec[kmer_ind.index(seq[i:i+k])] += 1

        return kmer_freq_vec

In [7]:
utr5_db = kmer_DataProcessor()
utr5_db.create_database('/content/drive/MyDrive/5UTRaspic.Hum.fasta') # original file from 2010 UTRdb
utr5_db.get_all_kmers(3)
utr5_db.export_to_csv('/content/drive/MyDrive/test.csv')

processing sequences
3


## Select csv to edit



In [None]:
db_file = "/content/drive/MyDrive/test.csv.csv" # @param {type:"string"}

## Match Genes to UTR 1.0 ids
 only required for old UTR database, gene names were scraped from the old UTRd

In [None]:
import json
utr_id_to_gene = json.load(open('/content/drive/MyDrive/UTR_ID_to_gene.json','r'))
UTR_db = pd.read_csv(db_file)
UTR_db['GENE'] =''
g_list = []
for i in range(len(UTR_db)):
  g_list.append(utr_id_to_gene[UTR_db['ID'].iloc[i]])
UTR_db['GENE'] = g_list
#parse out duplicates
cols = UTR_db.columns.tolist()
cols =  [cols[-1]] + cols[1:3]  + cols[3:-1]
UTR_db = UTR_db[cols]
UTR_db = UTR_db[UTR_db['GENE'] != 'duplicate']
UTR_db = UTR_db.reset_index(drop=True) #reset the indexes
UTR_db

Unnamed: 0,GENE,ID,SEQ,aaa,aac,aau,aag,aca,acc,acu,...,gcu,gcg,gua,guc,guu,gug,gga,ggc,ggu,ggg
0,COBL,5HSAA023573,ccgcccgcuagagacgauuccagccccuuccguccgccgcgcuccg...,3,0,1,1,0,1,1,...,3,11,1,4,0,3,4,9,3,4
1,SPATA18,5HSAA102977,ggguaucuauggccgggcucaggcggcugcuggggagccaggagac...,1,3,1,2,2,8,0,...,6,12,1,4,0,2,9,14,4,11
2,ITGB2,5HSAA054124,agaaagaagaguggcaugcuuugacagcaaguggacuccgagucca...,2,1,0,5,3,4,4,...,3,1,1,2,2,4,9,3,2,4
3,ZBP1,5HSAA120916,cagagcugcaagaagcaccaggcucggccacuucagaagccccagc...,0,0,0,5,2,4,2,...,4,0,0,2,0,1,0,3,0,1
4,ARFGAP1,5HSAA005989,acccgcaccccggaucuggaaucucccaccgguuacgggaguccag...,0,2,1,2,4,7,1,...,3,2,0,5,1,2,7,4,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55491,2-Mar,5HSAA063529,ggugcccggacgcaggugccggccggagcggagcuaguggcgccga...,0,2,0,0,0,3,0,...,3,5,0,0,0,5,7,16,3,11
55492,TBCE,5HSAA107408,agcacaagauucucuacacuuaacaaauggcugccggaaguaucug...,1,1,1,4,4,1,1,...,6,1,2,1,0,3,4,3,2,2
55493,SLCO2B1,5HSAA100847,gauaaaguacucccaggaaggcuuugagccuuggcagaaaaggcug...,5,1,0,6,1,4,1,...,4,1,3,0,0,3,6,6,4,5
55494,SNIP1,5HSAA101912,gauuucugggagccugggcaccccggaagcggaaguccaggaguua...,3,2,1,2,1,1,2,...,2,1,0,2,2,0,5,1,0,2


## Match CCDS to Genes

In [None]:
ccds_file = "/content/drive/MyDrive/ccds_2018/CCDS_nucleotide.current.fna" # @param {type:"string"}
ccds_text_file = "/content/drive/MyDrive/ccds_2018/CCDS.current.txt" # @param {type:"string"}

In [None]:
UTR_db['CCDS_ID'] = ''
UTR_db['CCDS'] = ''

In [None]:
from Bio import SeqIO
from tqdm.notebook import tqdm

ccds_attributes = pd.read_csv(ccds_text_file, delimiter='\t')

r = []
for record in tqdm(SeqIO.parse(ccds_file,'fasta')):
    r.append(record)

ccds_list = ccds_attributes['ccds_id'].values
for record in tqdm(r):
    ccds_id = record.id.split('|')[0]
    if ccds_id in ccds_list:
        if ccds_attributes[ccds_attributes['ccds_id'] == ccds_id]['ccds_status'].values[0] != 'Withdrawn':

            gene = ccds_attributes[ccds_attributes['ccds_id'] == ccds_id]['gene'].values[0]
            if len(UTR_db[UTR_db['GENE'] == gene]['CCDS_ID']) != 0:
                #new_utr_data[new_utr_data['GENE'] == gene[0]]['CCDS_ID'] = record.id.split('|')[0]
                for ind in UTR_db[UTR_db['GENE'] == gene]['CCDS'].index:
                    UTR_db.iloc[ind,-1] = str(record.seq).lower().replace('t','u')
                    UTR_db.iloc[ind,-2] = ccds_id


UTR_db['STARTPLUS25'] = ''
UTR_db['NUPACK_25'] = ''
UTR_db['NUPACK_25_MFE'] = ''
cols = UTR_db.columns.tolist()
cols =  cols[0:3]  + [cols[-3]] + [cols[-2]]+ [cols[-1]] + [cols[-5]] + [cols[-4]]+cols[3:-5]
UTR_db = UTR_db[cols]
UTR_db

0it [00:00, ?it/s]

  0%|          | 0/33420 [00:00<?, ?it/s]

Unnamed: 0,GENE,ID,SEQ,STARTPLUS25,NUPACK_25,NUPACK_25_MFE,CCDS_ID,CCDS,aaa,aac,...,gcu,gcg,gua,guc,guu,gug,gga,ggc,ggu,ggg
0,COBL,5HSAA023573,ccgcccgcuagagacgauuccagccccuuccguccgccgcgcuccg...,,,,CCDS87505.1,auggacgcgccgcgcgccucggcggccaagcccccgaccgggagga...,3,0,...,3,11,1,4,0,3,4,9,3,4
1,SPATA18,5HSAA102977,ggguaucuauggccgggcucaggcggcugcuggggagccaggagac...,,,,CCDS75124.1,auggcggaaaaccugaaaagacuggucucaaacgaaacuuuacgaa...,1,3,...,6,12,1,4,0,2,9,14,4,11
2,ITGB2,5HSAA054124,agaaagaagaguggcaugcuuugacagcaaguggacuccgagucca...,,,,CCDS13716.1,augcugggccugcgccccccacugcucgcccugguggggcugcucu...,2,1,...,3,1,1,2,2,4,9,3,2,4
3,ZBP1,5HSAA120916,cagagcugcaagaagcaccaggcucggccacuucagaagccccagc...,,,,CCDS54478.1,auggcccaggcuccugcugacccgggcagagaaggccaccuugaac...,0,0,...,4,0,0,2,0,1,0,3,0,1
4,ARFGAP1,5HSAA005989,acccgcaccccggaucuggaaucucccaccgguuacgggaguccag...,,,,CCDS63328.1,augcuaaguuccgagaguuccuggagucucaggaggauuacgaucc...,0,2,...,3,2,0,5,1,2,7,4,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55491,2-Mar,5HSAA063529,ggugcccggacgcaggugccggccggagcggagcuaguggcgccga...,,,,,,0,2,...,3,5,0,0,0,5,7,16,3,11
55492,TBCE,5HSAA107408,agcacaagauucucuacacuuaacaaauggcugccggaaguaucug...,,,,CCDS86060.1,auguauccaugugaacucuguuuuucaugcagucagcugagcaagu...,1,1,...,6,1,2,1,0,3,4,3,2,2
55493,SLCO2B1,5HSAA100847,gauaaaguacucccaggaaggcuuugagccuuggcagaaaaggcug...,,,,CCDS53679.1,augggacccaggauagaggauaugccacaggacuucaaggcuuccc...,5,1,...,4,1,3,0,0,3,6,6,4,5
55494,SNIP1,5HSAA101912,gauuucugggagccugggcaccccggaagcggaaguccaggaguua...,,,,CCDS419.1,augaaggcggugaagagcgaacgggagcgagggagccggcgaagac...,3,2,...,2,1,0,2,2,0,5,1,0,2


In [None]:
print('MATCHED CCDSs:')
print(len(UTR_db[UTR_db['CCDS'] != '']))

MATCHED CCDSs:
48171


In [None]:
UTR_db.tail()

Unnamed: 0,GENE,ID,SEQ,STARTPLUS25,NUPACK_25,NUPACK_25_MFE,CCDS_ID,CCDS,aaa,aac,...,gcu,gcg,gua,guc,guu,gug,gga,ggc,ggu,ggg
55491,2-Mar,5HSAA063529,ggugcccggacgcaggugccggccggagcggagcuaguggcgccga...,,,,,,0,2,...,3,5,0,0,0,5,7,16,3,11
55492,TBCE,5HSAA107408,agcacaagauucucuacacuuaacaaauggcugccggaaguaucug...,,,,CCDS86060.1,auguauccaugugaacucuguuuuucaugcagucagcugagcaagu...,1,1,...,6,1,2,1,0,3,4,3,2,2
55493,SLCO2B1,5HSAA100847,gauaaaguacucccaggaaggcuuugagccuuggcagaaaaggcug...,,,,CCDS53679.1,augggacccaggauagaggauaugccacaggacuucaaggcuuccc...,5,1,...,4,1,3,0,0,3,6,6,4,5
55494,SNIP1,5HSAA101912,gauuucugggagccugggcaccccggaagcggaaguccaggaguua...,,,,CCDS419.1,augaaggcggugaagagcgaacgggagcgagggagccggcgaagac...,3,2,...,2,1,0,2,2,0,5,1,0,2
55495,GLE1,5HSAA044320,gcgaccgaaaccuucucccuagccguuggcacguucugcucccggg...,,,,CCDS35154.1,augccgucugagggucgcugcugggagaccuugaaggcccuacgca...,1,4,...,2,1,0,1,3,3,1,1,2,1


In [None]:
#@title Nupack seq to dot function
def get_mfe_nupack(seq):

  model1 = Model(material='rna', celsius=37)
  example_hit = seq
  example_hit = Strand(example_hit, name='example_hit')
  t1 = Tube(strands={example_hit: 1e-8}, complexes=SetSpec(max_size=1), name='t1')
  hit_results = tube_analysis(tubes=[t1], model=model1,
      compute=['pairs', 'mfe', 'sample', 'ensemble_size'],
      options={'num_sample': 100}) # max_size=1 default
  mfe = hit_results[list(hit_results.complexes.keys())[0]].mfe
  return mfe, hit_results



## Select columns to add dot structures too

In [None]:
columns = "NUPACK_25,NUPACK25_MFE" # @param {type:"string"}
sequence_column = "STARTPLUS25" # @param {type:"string"}

# Run NUPACK and get dot structures

In [None]:
import pandas as pd
energies = []
dots = []
mfes = []
##UTR_db = pd.read_csv('/content/drive/MyDrive/5primeUTR_newutrdb_ML.csv')
#UTR_db.head()

from tqdm.notebook import trange, tqdm

k = 0

max_window = 300
for i in trange(0,len(UTR_db)):
  utr_seq = UTR_db['SEQ'][i]
  if len(UTR_db['CCDS'][i]) != 0:
    ccds = UTR_db['CCDS'][i]
    mature_mrna = utr_seq + ccds
    ### UTR + 25 NT near start
    if len(utr_seq) > max_window-25:
      seq = utr_seq[-max_window+25:] + ccds[:25]
    else:
      seq = utr_seq + ccds[:25]
    mfe,hr =  get_mfe_nupack(seq)
    mfes.append(mfe)
    UTR_db.iloc[i,3] = seq
    UTR_db.iloc[i,4] = str(mfe[0][0])
    UTR_db.iloc[i,5]  = mfe[0][1]
  k+=1




In [None]:
UTR_db

Unnamed: 0,GENE,ID,SEQ,STARTPLUS25,NUPACK_25,NUPACK_25_MFE,aaa,aac,aau,aag,...,gua,guc,guu,gug,gga,ggc,ggu,ggg,CCDS_ID,CCDS
0,COBL,5HSAA023573,ccgcccgcuagagacgauuccagccccuuccguccgccgcgcuccg...,,,ccgcccgcuagagacgauuccagccccuuccguccgccgcgcuccg...,..(((((((.(((.....((..((..((..(..(((((((.(.(((...,-101.721931,1,1,...,1,4,0,3,4,9,3,4,CCDS87505.1,auggacgcgccgcgcgccucggcggccaagcccccgaccgggagga...
1,SPATA18,5HSAA102977,ggguaucuauggccgggcucaggcggcugcuggggagccaggagac...,,,uaucuauggccgggcucaggcggcugcuggggagccaggagaccgc...,(((((...((((..(((..((((((.((((....)))).)).))))...,-117.954300,1,2,...,1,4,0,2,9,14,4,11,CCDS75124.1,auggcggaaaaccugaaaagacuggucucaaacgaaacuuuacgaa...
2,ITGB2,5HSAA054124,agaaagaagaguggcaugcuuugacagcaaguggacuccgagucca...,,,agaaagaagaguggcaugcuuugacagcaaguggacuccgagucca...,........((((.((.((((.....)))).))..))))...((((....,-73.670624,0,5,...,1,2,2,4,9,3,2,4,CCDS13716.1,augcugggccugcgccccccacugcucgcccugguggggcugcucu...
3,ZBP1,5HSAA120916,cagagcugcaagaagcaccaggcucggccacuucagaagccccagc...,,,cagagcugcaagaagcaccaggcucggccacuucagaagccccagc...,....(((......)))...(((((.(((..........)))..)))...,-51.613644,0,5,...,0,2,0,1,0,3,0,1,CCDS54478.1,auggcccaggcuccugcugacccgggcagagaaggccaccuugaac...
4,ARFGAP1,5HSAA005989,acccgcaccccggaucuggaaucucccaccgguuacgggaguccag...,,,acccgcaccccggaucuggaaucucccaccgguuacgggaguccag...,..(((.....)))..(((((..(((((.........))))))))))...,-83.217552,1,2,...,0,5,1,2,7,4,4,6,CCDS63328.1,augcuaaguuccgagaguuccuggagucucaggaggauuacgaucc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55491,2-Mar,5HSAA063529,ggugcccggacgcaggugccggccggagcggagcuaguggcgccga...,,,,0,2.000000,0,0,...,0,0,0,5,7,16,3,11,,
55492,TBCE,5HSAA107408,agcacaagauucucuacacuuaacaaauggcugccggaaguaucug...,,,,1,1.000000,1,4,...,2,1,0,3,4,3,2,2,CCDS86060.1,auguauccaugugaacucuguuuuucaugcagucagcugagcaagu...
55493,SLCO2B1,5HSAA100847,gauaaaguacucccaggaaggcuuugagccuuggcagaaaaggcug...,,,,5,1.000000,0,6,...,3,0,0,3,6,6,4,5,CCDS53679.1,augggacccaggauagaggauaugccacaggacuucaaggcuuccc...
55494,SNIP1,5HSAA101912,gauuucugggagccugggcaccccggaagcggaaguccaggaguua...,,,,3,2.000000,1,2,...,0,2,2,0,5,1,0,2,CCDS419.1,augaaggcggugaagagcgaacgggagcgagggagccggcgaagac...


## Save the file

In [None]:
UTR_db.to_csv('/content/drive/MyDrive/5primeUTR.csv')