In [None]:
#@title mount your google drive
# Mount Google Drive at /content/drive so that files stored on the Drive can be
# reached through the local filesystem by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title change directory and import modules

import os
import numpy as np
import tensorflow as tf

# Work from the project folder on the mounted Drive: later cells build their data
# and model paths from os.getcwd(), so this must run before them.
os.chdir("/content/drive/MyDrive/Commit_Folder/shRNAI")

# NOTE(review): this import comes after os.chdir, presumably so that the shRNAI
# package is resolved relative to the project folder — confirm.
# The star import presumably supplies get_Annotation, get_Sequence and convert,
# which the cells below call — TODO confirm and consider importing them by name.
from shRNAI.module_simple import *

In [None]:
#@title download annotation files.

if not os.path.exists('./data'): os.makedirs('./data')
!wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/gencode.v36.basic.annotation.gtf.gz ./data/
!wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/gencode.v36.pc_transcripts.fa.gz ./data/

!gzip -d ./data/gencode.v36.basic.annotation.gtf.gz
!gzip -d ./data/gencode.v36.pc_transcripts.fa.gz

In [None]:
#@title specify gene name and its target region (CDS or 3UTR)
# Gene whose transcripts are scored; used as the lookup key into pairDic.
target = 'PTEN'
# Transcript region handed to get_Sequence: 'CDS' or '3UTR'.
region = 'CDS'

In [None]:
#@title  get annotation and sequence
# Both inputs are the decompressed GENCODE v36 files under <cwd>/data.
annoF = f"{os.getcwd()}/data/gencode.v36.basic.annotation.gtf"
seqF = f"{os.getcwd()}/data/gencode.v36.pc_transcripts.fa"

# The parsed annotation is passed on to get_Sequence together with the chosen region.
# Downstream cells read pairDic[target] as transcript IDs and seqDic[transcript ID]
# as that transcript's sequence.
annoDic = get_Annotation(annoF)
seqDic, pairDic = get_Sequence(seqF, region, annoDic)


In [None]:
#@title  load models
# Both Keras models are read from HDF5 files under <cwd>/models.
# model_pri scores the pri inputs; its prediction is later passed to model_22nt
# as a second input.
model_pri = tf.keras.models.load_model(f"{os.getcwd()}/models/pri.h5")
model_22nt = tf.keras.models.load_model(f"{os.getcwd()}/models/22nt.h5")


In [None]:
#@title  calculate scores
# total_score: sequence from seqK -> list of scores, one entry per transcript
#              of the target gene in which that sequence occurs.
# priDic:      sequence from seqK -> matching entry of priK (printed later as "97mer").
total_score = dict(); priDic = dict()
for txnID in set(pairDic[target]):
    txn_seq = seqDic[txnID]
    # presumably convert() needs at least one 22-nt window — TODO confirm
    if len(txn_seq) < 22: continue
    seqK, priK, onehotK, onehotK_pri = convert(txn_seq)

    # The pri model's predictions, reshaped to a single column, are given to the
    # 22nt model as a second input; its output is flattened to one score per entry.
    pris = model_pri.predict(onehotK_pri).reshape(-1,1)
    outs = model_22nt.predict([onehotK, pris]).flatten()

    seen_in_txn = set()
    for seq, pri, out in zip(seqK, priK, outs):
        priDic[seq] = pri

        # Record a sequence only once per transcript. The next cell keeps sequences
        # whose score count equals the number of transcripts, so a sequence that
        # repeats inside one transcript must not be counted more than once.
        if seq in seen_in_txn: continue
        seen_in_txn.add(seq)
        total_score.setdefault(seq, []).append(out)




In [None]:
#@title  print top 5

# Keep only sequences whose score list has as many entries as the target gene has
# transcripts; the first recorded score stands for the sequence.
# NOTE(review): transcripts shorter than 22 nt are skipped when scoring, so if the
# target has one, no sequence can reach this count and nothing is printed — confirm
# whether that is intended.
n_transcripts = len(set(pairDic[target]))
hit_score = {
    seq: scores[0]
    for seq, scores in total_score.items()
    if len(scores) == n_transcripts
}

# Ascending by score; the best five are then read from the tail, highest first.
hit_score_items = sorted(hit_score.items(), key=lambda row: row[1])

# select top 5
for n, (seq, score) in enumerate(reversed(hit_score_items[-5:]), start=1):
    print("Top",n)
    print("Mature sequence:",seq)
    print("Score:", round(score,4))
    print("97mer:", priDic[seq])


Top 1
Mature sequence: ATAGTTTCAAACATCATCTTGT
Score: 0.7972
97mer: GGTATATTGCTGTTGACAGTGAGCGCCAAGATGATGTTTGAAACTATTAGTGAAGCCACAGATGTAATAGTTTCAAACATCATCTTGTTGCCTACTGCCTCGGAATTCAAGGG
Top 2
Mature sequence: TTCATTGTCACTAACATCTGGT
Score: 0.7761
97mer: GGTATATTGCTGTTGACAGTGAGCGCCCAGATGTTAGTGACAATGAATAGTGAAGCCACAGATGTATTCATTGTCACTAACATCTGGTTGCCTACTGCCTCGGAATTCAAGGG
Top 3
Mature sequence: TTTTTCTGAGGTTTCCTCTGGT
Score: 0.7557
97mer: GGTATATTGCTGTTGACAGTGAGCGCCCAGAGGAAACCTCAGAAAAATAGTGAAGCCACAGATGTATTTTTCTGAGGTTTCCTCTGGTTGCCTACTGCCTCGGAATTCAAGGG
Top 4
Mature sequence: TACATCATCAATATTGTTCCTG
Score: 0.7555
97mer: GGTATATTGCTGTTGACAGTGAGCGAAGGAACAATATTGATGATGTATAGTGAAGCCACAGATGTATACATCATCAATATTGTTCCTGTGCCTACTGCCTCGGAATTCAAGGG
Top 5
Mature sequence: TATATCTTCACCTTTAGCTGGC
Score: 0.7437
97mer: GGTATATTGCTGTTGACAGTGAGCGACCAGCTAAAGGTGAAGATATATAGTGAAGCCACAGATGTATATATCTTCACCTTTAGCTGGCTGCCTACTGCCTCGGAATTCAAGGG
