In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from collections import defaultdict
import gzip
import numpy as np
from tqdm import tqdm

# one-hot encoder

def encode_one_hot(train_set, classes="AGCT"):
    encoder = defaultdict(lambda: np.array([0]*len(classes)))
        
    for i, _class in enumerate(classes):
        tmp = np.zeros(len(classes))
        tmp[i] = 1
        encoder[_class] = tmp
        
        
    output = []
    for record in tqdm(train_set):
        encoded_record = []
        for c in record.upper():
            encoded_record.append(encoder[c])
            
        encoded_record = np.ravel(encoded_record)
        output.append(encoded_record)
        
    output = np.array(output)
    
    return output

In [5]:
# Parsing
import gzip
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# single dataset version
def get_dataset(path):
    dataset =  gzip.open(path, "r")
    output = []    
    for i, record in tqdm(enumerate(dataset)):
        record = record.decode()
        record = record[:-1]
        record = record.split('\t')
        if i != 0:
            output.append(record)
    output = np.array(output).reshape([-1, 4])
    
    return encode_one_hot(output[:, 2]), np.array([int(i) for i in output[:, 3]]).flatten()

x1, y1 = get_dataset("/content/drive/Shareddrives/GP/Aptamer/data/DeepBind/Alx1_DBD_TAAAGC20NCG_3_Z_A.seq.gz")
x2, y2 = get_dataset("/content/drive/Shareddrives/GP/Aptamer/data/DeepBind/Alx1_DBD_TAAAGC20NCG_3_Z_B.seq.gz")
x = np.append(x1, x2, axis=0)
y = np.append(y1, y2, axis=0)

# 데이터셋 이름 변경 (valid -> test), validation을 진행 안하네?
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=112)

##### 삭제 ######
#train_x = train_x.reshape(train_x.shape[0], (train_x.shape[1]*train_x.shape[2]))
#valid_x = valid_x.reshape(valid_x.shape[0], (valid_x.shape[1]*valid_x.shape[2]))
#del(y)
#################

#reshape 안해줬는데 어떻게 80이뜨지?
print(train_x.shape)
print(train_y.shape)

print(test_x.shape)
print(test_y.shape)

del(y)

128013it [00:00, 147431.49it/s]
100%|██████████| 128012/128012 [00:02<00:00, 63453.11it/s]
255509it [00:01, 199472.33it/s]
100%|██████████| 255508/255508 [00:04<00:00, 63640.55it/s]


(306816, 80)
(306816,)
(76704, 80)
(76704,)


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score


tree = DecisionTreeClassifier(max_depth=None,
                              criterion='entropy',
                              random_state=1)



#AdaBoost
AdaBoost = AdaBoostClassifier(base_estimator=tree,
                            n_estimators=100,
                            learning_rate=0.1,
                            random_state=1)

AdaBoost.fit(train_x, train_y)
y_pred = AdaBoost.predict(test_x)

In [7]:
score = metrics.roc_auc_score(test_y, y_pred)
print(f'AdaBoost Score : {score}')
score1 = classification_report(test_y, y_pred, digits=5)  #모델 검사
print(score1)


AdaBoost Score : 0.8379406078386383
              precision    recall  f1-score   support

           0    0.79054   0.77893   0.78469     25544
           1    0.89042   0.89695   0.89368     51160

    accuracy                        0.85765     76704
   macro avg    0.84048   0.83794   0.83918     76704
weighted avg    0.85716   0.85765   0.85738     76704



In [8]:
#Gradient Boosting
GradientBoost = GradientBoostingClassifier(random_state=1)
GradientBoost.fit(train_x, train_y)
y_pred = GradientBoost.predict(test_x)

In [9]:
score = metrics.roc_auc_score(test_y, y_pred)
print(f'GradientBoost Score : {score}')
score2 = classification_report(test_y, y_pred, digits=5)  #모델 검사
print(score2)

GradientBoost Score : 0.715049161978889
              precision    recall  f1-score   support

           0    0.81824   0.48375   0.60803     25544
           1    0.78593   0.94634   0.85871     51160

    accuracy                        0.79229     76704
   macro avg    0.80208   0.71505   0.73337     76704
weighted avg    0.79669   0.79229   0.77523     76704



In [10]:
#onehot_decoder
def onehot_decoder(seq, word=['A','G','C','T']):
    out = ""
    
    for i in range(9, 29):
        arg = np.argmax(seq[i])
        out += word[arg]
    #out+="/"
        
    return out

In [31]:
#Genetic Algorithm
 
#Initialize
def init():
  pool = []
  for i in range(30):   #유전자풀은 30개, 랜덤으로 초기화함
    seq = []
    for i in range(9):
        seq.append([.25, .25, .25, .25])
    for i in range(20):
        acid = [0, 0, 0, 0]
        n = np.random.randint(0,4)
        acid[n] = 1
        seq.append(acid)
    for i in range(9):
        seq.append([.25, .25, .25, .25])
    pool.append(seq)
  return np.array(pool)
 
#fitness
def fitness(model, pool):
  scores = [0]*30
  for i in range(30):
    proba = model.predict_proba([pool[i][9:29].ravel()])
    scores[i] = proba[0,1]
  
  return scores
 
#selection
def selection(pool, scores):
  temp = []
  while len(temp) < 8:  #적합도가 가장 높은 8개의 유전자를 선정
    index = np.argmax(scores)
    if pool[index] not in temp:
      temp.append(pool[index])
    del scores[index]
    del pool[index]
 
  p1 = temp[0][:] #4개의 부모 유전자
  p2 = temp[1][:]
  p3 = temp[2][:]
  p4 = temp[3][:]
  x = np.random.randint(9,29) #교차 위치 선정
  c1 = p1[:x] + p2[x:] #4개의 자식 유전자 생성
  c2 = p2[:x] + p1[x:]
  x = np.random.randint(9,29)
  c3 = p3[:x] + p4[x:]
  c4 = p4[:x] + p3[x:]
  temp.append(c1)
  temp.append(c2)
  temp.append(c3)
  temp.append(c4)
 
  n = np.random.randint(10,16)  #부모 및 자식의 돌연변이 갯수 선정
  for i in range(n):
    index = np.random.randint(0,12)  #오리지날 부모 및 자식 중 하나 선택
    temp_seq = temp[index][:]
    pos_a = np.random.randint(9,29) #돌연변이 시작위치 선정
    pos_b = np.random.randint(pos_a,29) #돌연변이 끝 위치 선정
    for j in range(pos_a, pos_b+1): #pos_a 부터 pos_b까지의 모든 아미노산을 돌연변이 시킴
      temp_acid = [0, 0, 0, 0]
      r = np.random.randint(0,4)  #돌연변이 값 (A,G,T,C 중 하나) 선정
      temp_acid[r] = 1
      temp_seq[j] = temp_acid[:]
 
    temp.append(temp_seq)
 
  for i in range(18-n): #부모와 자식, 돌연변이 외 나머지는 완전 무작위로 sequence를 생성
    seq = []
    for i in range(9):
        seq.append([.25, .25, .25, .25])
    for i in range(20):
        acid = [0, 0, 0, 0]
        n = np.random.randint(0,4)
        acid[n] = 1
        seq.append(acid)
    for i in range(9):
        seq.append([.25, .25, .25, .25])

    temp.append(seq)
 
  return np.array(temp)

In [32]:
import tensorflow as tf
import operator
# load model
deepbind = tf.keras.models.load_model("/content/CNN_Final_Alx1.h5")

In [35]:
def scoring(model):
  deepScores = []
  for i in tqdm(range(20)):
    pool = init()
    for j in range(50):
      scores = fitness(model, pool)
      pool = selection(pool.tolist(), scores)[:]
    
    scores = fitness(model, pool)
    index = np.argmax(scores)
    most_seq = pool[index]
    decode_seq = onehot_decoder(most_seq)
    deep_score = deepbind.predict(most_seq.reshape(1,38,4))[0][0]
    print(f'max score sequence : {decode_seq}, deepbind score : {deep_score}')
    deepScores.append([decode_seq, deep_score])

  return deepScores

In [36]:
AdaScores = scoring(AdaBoost)

  5%|▌         | 1/20 [00:33<10:33, 33.32s/it]

max score sequence : CAGCAGCAGTAACTCAATCA, deepbind score : 0.9954578876495361


 10%|█         | 2/20 [01:06<09:56, 33.12s/it]

max score sequence : GAATCAACGGTGATTAAGAT, deepbind score : 0.8401716351509094


 15%|█▌        | 3/20 [01:40<09:29, 33.51s/it]

max score sequence : GAGTTGGTAATTGGATTAGA, deepbind score : 0.9998598098754883


 20%|██        | 4/20 [02:13<08:57, 33.59s/it]

max score sequence : ACGAATAATGTAATTACTGA, deepbind score : 0.9995658993721008


 25%|██▌       | 5/20 [02:47<08:25, 33.68s/it]

max score sequence : CAATCGGAGTAATGAAATCA, deepbind score : 0.967224657535553


 30%|███       | 6/20 [03:21<07:52, 33.78s/it]

max score sequence : AATTAGAAGTACCCGGCTGA, deepbind score : 0.8647922277450562


 35%|███▌      | 7/20 [03:55<07:18, 33.73s/it]

max score sequence : GGGGCGTAGTCGCCAGGCGT, deepbind score : 0.7196805477142334


 40%|████      | 8/20 [04:27<06:40, 33.36s/it]

max score sequence : TGGGTAATTAATTCAGTTAA, deepbind score : 0.9984844326972961


 45%|████▌     | 9/20 [05:01<06:06, 33.33s/it]

max score sequence : TAATTAGGGGGGGAATAGGT, deepbind score : 0.8265032768249512


 50%|█████     | 10/20 [05:35<05:35, 33.53s/it]

max score sequence : CAATTAGCGCGAGTTCTGGT, deepbind score : 0.9020612835884094


 55%|█████▌    | 11/20 [06:08<05:02, 33.59s/it]

max score sequence : CCAGAAGTCACGCTCGCGGC, deepbind score : 0.730219841003418


 60%|██████    | 12/20 [06:42<04:29, 33.66s/it]

max score sequence : TGCCCGCACTGCACTTAGCT, deepbind score : 0.720783531665802


 65%|██████▌   | 13/20 [07:16<03:56, 33.76s/it]

max score sequence : GTCGTGTGATCGCATGAGGT, deepbind score : 0.6286212205886841


 70%|███████   | 14/20 [07:50<03:22, 33.75s/it]

max score sequence : CGTTACGTGATTCCACTCGA, deepbind score : 0.7992559671401978


 75%|███████▌  | 15/20 [08:24<02:48, 33.72s/it]

max score sequence : AATTACCGGATAGATGAAGT, deepbind score : 0.7661057114601135


 80%|████████  | 16/20 [08:58<02:15, 33.86s/it]

max score sequence : TACCCAGCTCGTCCAGGTCA, deepbind score : 0.6573156118392944


 85%|████████▌ | 17/20 [09:31<01:41, 33.76s/it]

max score sequence : GGGGTGCGCTGTGCGGGCGT, deepbind score : 0.8616964221000671


 90%|█████████ | 18/20 [10:04<01:06, 33.39s/it]

max score sequence : ACATTAATCGGGAATTAGTG, deepbind score : 0.9234344959259033


 95%|█████████▌| 19/20 [10:37<00:33, 33.37s/it]

max score sequence : GGCAAGGGTGTTGGTCTGAG, deepbind score : 0.6788474321365356


100%|██████████| 20/20 [11:10<00:00, 33.51s/it]

max score sequence : CAGGCACGGGCCCCTGGGAC, deepbind score : 0.7438989281654358





In [37]:
ordered_Scores = sorted(AdaScores, key=operator.itemgetter(1), reverse=True)


In [38]:
f = open("/content/AdaBoost_ScoreResult_Alx1.txt", 'w')
for seq, score in ordered_Scores:
  f.write(f'{seq} {score}\n')
f.close()

In [39]:
GradientScores = scoring(GradientBoost)

  5%|▌         | 1/20 [00:00<00:15,  1.23it/s]

max score sequence : TAATTAGTTAATTTAGTTGT, deepbind score : 0.9988328814506531


 10%|█         | 2/20 [00:01<00:14,  1.28it/s]

max score sequence : TTGATGGAATTAACTTAACC, deepbind score : 0.9830735921859741


 15%|█▌        | 3/20 [00:02<00:13,  1.23it/s]

max score sequence : AGTCAATTTAATTTCACTCA, deepbind score : 0.9850715398788452


 20%|██        | 4/20 [00:03<00:12,  1.27it/s]

max score sequence : TCCGCCGGTAATTCAGTTGT, deepbind score : 0.9967691898345947


 25%|██▌       | 5/20 [00:03<00:11,  1.25it/s]

max score sequence : CAATTAATTGAATTAAGGGT, deepbind score : 0.9999679327011108


 30%|███       | 6/20 [00:04<00:11,  1.25it/s]

max score sequence : TCGTTAATTGAATTAAGGGT, deepbind score : 0.9999635219573975


 35%|███▌      | 7/20 [00:05<00:10,  1.24it/s]

max score sequence : CGAGTAGTTAATTGAATTAA, deepbind score : 0.9997480511665344


 40%|████      | 8/20 [00:06<00:09,  1.22it/s]

max score sequence : AGCCCGAATTAGCTTAACCT, deepbind score : 0.9013445973396301


 45%|████▌     | 9/20 [00:07<00:08,  1.25it/s]

max score sequence : GTCACTGTAATTGGATTAGT, deepbind score : 0.9999624490737915


 50%|█████     | 10/20 [00:07<00:07,  1.27it/s]

max score sequence : CAATCACTTAATTCAATTGC, deepbind score : 0.9983460903167725


 55%|█████▌    | 11/20 [00:08<00:07,  1.24it/s]

max score sequence : TACTCAATTGAATTAAGGGT, deepbind score : 0.9948840737342834


 60%|██████    | 12/20 [00:09<00:06,  1.22it/s]

max score sequence : CCATCAAATTAATTGAGGGT, deepbind score : 0.7260397672653198


 65%|██████▌   | 13/20 [00:10<00:05,  1.22it/s]

max score sequence : TACTTAAATTAGGTTAAGGT, deepbind score : 0.9443314671516418


 70%|███████   | 14/20 [00:11<00:04,  1.25it/s]

max score sequence : TCGTTAATTAAGTTAATTGT, deepbind score : 0.9998806715011597


 75%|███████▌  | 15/20 [00:11<00:03,  1.27it/s]

max score sequence : TCAGGCCAATCGATTAACGT, deepbind score : 0.5872976183891296


 80%|████████  | 16/20 [00:12<00:03,  1.26it/s]

max score sequence : GCACCCAATTCAATTAACCA, deepbind score : 0.9989294409751892


 85%|████████▌ | 17/20 [00:13<00:02,  1.25it/s]

max score sequence : AATTAACTTAATTGAGTGGT, deepbind score : 0.9998165965080261


 90%|█████████ | 18/20 [00:14<00:01,  1.24it/s]

max score sequence : GTCATTAATTTAATTGACCT, deepbind score : 0.9978876709938049


 95%|█████████▌| 19/20 [00:15<00:00,  1.24it/s]

max score sequence : TACTGCAATTAGGTTAGCCT, deepbind score : 0.9877099990844727


100%|██████████| 20/20 [00:16<00:00,  1.24it/s]

max score sequence : TAATTAAATTAGATTAGGCC, deepbind score : 0.9999167919158936





In [40]:
ordered_Scores = sorted(GradientScores, key=operator.itemgetter(1), reverse=True)


In [41]:
f = open("/content/Gradient_ScoreResult_Alx1.txt", 'w')
for seq, score in ordered_Scores:
  f.write(f'{seq} {score}\n')
f.close()