In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
def metrics(y_test, pred):
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print('정확도 : {0:.2f}, 정밀도 : {1:.2f}, 재현율 : {2:.2f}'.format(accuracy, precision, recall))
    print('f1-score : {0:.2f}'.format(f1))

In [3]:
from collections import defaultdict
import gzip
import numpy as np
from tqdm import tqdm

# one-hot encoder

def encode_one_hot(train_set, classes="AGCT"):
    encoder = defaultdict(lambda: np.array([0]*len(classes)))
        
    for i, _class in enumerate(classes):
        tmp = np.zeros(len(classes))
        tmp[i] = 1
        encoder[_class] = tmp
        
    output = []
    for record in tqdm(train_set):
        encoded_record = []
        
        for c in record.upper():
            encoded_record.append(encoder[c])

        encoded_record = np.ravel(encoded_record)    
        output.append(encoded_record)
        
    output = np.array(output)
    
    return output

In [4]:
# Parsing
import gzip
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# single dataset version
def get_dataset(path):
    dataset =  gzip.open(path, "r")
    output = []    
    for i, record in tqdm(enumerate(dataset)):
        record = record.decode()
        record = record[:-1]
        record = record.split('\t')
        if i != 0:
            output.append(record)
    output = np.array(output).reshape([-1, 4])
    
    return encode_one_hot(output[:, 2]), np.array([int(i) for i in output[:, 3]]).flatten()

x1, y1 = get_dataset("/content/drive/Shareddrives/GP/Aptamer/data/DeepBind/Alx1_DBD_TAAAGC20NCG_3_Z_A.seq.gz")
x2, y2 = get_dataset("/content/drive/Shareddrives/GP/Aptamer/data/DeepBind/Alx1_DBD_TAAAGC20NCG_3_Z_B.seq.gz")
x = np.append(x1, x2, axis=0)
y = np.append(y1, y2, axis=0)

# 데이터셋 이름 변경 (valid -> test), validation을 진행 안하네?
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=112)



print(train_x.shape)
print(train_y.shape)

print(test_x.shape)
print(test_y.shape)

del(y)

128013it [00:00, 162810.85it/s]
100%|██████████| 128012/128012 [00:02<00:00, 63374.34it/s]
255509it [00:01, 189335.72it/s]
100%|██████████| 255508/255508 [00:03<00:00, 65338.73it/s]


(306816, 80)
(306816,)
(76704, 80)
(76704,)


In [5]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np

#train_x.flatten()
#valid_x.flatten()

# 변수이름변경에 따른 변경 (valid -> test)
train_data = lgb.Dataset(train_x, label=train_y)
test_data = lgb.Dataset(test_x, label=test_y)

In [6]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score

#train
parameters = {
    'learning_rate': 0.05,
    'n_estimator': 200,
    'max_depth': 16,
    'boosting': 'dart',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'is_training_metric': True,
    'num_leaves': 144,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
}

model = lgb.train(parameters, train_data, 1000, test_data, verbose_eval=100) 
# 변수이름바꿈
y = model.predict(test_x)

[100]	valid_0's binary_logloss: 0.392816
[200]	valid_0's binary_logloss: 0.341382
[300]	valid_0's binary_logloss: 0.301998
[400]	valid_0's binary_logloss: 0.276498
[500]	valid_0's binary_logloss: 0.258536
[600]	valid_0's binary_logloss: 0.253532
[700]	valid_0's binary_logloss: 0.244397
[800]	valid_0's binary_logloss: 0.240951
[900]	valid_0's binary_logloss: 0.235618
[1000]	valid_0's binary_logloss: 0.233091


In [7]:
for i in range(len(y)):
  if y[i] > 0.5:
    y[i] = 1
  else:
    y[i] = 0

score = metrics.roc_auc_score(test_y, y)
print(score) # 0.9099916562369629
score1 = classification_report(test_y, y, digits=5)  #모델 검사
print(score1)

0.8985362086287758
              precision    recall  f1-score   support

           0    0.86532   0.86423   0.86478     25544
           1    0.93226   0.93284   0.93255     51160

    accuracy                        0.90999     76704
   macro avg    0.89879   0.89854   0.89866     76704
weighted avg    0.90996   0.90999   0.90998     76704



In [8]:
#onehot_decoder
def onehot_decoder(seq, word=['A','G','C','T']):
    out = ""
    
    for i in range(9, 29):
        arg = np.argmax(seq[i])
        out += word[arg]
    #out+="/"
        
    return out

In [23]:
#Genetic Algorithm
 
#Initialize
def init():
  pool = []
  for i in range(30):   #유전자풀은 30개, 랜덤으로 초기화함
    seq = []
    for i in range(9):
        seq.append([.25, .25, .25, .25])
    for i in range(20):
        acid = [0, 0, 0, 0]
        n = np.random.randint(0,4)
        acid[n] = 1
        seq.append(acid)
    for i in range(9):
        seq.append([.25, .25, .25, .25])
    pool.append(seq)
  return np.array(pool)
 
#fitness
def fitness(model, pool):
  scores = [0]*30
  for i in range(30):
    proba = model.predict([pool[i][9:29].ravel()])
    scores[i] = proba
  
  return scores
 
#selection
def selection(pool, scores):
  temp = []
  while len(temp) < 8:  #적합도가 가장 높은 8개의 유전자를 선정
    index = np.argmax(scores)
    if pool[index] not in temp:
      temp.append(pool[index])
    del scores[index]
    del pool[index]
 
  p1 = temp[0][:] #4개의 부모 유전자
  p2 = temp[1][:]
  p3 = temp[2][:]
  p4 = temp[3][:]
  x = np.random.randint(9,29) #교차 위치 선정
  c1 = p1[:x] + p2[x:] #4개의 자식 유전자 생성
  c2 = p2[:x] + p1[x:]
  x = np.random.randint(9,29)
  c3 = p3[:x] + p4[x:]
  c4 = p4[:x] + p3[x:]
  temp.append(c1)
  temp.append(c2)
  temp.append(c3)
  temp.append(c4)
 
  n = np.random.randint(10,16)  #부모 및 자식의 돌연변이 갯수 선정
  for i in range(n):
    index = np.random.randint(0,12)  #오리지날 부모 및 자식 중 하나 선택
    temp_seq = temp[index][:]
    pos_a = np.random.randint(9,29) #돌연변이 시작위치 선정
    pos_b = np.random.randint(pos_a,29) #돌연변이 끝 위치 선정
    for j in range(pos_a, pos_b+1): #pos_a 부터 pos_b까지의 모든 아미노산을 돌연변이 시킴
      temp_acid = [0, 0, 0, 0]
      r = np.random.randint(0,4)  #돌연변이 값 (A,G,T,C 중 하나) 선정
      temp_acid[r] = 1
      temp_seq[j] = temp_acid[:]
 
    temp.append(temp_seq)
 
  for i in range(18-n): #부모와 자식, 돌연변이 외 나머지는 완전 무작위로 sequence를 생성
    seq = []
    for i in range(9):
        seq.append([.25, .25, .25, .25])
    for i in range(20):
        acid = [0, 0, 0, 0]
        n = np.random.randint(0,4)
        acid[n] = 1
        seq.append(acid)
    for i in range(9):
        seq.append([.25, .25, .25, .25])

    temp.append(seq)
 
  return np.array(temp)

In [24]:
import tensorflow as tf
import operator
# load model
deepbind = tf.keras.models.load_model("/content/CNN_Final_Alx1.h5")

In [25]:
def scoring(model):
  deepScores = []
  for i in tqdm(range(20)):
    pool = init()
    for j in range(50):
      scores = fitness(model, pool)
      pool = selection(pool.tolist(), scores)[:]
    
    scores = fitness(model, pool)
    index = np.argmax(scores)
    most_seq = pool[index]
    decode_seq = onehot_decoder(most_seq)
    deep_score = deepbind.predict(most_seq.reshape(1,38,4))[0][0]
    print(f'max score sequence : {decode_seq}, deepbind score : {deep_score}')
    deepScores.append([decode_seq, deep_score])

  return deepScores

In [26]:
deepScores = scoring(model)

  5%|▌         | 1/20 [00:00<00:16,  1.15it/s]

max score sequence : CGGCGGTAATCTAATTAGCA, deepbind score : 0.9999942183494568


 10%|█         | 2/20 [00:01<00:14,  1.23it/s]

max score sequence : CGCCACGGCTAATCTAATTA, deepbind score : 0.9999815225601196


 15%|█▌        | 3/20 [00:02<00:13,  1.22it/s]

max score sequence : TGTTAATTAGATTAGGGCGC, deepbind score : 0.9999310970306396


 20%|██        | 4/20 [00:03<00:13,  1.23it/s]

max score sequence : AAATCGAATTACGGGGGCCT, deepbind score : 0.9982835054397583


 25%|██▌       | 5/20 [00:04<00:11,  1.26it/s]

max score sequence : GTGGGGCCCGCCGCGGCCCA, deepbind score : 0.7794436812400818


 30%|███       | 6/20 [00:04<00:11,  1.26it/s]

max score sequence : ACTCTAATTAACTTAATTAA, deepbind score : 0.9999644160270691


 35%|███▌      | 7/20 [00:05<00:10,  1.28it/s]

max score sequence : CGCCACCCGGCTCGCGGGGT, deepbind score : 0.7933926582336426


 40%|████      | 8/20 [00:06<00:09,  1.27it/s]

max score sequence : TAATTAAATAATTTAATTAA, deepbind score : 0.9997058510780334


 45%|████▌     | 9/20 [00:07<00:08,  1.27it/s]

max score sequence : ACCATAATTAGATTAAGCGT, deepbind score : 0.9999260902404785


 50%|█████     | 10/20 [00:07<00:07,  1.27it/s]

max score sequence : TGATTTAATTAGATTAGCGC, deepbind score : 0.9999479055404663


 55%|█████▌    | 11/20 [00:08<00:07,  1.27it/s]

max score sequence : AGGCCGTAATTCGATTAGCT, deepbind score : 0.9998949766159058


 60%|██████    | 12/20 [00:09<00:06,  1.28it/s]

max score sequence : AGACTGGGGTGAGCCGCCCT, deepbind score : 0.7000094056129456


 65%|██████▌   | 13/20 [00:10<00:05,  1.27it/s]

max score sequence : CTGGTCGCTCCCTCGCGCGA, deepbind score : 0.7683765888214111


 70%|███████   | 14/20 [00:11<00:04,  1.26it/s]

max score sequence : CGCGGTTAATTCAATTAGCA, deepbind score : 0.99994957447052


 75%|███████▌  | 15/20 [00:11<00:03,  1.28it/s]

max score sequence : TGCCCGCGAGCCGGGGCCCT, deepbind score : 0.770804762840271


 80%|████████  | 16/20 [00:12<00:03,  1.27it/s]

max score sequence : TGGGGGTAATCCAATTAACT, deepbind score : 0.9999834299087524


 85%|████████▌ | 17/20 [00:13<00:02,  1.27it/s]

max score sequence : AGGCCGGGTAATTCAATTAG, deepbind score : 0.9999028444290161


 90%|█████████ | 18/20 [00:14<00:01,  1.20it/s]

max score sequence : AATTAGGGGGGCGCCGCGCT, deepbind score : 0.9708648920059204


 95%|█████████▌| 19/20 [00:15<00:00,  1.22it/s]

max score sequence : AATTATTGGATTAGCCGGCT, deepbind score : 0.9980596303939819


100%|██████████| 20/20 [00:15<00:00,  1.25it/s]

max score sequence : TGAGCGGGGGGGGCCGCCCT, deepbind score : 0.7811452150344849





In [27]:
import operator
ordered_Scores = sorted(deepScores, key=operator.itemgetter(1), reverse=True)
print(ordered_Scores)


[['CGGCGGTAATCTAATTAGCA', 0.9999942], ['TGGGGGTAATCCAATTAACT', 0.99998343], ['CGCCACGGCTAATCTAATTA', 0.9999815], ['ACTCTAATTAACTTAATTAA', 0.9999644], ['CGCGGTTAATTCAATTAGCA', 0.9999496], ['TGATTTAATTAGATTAGCGC', 0.9999479], ['TGTTAATTAGATTAGGGCGC', 0.9999311], ['ACCATAATTAGATTAAGCGT', 0.9999261], ['AGGCCGGGTAATTCAATTAG', 0.99990284], ['AGGCCGTAATTCGATTAGCT', 0.999895], ['TAATTAAATAATTTAATTAA', 0.99970585], ['AAATCGAATTACGGGGGCCT', 0.9982835], ['AATTATTGGATTAGCCGGCT', 0.99805963], ['AATTAGGGGGGCGCCGCGCT', 0.9708649], ['CGCCACCCGGCTCGCGGGGT', 0.79339266], ['TGAGCGGGGGGGGCCGCCCT', 0.7811452], ['GTGGGGCCCGCCGCGGCCCA', 0.7794437], ['TGCCCGCGAGCCGGGGCCCT', 0.77080476], ['CTGGTCGCTCCCTCGCGCGA', 0.7683766], ['AGACTGGGGTGAGCCGCCCT', 0.7000094]]


In [29]:
f = open("/content/LGBM_ScoreResult_Alx1.txt", 'w')
for seq, score in ordered_Scores:
  f.write(f'{seq} {score}\n')
f.close()