In [2]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
!pip install -r /content/drive/My\ Drive/mya/scripts/requirements.txt

In [28]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import *
import seaborn as sns
import wandb 
import json

# Apply seaborn's default plot styling and render matplotlib figures inline.
sns.set()
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Show the installed PyTorch version alongside the results.
print(torch.__version__)

# Work from the project root on Drive so the relative './Data/...' paths used below resolve.
path = '/content/drive/My Drive/DeepGQuad/'
os.chdir(path)
# Last expression: list the project directory contents as the cell output.
os.listdir()

1.4.0


['DNA-transformer',
 'transformer-xl.ipynb',
 'Hybr',
 'Old BERT.ipynb',
 'scripts',
 'BERT.ipynb',
 'Data']

In [7]:
# Load the tab-separated table ./Data/G4_chip_all_coords.csv and preview its first rows.
df = pd.read_csv('./Data/G4_chip_all_coords.csv', sep='\t')
df.head()

Unnamed: 0,chr,true_start,true_end,start,end,seq,len
0,chr1,713977,714310,713894,714394,TTCAGCCGGCAACACACAGAACCTGGCGGGGAGGTCACTCTTACCA...,500
1,chr1,762850,762982,762666,763166,ACCGGACACAGACGCAGATCTGGCAGCTGAGCGACAGGCTTCGGAG...,500
2,chr1,840076,840205,839891,840391,ACGTGGCCTCCTCCGAACGCGGCCGCCTCCTCCTCCGAACGCGGCC...,500
3,chr1,894635,894796,894466,894966,GGTCACGCAGGAGTCACAGCTGCCCGCACGCCCAGCTCGCCCCAGC...,500
4,chr1,935414,935699,935307,935807,CTGCCATCGGCGAGGCGCTCGGTTTCCCCGGCGTGTCTGCGGCCAT...,500


In [8]:
# Build one 0/1 mask of length `max_seq_len` per row: positions in
# [true_start, true_end), expressed relative to the window's `start`, are set to 1.
max_seq_len = 500
labels_col = []
for true_start, true_end, window_start in zip(df['true_start'], df['true_end'], df['start']):
    mask = np.zeros(max_seq_len)
    mask[true_start - window_start:true_end - window_start] = 1
    labels_col.append(mask)
df['label'] = labels_col
df.head(1)

Unnamed: 0,chr,true_start,true_end,start,end,seq,len,label
0,chr1,713977,714310,713894,714394,TTCAGCCGGCAACACACAGAACCTGGCGGGGAGGTCACTCTTACCA...,500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
# Load the shuffled data for every chromosome and attach an all-zero label mask.
# NOTE(review): `utils` is not imported anywhere in the visible notebook, so this cell
# raises NameError unless it is imported first (presumably a project-local module).
# NOTE(review): the original comment says sequences longer than 500 are dropped; that is
# not done in this cell, so presumably `utils.merge_chrom_data` handles it -- confirm.
chromosomes = list(range(1,23)) + ['M','X','Y']
shuf = utils.merge_chrom_data('./Data/shuffled_chr', chromosomes)
# One independent array per row: `[arr] * n` would alias a single array across all
# rows, so an in-place edit of one row's label would silently change every row.
shuf['label'] = [np.zeros(max_seq_len) for _ in range(shuf.shape[0])]
shuf.head()

['chr1' 'chr2' 'chr3' 'chr4' 'chr5' 'chr6' 'chr7' 'chr8' 'chr9' 'chr10'
 'chr11' 'chr12' 'chr13' 'chr14' 'chr15' 'chr16' 'chr17' 'chr18' 'chr19'
 'chr20' 'chr21' 'chr22' 'chrM' 'chrX' 'chrY']


Unnamed: 0,chr,start,end,seq,len,label
0,chr1,113133816,113134316,tgctgggattacaggtgtgagccaccacacccggccTAGTATGTTA...,500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,chr1,180867994,180868494,ctttaaaatgcctttggtcccaagtctctgacaggagggtgaggtg...,500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,chr1,95276940,95277440,gggcaaagaacccgcatcagcctgcctcatcattaaccctcaactg...,500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,chr1,40094551,40095051,gcgtgagccaccgcgcccggccCGCTGTTTTTTTCTAACATTTGCT...,500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,chr1,152882574,152883074,CAGAAAACCCAGAGCAGCAGCTTAAGCAGGAGAAAACACAAAGGGA...,500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [0]:
# Split by chromosome: chromosomes 3, 6, ..., 21 form the test pool; everything else
# except chrX / chrM / chrY forms the training pool.
test_chromosomes_list = [3,6,9,12,15,18,21]
test_chromosomes = [f'chr{n}' for n in test_chromosomes_list]
specific_chromosomes = ['chrX','chrM','chrY']

is_test = df.chr.isin(test_chromosomes)
is_excluded_from_train = df.chr.isin(test_chromosomes + specific_chromosomes)

test_data = df[is_test].reset_index(drop=True)
train_data = df[~is_excluded_from_train].reset_index(drop=True)

In [0]:
# Shuffle rows with a fixed seed so that the validation/test membership derived from
# this ordering is reproducible across kernel restarts.
SHUFFLE_SEED = 42
train_full = train_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_full = test_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [16]:
# Carve the held-out rows into validation (first 70%) and test (remaining rows).
# Note: `test_full` is rebound here, so re-running this cell alone shrinks it again.
holdout_rows = test_full.shape[0]
val_part = int(0.7*holdout_rows)
print('Validation:', val_part)
val_full = test_full.iloc[:val_part].sample(frac=1).reset_index(drop=True)
print('Test:', holdout_rows-val_part)
test_full = test_full.iloc[val_part:].sample(frac=1).reset_index(drop=True)

Validation: 1477
Test: 634


In [0]:
#test_full = test_full.sample(n=30).reset_index(drop=True)

In [0]:
def save_labeled_sequence_gquad(data, file_name,
                                out_dir="/content/drive/My Drive/DeepGQuad/Data"):
    """Dump sequences and their per-position labels to ``<out_dir>/<file_name>.json``.

    Args:
        data: DataFrame with a ``seq`` column (strings) and a ``label`` column whose
            values expose ``.tolist()`` (e.g. numpy arrays).
        file_name: Output file name without the ``.json`` extension.
        out_dir: Directory to write into. Defaults to the project's Data folder on
            Drive, which is where the original hard-coded path pointed.

    The JSON payload is a list with one ``{"primary": <upper-cased sequence>,
    "gquad": <label list>}`` object per row, in row order. The number of written
    entries is printed.
    """
    seq_labeled = [
        {"primary": seq.upper(), "gquad": label.tolist()}
        for seq, label in zip(data["seq"], data["label"])
    ]

    print("parts_num", len(seq_labeled))
    with open(os.path.join(out_dir, f"{file_name}.json"), "w") as fout:
        json.dump(seq_labeled, fout)

In [23]:
# Write each split (test, train, validation -- in that order) to its own JSON file.
for split_file, split_frame in (
    ('readydata_test', test_full),
    ('readydata_train', train_full),
    ('readydata_valid', val_full),
):
    save_labeled_sequence_gquad(split_frame, split_file)

parts_num 634


In [11]:
# Parse every record of the chr1 FASTA file into an in-memory list.
sequences = list(SeqIO.parse('./Data/chr1.fa', "fasta"))

In [12]:
# Read the headerless, tab-separated chr1 BED file and name its columns.
# NOTE(review): this rebinds `df`, replacing the table loaded from G4_chip_all_coords.csv.
columns = ['chr','start','end','seq']
df = pd.read_csv(f'./Data/chr1.bed',sep='\t',header=None)
df.columns = columns
df.head(5)

Unnamed: 0,chr,start,end,seq
0,chr1,713977,714310,ACCATGGCGCCCCAGTGATGTAGCCGAACACCCGCGCCTCTAACGT...
1,chr1,762850,762982,GGAGGGCACTCACCCGAGCGGACCTTGGCTCCGGATAATCCGTTTC...
2,chr1,840076,840205,gccgcctccgaacgtggccgccgcctcctccgaacgtggccgcttc...
3,chr1,894635,894796,CGTGCACCCCACTTCCGGCCCCAGAATGCCGCGCGGCTGCGCACTT...
4,chr1,935414,935699,GCGGGCGAGCGGCGAGCGCGCGGCGATCCGAGCCCCTAGGGCGGAT...


In [13]:
# Show the records whose start coordinate lies beyond 2,030,000.
df[df['start'] > 2030000]

Unnamed: 0,chr,start,end,seq
47,chr1,2064636,2065136,CCAGCTTCGACCCCCAGCTGTGCGTCAGTCCCTCAGCTCCGCCCCC...
48,chr1,2111846,2112000,TGCACGGACGACGTAGACACACGGATGACTCATCCACAGATGACTc...
49,chr1,2120954,2121332,AAACCAAACGTCAGAAAGAAAAGCGGCAGACGTTTCATCACAACAC...
50,chr1,2121351,2121366,GGCCACCCCAGGGGC
51,chr1,2126093,2126378,ccggccgcgcccccgcccggctcccggcctcacccgcccgccgggc...
...,...,...,...,...
1064,chr1,248020879,248021031,AGGCGGCGCTGTGCTGGGTGTGCGACGCCGGCCCCGAGCACAGGAC...
1065,chr1,249132199,249132534,GCTGGAAGACTAGACGGTGGACCGCCTAGCTCCCGGGACTCCGCGG...
1066,chr1,249153182,249153428,GGCCCGGAGCTGCTGGAAGACAGGGGCCCACCTCGCGCGCGCAGCG...
1067,chr1,249200233,249200467,GGAGGGTAAGGGAGAGTGGCGGGAAGAGCTGGCCCGCTGGGGTGGG...


In [14]:
# Save sequences divided in parts
def save_record(start, end, file_name, part_len=500):
    """Write the region ``[start, end)`` of the first loaded sequence as FASTA chunks.

    The region is cut into consecutive pieces of ``part_len`` bases (the last piece
    is shorter when the region length is not a multiple of ``part_len``), each piece
    is upper-cased and stored as one record in ``./Data/<file_name>.fa``.

    Args:
        start: Inclusive start coordinate within ``sequences[0].seq``.
        end: Exclusive end coordinate within ``sequences[0].seq``.
        file_name: Output file name without the ``.fa`` extension.
        part_len: Chunk length in bases (default 500, the original hard-coded value).

    Each record id is the chunk's end offset relative to ``start`` ("500", "1000", ...).
    """
    print(f'{start}-{end}')
    region_source = sequences[0].seq
    seqs = []
    # Offsets are applied relative to `start`; the previous implementation sliced from
    # position 0, so every output file contained the beginning of the sequence instead
    # of the requested region.
    for part_start in range(start, end, part_len):
        part_end = min(part_start + part_len, end)
        part = region_source[part_start:part_end].upper()
        seqs.append(SeqRecord(part, str(part_end - start), '', ''))

    SeqIO.write(seqs, f"./Data/{file_name}.fa", "fasta")

# Lay out three consecutive chr1 regions starting at 2,030,000: validation takes the
# first 10% of `seq_len`, test the next 20%, and training the remainder.
seq_len = 700000 #len(sequences[0].seq) - 20000
fractions = [0.1,0.2]
idxs=[2030000]
for frac in fractions:
    idxs.append(idxs[-1] + int(seq_len*frac))

valid_start, test_start, train_start = idxs
save_record(valid_start, test_start, 'chr1_valid')
save_record(test_start, train_start, 'chr1_test')
save_record(train_start, valid_start + seq_len, 'chr1_train')

2030000-2100000
2100000-2240000
2240000-2730000


In [0]:
#os.chdir('/content/drive/My Drive/DeepGQuad/Data')

In [6]:
# Authenticate this runtime with Weights & Biases.
!wandb login

In [0]:
# Select the W&B project through the WANDB_PROJECT environment variable
# (the commented-out line is the alternative 'gquad' project).
#os.environ['WANDB_PROJECT'] = 'gquad'
os.environ['WANDB_PROJECT'] = 'gquad_labeled'

In [0]:
#os.chdir('/content/drive/My Drive/DeepGQuad/results')

In [0]:
#!git clone https://github.com/NVIDIA/apex
#%cd apex
#!pip install -e "apex" --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext"
#!pip install -v --no-cache-dir ./
#os.chdir('..')

In [0]:
# Train: run train_bert.py for the `transformer` model on the `g_quadruplex` task,
# loading the run directory given to --from_pretrained and resuming from its checkpoint.
!python /content/drive/My\ Drive/DeepGQuad/scripts/train_bert.py transformer g_quadruplex --data_dir '/content/drive/My Drive/DeepGQuad/Data' --tokenizer dna  --num_train_epochs 30 --batch_size 32 --learning_rate 1e-5 --patience  2  --save_freq 1 --eval_freq 1 --warmup_steps 1000 --gradient_accumulation_steps 4 --from_pretrained /content/drive/My\ Drive/DeepGQuad/results/g_quadruplex_transformer_20-05-30-15-06-50_711714 --resume_from_checkpoint

20/05/30 15:25:35 - INFO - tape.models.modeling_utils -   loading configuration file /content/drive/My Drive/mya/tape/results/g_quadruplex_transformer_20-05-30-15-06-50_711714/config.json
20/05/30 15:25:35 - INFO - tape.models.modeling_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 8096,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30
}

20/05/30 15:25:35 - INFO - tape.models.modeling_utils -   loading weights file /content/drive/My Drive/mya/tape/results/g_quadruplex_transformer_20-05-30-15-06-50_711714/pytorch_model.bin
[34m[1mwandb[0m: Tracking run with wandb version 0.8.36
[34m[1mwandb[0m: Ru

In [7]:
#results/dna_masked_language_modeling_transformer_20-05-20-18-58-27_610560  results/g_quadruplex_transformer_20-05-20-19-17-20_872492

In [0]:
# Release cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [34]:
# Report the GPU model, driver version and current memory usage.
!nvidia-smi

Mon Jun  1 15:00:03 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# Evaluation


In [0]:
# transformer
# accuracy: 0.205 40 epochs  
#0.462 80 epochs 
#0.436 99 epochs
# resnet
# 0.304 80 epochs

In [8]:
#/content/drive/My Drive/DeepGQuad/results/g_quadruplex_transformer_20-05-21-14-00-49_080144

##### best

* b12 gr_step 1 
accuracy: 0.6998666666666666 iou: 50.92129256577632 recall: 0.3935121817684095 false_neg: 4431

* b16 gr_step 2 
accuracy: 0.7130666666666666 iou: 53.8418987949087 recall: 0.9753627155762387 false_neg: 180

* g_quadruplex_transformer_20-05-30-14-21-32_585376
b32 gr_step 4 lr 1e-5
accuracy: 0.815 iou: 68.4089288320499 recall: 0.7054475773336983 false_neg: 2152

* 20-05-30-15-06-50_711714
pretrained
accuracy: 0.8169333333333333 iou: 68.87577593376315 recall: 0.7471940870517383 false_neg: 1847


In [0]:
# Evaluate the 20-05-30-14-21-32 run with accuracy, IoU, recall and false-negative metrics.
# NOTE(review): --data_dir here is '/content/drive/My Drive/Data', whereas the training
# cell uses '/content/drive/My Drive/DeepGQuad/Data' -- confirm this path is intended.
!python /content/drive/My\ Drive/DeepGQuad/scripts/eval_bert.py transformer g_quadruplex /content/drive/My\ Drive/DeepGQuad/results/g_quadruplex_transformer_20-05-30-14-21-32_585376 --metrics accuracy iou recall false_neg --tokenizer dna --data_dir '/content/drive/My Drive/Data'

In [0]:
#!python eval_bert.py transformer dna_masked_language_modeling results/dna_masked_language_modeling_transformer_20-05-27-18-58-27_610560 --metrics accuracy  --tokenizer dna --data_dir './Data'

In [0]:
# Evaluate the 20-05-27-19-17-20 run stored under the 'mya/tape/results' tree, using the
# same metrics and the data directory under 'mya/DNA-transformer/data'.
!python /content/drive/My\ Drive/DeepGQuad/scripts/eval_bert.py transformer g_quadruplex /content/drive/My\ Drive/mya/tape/results/g_quadruplex_transformer_20-05-27-19-17-20_872492 --metrics accuracy iou recall false_neg --tokenizer dna --data_dir '/content/drive/My Drive/mya/DNA-transformer/data'

20/05/30 13:14:08 - INFO - tape.training -   device: cpu n_gpu: 1
20/05/30 13:14:08 - INFO - tape.models.modeling_utils -   loading configuration file /content/drive/My Drive/mya/tape/results/g_quadruplex_transformer_20-05-27-19-17-20_872492/config.json
20/05/30 13:14:08 - INFO - tape.models.modeling_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 8096,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30
}

20/05/30 13:14:08 - INFO - tape.models.modeling_utils -   loading weights file /content/drive/My Drive/mya/tape/results/g_quadruplex_transformer_20-05-27-19-17-20_872492/pytorch_model.bin
Evaluation: 100