# Comparison of original CNN and new PyTorch CNN
  The PyTorch CNN is trained with a cosine-annealing-with-warm-restarts learning-rate scheduler.
  Experiment design:
  <pre>
                                   ┌> PT CNN ─Training─> Trained PT CNN ─────┬────> Stats 
                                   |                                         |
  DATASET (70 seqs of 10 families) |                    TEST DATASET (67 seqs of 10 families)                 
                                   |                                         |
                                   └> Ch CNN ─Training─> Trained Ch CNN ─────┴────> Stats
  </pre>

In [4]:
import sys
import os
# Make the project sources importable: both <parent of cwd>/src and
# <parent of cwd>/src/original_cnn are appended to sys.path, in that order.
_parent_parts = os.getcwd().split(os.sep)[:-1]
for _subdir_parts in (["src"], ["src", "original_cnn"]):
    sys.path.append(os.sep.join(_parent_parts + _subdir_parts))

In [5]:
import RunNN  # our cnn
from DataProcessing import AlignmentFilePrepare  # dataset class

from torch.utils.data import DataLoader

import json
from subprocess import Popen, PIPE


# IMPORTANT CONSTANTS (shared by every training / evaluation cell below)
# Batch size: passed to RunNN.main, to the test DataLoader and, via "-b",
# to the original CNN's training script.
BATCH_SIZE = 128
# Number of epochs: passed to RunNN.main and, via "-e", to the original CNN;
# it also selects which "model_epoch-N" checkpoint is loaded for the test run.
EPOCH = 25
# Forwarded to RunNN.main as its `cpu` argument.
CPU_ONLY = False

## Prepare datasets
Please stand by: the next cell runs `prepareData.py` on the training and test FASTA files.

In [3]:
def prepare_dataset(fasta_path, output_dir, threads=3):
    """Run the original CNN's ``prepareData.py`` on one FASTA file.

    The script is started as a child process and this function blocks until
    it exits, then prints its return code.

    Args:
        fasta_path: value passed to the script's ``-i`` option (a FASTA file).
        output_dir: value passed to the script's ``-o`` option; the later
            cells read ``ncRNApair_data.npy``, ``ncRNApair_labe.npy`` and
            ``genelabel.txt`` from this directory.
        threads: value passed to the script's ``-t`` option (adjust threads).

    Returns:
        The exit code of the child process.
    """
    cmd = [
        "python",
        "../src/original_cnn/prepareData.py",
        "-i", fasta_path,
        "-o", output_dir,
        "-t", str(threads),
    ]
    with Popen(cmd) as proc:
        return_code = proc.wait()
    print("Return code is {}".format(return_code))
    return return_code


# Prepare the training ds first, then the test ds.
for _fasta_path, _output_dir in (
    ("../data/train.fasta", "../data/train"),
    ("../data/test.fasta", "../data/test"),
):
    prepare_dataset(_fasta_path, _output_dir)
    

Return code is 0
Return code is 0


## Train and test RunNN
### DAFS with alignments

In [4]:
# Train the PyTorch CNN on the prepared training set (structure=False).
model = RunNN.main(
    dataset="../data/train/ncRNApair_data.npy",
    label="../data/train/ncRNApair_labe.npy",
    genelabel="../data/train/genelabel.txt",
    batchsize=BATCH_SIZE,
    epoch=EPOCH,
    vpart=0.1,
    cpu=CPU_ONLY,
    structure=False,
    predictor=""  # empty, if you want to train model
)
# Evaluate the trained model on the test set. The model is evaluated exactly
# once and the metrics are kept (the earlier version ran predict twice and
# threw the first result away).
dataset = AlignmentFilePrepare("../data/test/ncRNApair_data.npy", "../data/test/ncRNApair_labe.npy", "../data/test/genelabel.txt")
dl = DataLoader(dataset, num_workers=3, shuffle=True, batch_size=BATCH_SIZE)
dafs_pt_cnn_results = RunNN.predict(model, dl)
# Load the stats file from the working directory.
# NOTE(review): stats.json is presumably written by RunNN - confirm which call produces it.
with open("stats.json") as fin:
    dafs_pt_cnn_history = json.load(fin)
# clean space
del model
# Last expression of the cell, so the notebook displays the metrics.
dafs_pt_cnn_results

Dataset splitted. Classes distributed: chi^2 p-val=1.0 (1 means equally distributed classes in test and validation)
Data loaded:
	Train: 4346 alignments
	Validation: 484 alignments
Negatives in training set: 91.26%. In val set: 91.74%
Data is loaded. Shape: torch.Size([1, 1200, 16])
Epoch [1/25], Loss (train/val): 0.1675/0.0378, Accuracy (train/val): 93.0607%/99.5868%, Precision (val): 0.9955, Current lr: 0.00050. Time elapsed: 6.15
Epoch [2/25], Loss (train/val): 0.0207/0.0080, Accuracy (train/val): 99.2647%/99.5868%, Precision (val): 1.0000, Current lr: 0.00050. Time elapsed: 9.51
Epoch [3/25], Loss (train/val): 0.0012/0.0004, Accuracy (train/val): 99.9885%/100.0000%, Precision (val): 1.0000, Current lr: 0.00050. Time elapsed: 12.68
Epoch [4/25], Loss (train/val): 0.0003/0.0001, Accuracy (train/val): 100.0000%/100.0000%, Precision (val): 1.0000, Current lr: 0.00050. Time elapsed: 15.87
Epoch [5/25], Loss (train/val): 0.0001/0.0001, Accuracy (train/val): 100.0000%/100.0000%, Precision

### DAFS structure-only

In [5]:
# Train the PyTorch CNN on the prepared training set (structure=True).
model = RunNN.main(
    dataset="../data/train/ncRNApair_data.npy",
    label="../data/train/ncRNApair_labe.npy",
    genelabel="../data/train/genelabel.txt",
    batchsize=BATCH_SIZE,
    epoch=EPOCH,
    vpart=0.1,
    cpu=CPU_ONLY,
    structure=True,
    predictor=""  # empty, if you want to train model
)
# Evaluate the trained model on the test set. The model is evaluated exactly
# once and the metrics are kept (the earlier version ran predict twice and
# threw the first result away).
dataset = AlignmentFilePrepare("../data/test/ncRNApair_data.npy", "../data/test/ncRNApair_labe.npy", "../data/test/genelabel.txt", structure=True)
dl = DataLoader(dataset, num_workers=3, shuffle=True, batch_size=BATCH_SIZE)
struct_pt_cnn_results = RunNN.predict(model, dl)
# Load the stats file from the working directory.
# NOTE(review): stats.json is presumably written by RunNN - confirm which call produces it.
with open("stats.json") as fin:
    struct_pt_cnn_history = json.load(fin)
# clean space
del model
# Must be the last expression of the cell: a trailing `del` statement would
# stop the notebook from displaying the metrics.
struct_pt_cnn_results

Dataset splitted. Classes distributed: chi^2 p-val=1.0 (1 means equally distributed classes in test and validation)
Data loaded:
	Train: 4346 alignments
	Validation: 484 alignments
Negatives in training set: 91.26%. In val set: 91.74%
Data is loaded. Shape: torch.Size([1, 1200, 6])
Epoch [1/25], Loss (train/val): 0.1635/0.0839, Accuracy (train/val): 93.3134%/96.6942%, Precision (val): 0.9712, Current lr: 0.00050. Time elapsed: 2.68
Epoch [2/25], Loss (train/val): 0.0463/0.0238, Accuracy (train/val): 98.5524%/99.7934%, Precision (val): 0.9977, Current lr: 0.00050. Time elapsed: 5.36
Epoch [3/25], Loss (train/val): 0.0045/0.0111, Accuracy (train/val): 99.9311%/99.7934%, Precision (val): 0.9977, Current lr: 0.00050. Time elapsed: 8.01
Epoch [4/25], Loss (train/val): 0.0006/0.0051, Accuracy (train/val): 100.0000%/100.0000%, Precision (val): 1.0000, Current lr: 0.00050. Time elapsed: 10.65
Epoch [5/25], Loss (train/val): 0.0003/0.0053, Accuracy (train/val): 100.0000%/99.7934%, Precision (va

In [6]:
# Test-set metrics of the structure-only PyTorch CNN.
struct_pt_cnn_results

{'TP': 1860,
 'FP': 31,
 'FN': 31,
 'TN': 1718,
 'ACC': 0.9829670329670329,
 'PREC': 0.9836065573770492,
 'RECALL': 0.9836065573770492,
 'F1': 0.9836065573770492}

In [7]:
# Test-set metrics of the PyTorch CNN trained with structure=False.
dafs_pt_cnn_results

{'TP': 1874,
 'FP': 17,
 'FN': 17,
 'TN': 1723,
 'ACC': 0.9906361883778574,
 'PREC': 0.9910100475938657,
 'RECALL': 0.9910100475938657,
 'F1': 0.9910100475938657}

## Train and test original CNN
### Train

In [3]:
# run train
# please, stand by
cmd = [
    "python",
    "../src/original_cnn/RNApairClassify.py",
    "-d", "../data/train/ncRNApair_data.npy",
    "-l", "../data/train/ncRNApair_labe.npy",
    "-gl", "../data/train/genelabel.txt",
    "-b", str(BATCH_SIZE),
    "-e", str(EPOCH),
    "-v", "6",  # the max value, which does not fail
    "-g", "0",  # adjust the GPU id
    "-o", "../data/results_train",
]
with Popen(cmd, stdout=PIPE, stderr=PIPE) as proc:
    # communicate() drains stdout/stderr while waiting for the child to exit.
    # Calling wait() first can deadlock: with both streams piped, the child
    # blocks as soon as it fills an OS pipe buffer nobody is reading.
    _, stderr_data = proc.communicate()
    print("Return code is {}".format(proc.returncode))
    print("Errors: {}".format(stderr_data.decode("utf-8")))

Return code is 0
Errors: 


### Test

In [9]:
# Run the original CNN on the test set, loading the checkpoint saved by the
# training run after the final epoch.
# NOTE(review): "validation5" is hard-coded; presumably it is one of the
# folds produced by the training run's "-v 6" - confirm it is the intended one.
cmd = [
    "python",
    "../src/original_cnn/RNApairClassify.py",
    "-d", "../data/test/ncRNApair_data.npy",
    "-l", "../data/test/ncRNApair_labe.npy",
    "-gl", "../data/test/genelabel.txt",
    "-g", "0",  # adjust the GPU id
    "-v", "1",  # use all data for test
    "-p", "../data/results_train/validation5/model_epoch-{}".format(EPOCH),
    "-o", "../data/results_test",
]
with Popen(cmd, stdout=PIPE, stderr=PIPE) as proc:
    # communicate() drains stdout/stderr while waiting for the child to exit.
    # Calling wait() first can deadlock: with both streams piped, the child
    # blocks as soon as it fills an OS pipe buffer nobody is reading.
    _, stderr_data = proc.communicate()
    print("Return code is {}".format(proc.returncode))
    print("Errors: {}".format(stderr_data.decode("utf-8")))

Return code is 0
Errors: 
