# This notebook runs all the scripts needed to train a convolutional autoencoder (CAE) on DNase data

In [29]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to imported modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, below the cell that produced them.
%matplotlib inline

import json
import os
import sys

# Put the parent directory on the import path (once) so that modules located
# one level above this notebook can be imported by the cells below.
module_path = os.path.abspath('..')
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

# Base directory handed to every pipeline step: one level up from here
base = '..'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load settings

In [60]:
def _read_json(path):
    """Open the file at `path` for reading and return its parsed JSON content."""
    with open(path, "r") as handle:
        return json.load(handle)


# Settings object that is passed to every pipeline step below
settings = _read_json("../settings-grch38-dnase-12kb-testing.json")

# Dataset definitions; the top-level keys serve as the dataset IDs
datasets = _read_json("../datasets-grch38-dnase-testing.json")
dataset_ids = list(datasets)

# Search definition that is handed to `jobs` further down
search = _read_json("../cnn-search.json")

### 1. Find DNase datasets on ENCODE

In [15]:
from find import find

# Run the dataset search step. With `clear=False` an already existing result
# file is not overwritten (see the message in the recorded cell output).
find(
    settings,
    base=base,
    clear=False,
    verbose=True,
)

File already exists. Use `--clear` to overwrite it.


### 2. Download the datasets

In [39]:
from download import download

# Run the download step for the loaded `datasets`, using the shared settings
# and base directory; `clear=False` and `verbose=True` are passed through.
download(
    datasets,
    settings,
    base=base,
    clear=False,
    verbose=True,
)

HBox(children=(IntProgress(value=0, description='Dataset', max=2, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Sample', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Data type', max=3, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Sample', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Data type', max=3, style=ProgressStyle(description_width='ini…

### 3. Prepare the downloaded data

In [67]:
from prepare import prepare

# Run the preparation step for the 'DNase' data of the loaded `datasets`.
# Note that `clear=True` is passed here, unlike in the find/download cells.
prepare(
    'DNase',
    datasets,
    settings,
    base=base,
    clear=True,
    verbose=False,
)

HBox(children=(IntProgress(value=0, description='Datasets', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Samples', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Samples', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

### 4. Create jobs for training various CAE models

In [68]:
from jobs import jobs

# Keyword options forwarded to `jobs`, kept in the original order
job_options = dict(
    epochs=25,
    batch_size=32,
    peak_weight=1,
    base=base,
    clear=True,
    verbose=False,
)

# Create the training jobs from the search definition and the settings
# (the recorded run reports slurm files for 218 neural networks).
jobs(search, settings, **job_options)

HBox(children=(IntProgress(value=0, description='Jobs', max=218, style=ProgressStyle(description_width='initia…

Created slurm files for training 218 neural networks


### 5. Train CAEs

In [70]:
from train import train

# Read the JSON definition of the one CAE that is trained in this cell
definition_path = "../slurm/cf-128--ck-11--du-512-128--do-0-0-0--e-10--rl-0--o-adam--lr-0.01--lrd-0.0--l-bce--m-.json"
with open(definition_path, "r") as f:
    definition = json.load(f)

# Training parameters for this run
epochs, batch_size = 2, 32
peak_weight = 1 # Is not used right now

# Keyword options forwarded to `train`, kept in the original order.
# `clear=False` is passed, so an existing encoder/decoder is not overwritten
# (see the message in the recorded cell output).
training_options = dict(
    epochs=epochs,
    batch_size=batch_size,
    peak_weight=peak_weight,
    base=base,
    clear=False,
)

# Start the training on all loaded dataset IDs
train(definition, settings, dataset_ids, **training_options)

Encoder/decoder already exists. Use `--clear` to overwrite it.


### 6. Evaluate CAEs