In [67]:
import os
import re
from shutil import copyfile
import h5py as h5
import numpy as np
import pandas as pd

In [68]:
# --- Input locations --------------------------------------------------------
# Directory holding the sampled HDF5 files the test set is drawn from.
data_path = '/scratch/sdodl001/ENCODE/sampled_data/GM12878_2/test_input_ss/'
files = os.listdir(data_path)

# Where the renamed test set is written, and the parent directory under which
# one "<model>_input/" copy per model is created.
src_test_data_path = '/scratch/sdodl001/ENCODE/sampled_data/GM12878_2/dna_GM12878_1_ss_input/'
test_data_path = '/scratch/sdodl001/ENCODE/sampled_data/GM12878_2/'

# --- Job scripts, models and evaluation output ------------------------------
sh_file_path = '/scratch/sdodl001/ENCODE/scripts/GM12878_ss_eval_job_files'

model_path = '/scratch/sdodl001/ENCODE/models/GM12878/'
folders = sorted(os.listdir(model_path))

out_path = '/scratch/sdodl001/ENCODE/eval/GM12878/'

In [69]:
# Sanity check: number of directory entries found in data_path.
len(files)

260

In [70]:
def move_files(src_path, dest_path):
    """Copy every entry of ``src_path`` into ``dest_path`` under the same name.

    Despite the name, the sources are copied with ``shutil.copyfile`` and are
    left in place; nothing is deleted from ``src_path``.

    Parameters
    ----------
    src_path : str
        Directory to copy from. A trailing path separator is optional.
    dest_path : str
        Existing directory to copy into. A trailing path separator is optional.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` / ``copyfile`` (e.g. a missing
        directory, or an entry of ``src_path`` that is itself a directory).
    """
    for name in os.listdir(src_path):
        # os.path.join instead of "+" so a directory given without a trailing
        # separator is not silently glued to the file name.
        copyfile(os.path.join(src_path, name), os.path.join(dest_path, name))
    print(f'files moved from {src_path} to {dest_path}')

In [71]:
def make_dir(dir_path):
    """Create ``dir_path`` (including parents) unless the path already exists.

    Prints a one-line notice when a directory is about to be created and does
    nothing at all when something already exists at ``dir_path``.
    """
    if os.path.exists(dir_path):
        return
    print(f'Created {dir_path} directory')
    os.makedirs(dir_path)

In [72]:

def regex_match(start_with, end_with, folder):
    """Return True when ``folder`` matches the regex ``start_with + end_with``.

    The two pattern fragments are concatenated verbatim and handed to
    ``re.search``; any anchoring (``^`` / ``$``) must be part of the fragments.
    """
    pattern = start_with + end_with
    return re.search(pattern, folder) is not None


In [75]:
def create_testset(data_path, src_test_data_path,
                   chroms=('c2', 'c4', 'c6', 'c8', 'c10', 'c12'),
                   src_sample='GM12878_WGBS_1_sample_0',
                   dest_sample='GM12878_WGBS_1_ss'):
    """Copy selected HDF5 files into a new directory, renaming the CpG sample.

    For every file in ``data_path`` whose name starts with one of ``chroms``,
    the datasets ``chromo``, ``pos``, ``inputs/dna``,
    ``inputs/cpg/<src_sample>/{dist,state}`` and ``outputs/cpg/<src_sample>``
    are read and written to a file of the same name in ``src_test_data_path``,
    with ``<src_sample>`` replaced by ``<dest_sample>`` in the dataset paths.

    Parameters
    ----------
    data_path : str
        Directory containing the source ``.h5`` files.
    src_test_data_path : str
        Directory that receives the rewritten files (created if missing).
    chroms : sequence of str, optional
        File-name prefixes to keep. The prefix is the first three characters
        of the name, or the first two when the third character is ``'_'``.
    src_sample, dest_sample : str, optional
        Sample name used in the source files and the name written to the
        output files.

    Notes
    -----
    Output files are opened with mode ``"w"``, so an existing file of the same
    name is replaced. This makes the function safe to re-run; with append mode
    a second run fails because the groups already exist.
    """
    files = os.listdir(data_path)
    make_dir(src_test_data_path)
    for file in files:
        start = file[:3]
        # Slice instead of index so names shorter than three characters do
        # not raise IndexError.
        if start[2:3] == '_':
            start = start[:2]
        if start not in chroms:
            continue

        # Load everything into memory first so the input handle is closed
        # before the output file is opened.
        with h5.File(os.path.join(data_path, file), 'r') as f_in:
            chromo = f_in['chromo'][:]
            pos = f_in['pos'][:]
            dist = f_in[f'inputs/cpg/{src_sample}/dist'][:]
            state = f_in[f'inputs/cpg/{src_sample}/state'][:]
            dna = f_in['inputs/dna'][:]
            outputs = f_in[f'outputs/cpg/{src_sample}'][:]

        with h5.File(os.path.join(src_test_data_path, file), 'w') as f_out:
            inputs_grp = f_out.create_group('inputs')
            outputs_grp = f_out.create_group('outputs')

            f_out['chromo'] = chromo
            inputs_grp[f'cpg/{dest_sample}/dist'] = dist
            inputs_grp[f'cpg/{dest_sample}/state'] = state
            inputs_grp['dna'] = dna
            outputs_grp[f'cpg/{dest_sample}'] = outputs
            f_out['pos'] = pos
    print('test dataset created and saved to: ', src_test_data_path)

In [76]:
# Write the renamed copies of the selected files from data_path into
# src_test_data_path.
create_testset(data_path, src_test_data_path)

test dataset created and saved to:  /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/dna_GM12878_1_ss_input/


In [47]:
def sh_file_generator_eval(sh_file_path, model_path, test_data_path, src_test_data_path, out_path):
    """Write one SLURM evaluation script per matching model folder.

    For every entry of ``model_path`` whose name matches ``^dna.*_ss$``:

    1. ``<test_data_path>/<folder>_input/`` is created and filled with a copy
       of the files in ``src_test_data_path``;
    2. ``<out_path>/<folder>_data`` and ``<out_path>/<folder>_report`` are
       created;
    3. ``<sh_file_path>/<folder>_eval.sh`` is written, containing the SBATCH
       header and a single ``python -u dcpg_eval.py ...`` command.

    Parameters
    ----------
    sh_file_path : str
        Directory that receives the generated ``.sh`` files (created if missing).
    model_path : str
        Directory whose sub-entries are the candidate model folders.
    test_data_path : str
        Parent directory for the per-model ``<folder>_input/`` copies.
    src_test_data_path : str
        Directory holding the test files to copy for each model.
    out_path : str
        Parent directory for the per-model data/report output folders.
    """
    header_lines = ["#!/bin/bash -l",
                    "#SBATCH --ntasks=1",
                    "#SBATCH -o ./err_log/eval_%j.out",
                    "#SBATCH -e ./err_log/eval_%j.err"]

    make_dir(sh_file_path)
    for folder in sorted(os.listdir(model_path)):
        if not regex_match('^dna.', '*_ss$', folder):
            continue

        # The empty last component keeps a trailing separator on the path, so
        # helpers that append a file name by plain concatenation still work.
        test_input_path = os.path.join(test_data_path, folder + '_input', '')
        make_dir(test_input_path)
        move_files(src_test_data_path, test_input_path)

        out_data_dir = os.path.join(out_path, folder + '_data')
        make_dir(out_data_dir)
        out_report_dir = os.path.join(out_path, folder + '_report')
        make_dir(out_report_dir)

        # The brace/star glob is deliberately left unquoted: the script runs
        # under bash (see the shebang), which expands it to the input files.
        command = ' '.join([
            'python -u dcpg_eval.py',
            os.path.join(test_input_path, 'c{2,4,6,8,10,12}_*.h5'),
            '--model_files', os.path.join(model_path, folder),
            '--out_data', os.path.join(out_data_dir, 'data.h5'),
            '--out_report', os.path.join(out_report_dir, 'report.tsv'),
        ])

        sh_file_name = folder + '_eval.sh'
        # Header, two blank lines, then the command; the context manager
        # closes the file even if the write fails.
        with open(os.path.join(sh_file_path, sh_file_name), "w") as script:
            script.write("\n".join(header_lines) + "\n\n\n" + command + "\n")
        print(f'sh file {sh_file_name} is saved to {sh_file_path}')

    print('Done !')

In [48]:
# Generate one evaluation job script per matching model folder.
# NOTE(review): the output recorded below reports .../test_input_ss/ as the copy
# source, while the configuration cell sets src_test_data_path to
# .../dna_GM12878_1_ss_input/. This cell's execution count (48) is also lower
# than the configuration cell's (68), so the output is stale; re-run after a
# kernel restart to refresh it.
sh_file_generator_eval(sh_file_path, model_path, test_data_path, src_test_data_path, out_path)

Created /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/dna_0.1_frac_ss_input/ directory
files moved from /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/test_input_ss/ to /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/dna_0.1_frac_ss_input/
sh file dna_0.1_frac_ss_eval.sh is saved to /scratch/sdodl001/ENCODE/scripts/GM12878_ss_eval_job_files
Created /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/dna_0.2_frac_ss_input/ directory
files moved from /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/test_input_ss/ to /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/dna_0.2_frac_ss_input/
sh file dna_0.2_frac_ss_eval.sh is saved to /scratch/sdodl001/ENCODE/scripts/GM12878_ss_eval_job_files
Created /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/dna_0.3_frac_ss_input/ directory
files moved from /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/test_input_ss/ to /scratch/sdodl001/ENCODE/sampled_data/GM12878_2/dna_0.3_frac_ss_input/
sh file dna_0.3_frac_ss_eval.sh is saved to /scratch/sdodl001/ENC