In [25]:
import os
import shutil

In [26]:
# file Utils
def dir_exists_abs(abs_dir_name):
    '''
    Report whether the given path points to an existing directory.
    abs_dir_name: str, path to test
    returns: bool, False for missing paths and for paths that are regular files
    '''
    is_directory = os.path.isdir(abs_dir_name)
    return is_directory

def join_dir(first, *second):
    '''
    Join a base path with any number of further path components.
    Components equal to the empty string are ignored, so they never
    add a trailing separator to the result.
    first: str, base path
    *second: str, components appended in order
    returns: str, the joined path (first itself when nothing is appended)
    '''
    path = first
    for part in (piece for piece in second if piece != ''):
        path = os.path.join(path, part)
    return path

def create_dir_current(cur_dir_name):
    '''
    Create a directory, including any missing parent directories.
    Nothing happens when the directory already exists.
    cur_dir_name: str, path of the directory to create
    '''
    os.makedirs(name=cur_dir_name, exist_ok=True)

def remove_dir_current(cur_dir_name):
    '''
    Delete a directory together with everything inside it.
    Nothing happens when the path is not an existing directory.
    cur_dir_name: str, path of the directory to delete
    '''
    if not dir_exists_abs(cur_dir_name):
        return
    # rmtree is used because the directory may still contain files
    shutil.rmtree(cur_dir_name)

In [27]:
# Folder whose entries are the model-condition directories listed by get_gts_trueST.
# The path is relative, so it is resolved against the current working directory.
dataset_path = join_dir("dataset", "asteroid_dataset", "withils")

In [28]:
# File names as they appear inside the asteroid dataset.
# Gene tree file read from <family>/gene_trees/ by get_all_GTs:
gt_file_name="true.true.geneTree.newick"
# alternative: gt_file_name = "raxml-ng.GTR+G.geneTree.newick"
# Species tree file read from <model condition>/species_trees/ by get_True_Species_Tree:
species_tree_name="speciesTree.newick"
# alternative: species_tree_name= "fastrfs-raxml-ng_greedy.GTR+G.speciesTree.newick"

In [29]:
# Output locations used by this experiment.
# Folder receiving both output files; get_gts_trueST deletes and recreates it before writing:
gt_output_folder= "output"
# File written by get_all_GTs with the processed gene tree of every family:
gt_output_file_name= "1_gt.tre"
# File written by get_True_Species_Tree with the processed species tree:
true_species_tree_name= "true-species.out.tre"

In [30]:
def remove_and_create_output_folder(dir_current_path):
    '''
    Reset a folder: delete it with its contents if it exists, then create it again.
    dir_current_path: str, path of the folder to reset
    '''
    # order matters: the removal has to run before the re-creation
    for step in (remove_dir_current, create_dir_current):
        step(dir_current_path)

In [31]:
import re

def remove_one(input_str):
    '''
    Delete every character "1" that is directly followed by a character
    which is neither a digit nor an underscore.
    A "1" at the very end of the string is kept, because the lookahead
    requires a following character.
    input_str: str
    returns: str
    '''
    return re.sub(pattern=r"1(?=[^\d_])", repl="", string=input_str)

def remove_numbers_after_colon(input_str):
    '''
    Drop every ":" together with the text following it, then apply remove_one.
    Each removed span runs from the colon up to (not including) the next
    "," or ")", or to the end of the string when neither follows; a ";"
    or newline inside such a span is removed as well.
    input_str: str
    returns: str
    '''
    without_colon_parts = re.sub(r':[^,\)]+', '', input_str)
    return remove_one(without_colon_parts)

def remove_numbers_before_and_after_colon(input_str):
    '''
    Drop every "<number>:<text>" span, where <number> is an integer or a
    decimal written directly in front of the colon.
    The removed span ends before the next "," or ")", or at the end of the
    string when neither follows. Digits that end a label directly before
    a colon count as <number> and are removed too.
    input_str: str
    returns: str
    '''
    number_colon_text = re.compile(r'((\d+\.\d+)|\d+):[^,\)]+')
    return number_colon_text.sub('', input_str)

def remove_numbers_after_colon_NO_NEED_TO_REMOVE_ONE(input_str):
    '''
    Drop every ":" together with the text following it, without the
    extra remove_one pass.
    Each removed span runs from the colon up to (not including) the next
    "," or ")", or to the end of the string when neither follows; a ";"
    or newline inside such a span is removed as well.
    input_str: str
    returns: str
    '''
    colon_and_text = re.compile(r':[^,\)]+')
    return colon_and_text.sub('', input_str)

def remove_numbers_after_decimal(input_str):
    '''
    Drop every occurrence of "0." followed by one or more digits,
    for example 0.42 or 0.92.
    The match is not anchored, so the "0.5" inside "10.5" is removed too.
    input_str: str
    returns: str
    '''
    zero_point_digits = re.compile(r'0\.\d+')
    return zero_point_digits.sub('', input_str)

In [32]:
def _read_label_mapping(mapping_file_path):
    '''
    Read a whitespace-separated two-column file into a dict
    (first column -> second column). Blank lines are skipped.
    '''
    mapping = {}
    with open(mapping_file_path) as f:
        for entry in f:
            fields = entry.split()
            if not fields:
                # a blank (e.g. trailing) line carries no mapping
                continue
            (key, val) = fields
            mapping[key] = val
    return mapping


def _relabel_tree(tree_str, mapping):
    '''
    Replace every label of tree_str that is a key of mapping by its value.
    '''
    if not mapping:
        return tree_str
    # A label is a maximal run of characters that are not Newick punctuation
    # or whitespace. Whole labels are looked up in a single pass, so a key
    # contained in a longer label cannot corrupt it and replaced text is
    # never replaced a second time.
    return re.sub(r'[^(),:;\s]+',
                  lambda match: mapping.get(match.group(0), match.group(0)),
                  tree_str)


def get_all_GTs(families_folder):
    '''
    Collect the gene tree of every family and write them to one file.

    For each sub-directory of families_folder (visited in sorted order):
      - mappings/treerecs_mapping.link is read, when present, as a
        "<label> <replacement>" table;
      - the first line of gene_trees/<gt_file_name> is read, cleaned with
        remove_numbers_after_colon, and its labels are replaced using
        that table.
    The resulting trees are written, one per line, to
    <gt_output_folder>/<gt_output_file_name>.

    families_folder: str, path of the folder holding one directory per family
    raises: FileNotFoundError when a family has no gene tree file
    '''
    all_gts = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # output file reproducible between runs.
    for family in sorted(os.listdir(families_folder)):
        family_dir = join_dir(families_folder, family)
        if not dir_exists_abs(family_dir):
            # stray files next to the family directories are not families
            continue

        mapping_file_path = join_dir(family_dir, "mappings", "treerecs_mapping.link")
        if os.path.exists(mapping_file_path):
            mapping = _read_label_mapping(mapping_file_path)
        else:
            mapping = {}
            print("mapping file not found", mapping_file_path)
        if len(mapping) == 0:
            print("mapping is empty", mapping_file_path)

        gt_tree_file_path = join_dir(family_dir, "gene_trees", gt_file_name)
        if not os.path.exists(gt_tree_file_path):
            # exit() is meant for interactive sessions; raising reports which
            # file is missing and lets the caller decide how to react.
            raise FileNotFoundError("gene tree file not found: " + gt_tree_file_path)
        with open(gt_tree_file_path) as f:
            line = f.readline()

        # strip() drops the newline kept by readline(), so the writer below
        # emits exactly one newline per tree.
        line = remove_numbers_after_colon(line).strip()
        all_gts.append(_relabel_tree(line, mapping))

    # output to a file in the output folder
    with open(join_dir(gt_output_folder, gt_output_file_name), "w") as f:
        for gt in all_gts:
            f.write(gt + "\n")

In [33]:
def get_True_Species_Tree(source_folder):
    '''
    Copy the species tree of one model condition into the output folder.
    The first line of <source_folder>/species_trees/<species_tree_name> is
    read, cleaned with remove_numbers_after_colon_NO_NEED_TO_REMOVE_ONE and
    written to <gt_output_folder>/<true_species_tree_name>.
    source_folder: str, path of the model-condition folder
    '''
    tree_source = join_dir(source_folder, "species_trees", species_tree_name)
    with open(tree_source) as tree_in:
        first_line = tree_in.readline()
    cleaned_tree = remove_numbers_after_colon_NO_NEED_TO_REMOVE_ONE(first_line)

    tree_target = gt_output_folder+"/"+true_species_tree_name
    with open(tree_target, "w") as tree_out:
        tree_out.write(cleaned_tree)


In [34]:

def get_gts_trueST(numberOfSpecies):
    '''
    Build the output files for the first model condition matching a tag.

    The directories under dataset_path are scanned in sorted order. For the
    first one whose name contains numberOfSpecies as complete "_"-separated
    field(s), the output folder is reset, the gene trees are written by
    get_all_GTs and the species tree by get_True_Species_Tree. Later matches
    are not processed.

    numberOfSpecies: str, tag such as "s25"
    '''
    # Directory names are "_"-separated fields (..._s25_f1000_sites100_...).
    # A plain substring test would let "s100" match the "sites100" field of
    # every directory, so the tag is compared against whole fields instead.
    wanted = "_" + numberOfSpecies + "_"

    for model_condition in sorted(os.listdir(dataset_path)):
        if wanted not in "_" + model_condition + "_":
            continue

        print("Model Condition --> ", model_condition)

        remove_and_create_output_folder(gt_output_folder)

        source_folder = join_dir(dataset_path, model_condition)
        families_folder = join_dir(source_folder, "families")

        # gene trees of all families -> <gt_output_folder>/<gt_output_file_name>
        get_all_GTs(families_folder)
        # species tree -> <gt_output_folder>/<true_species_tree_name>
        get_True_Species_Tree(source_folder)

        # only the first matching model condition is processed
        return

    print("No model condition found for", numberOfSpecies)

In [35]:
# Tags of the model conditions to process; each one is passed to get_gts_trueST,
# which looks for it in the directory names under dataset_path.
# Full list: MODEL_CONDITIONS = ["s25", "s50", "s75", "s100"]
MODEL_CONDITIONS = [ "s25" ]
# Every call that finds a match resets the output folder first, so after the
# loop only the files of the last processed condition remain.
for model_condition in MODEL_CONDITIONS:
    get_gts_trueST(model_condition)

Model Condition -->  ssim_veryhighmiss_s25_f1000_sites100_GTR_bl1.0_d0.0_l0.0_t0.0_gc0.0_p0.0_pop50000000_ms0.6_mf0.6_seed3000
