# Introduction

The COMPAS core code produces a collection of CSV files.
We convert them into a single h5 file, which is more compact and easier to upload and handle.


The core script is csv_to_h5.py, placed in the H5 folder of the postprocessing,
and it should be self-explanatory (:fingers_crossed:)

This notebook is a reminder of how the code was constructed and tested, mostly for developers.

This notebook is for combining the COMPAS data into a single h5 file
with some additional options. It is mainly kept as a reminder and for testing purposes.


Here we show how the code is constructed.
Note that this is the combined effort of many group members, hence no single
person can claim authorship.



In [1]:
import h5py  as h5  #for reading and writing h5 format
import numpy as np  #for handling arrays
import sys          #for handling paths and more
import os           #for directory walking
import subprocess as sp #for executing terminal command from python

# The output from COMPAS

These are the outputs in a single folder from a COMPAS run.
The files are human-readable csv files and are separated into different topics.
Here I list what we currently have

In [2]:
# Names COMPAS gives to its output files, keyed by group number

fileNames = {
    1: 'Compas_Log_BSE_Common_Envelopes.csv',
    2: 'Compas_Log_BSE_Double_Compact_Objects.csv',
    3: 'Compas_Log_BSE_RLOF.csv',
    4: 'Compas_Log_BSE_Supernovae.csv',
    5: 'Compas_Log_BSE_System_Parameters.csv',
    6: 'errorfile',
    7: 'output',
}

# Names of the groups inside the H5 file, keyed by the same group number

groupNames = {
    1: 'CommonEnvelopes',
    2: 'DoubleCompactObjects',
    3: 'RLOF',
    4: 'Supernovae',
    5: 'SystemParameters',
    6: 'errorFile',
    7: 'output',
}

# Combining folders

In the case of a large simulation we run it on multiple cores.
Each core runs in its own subfolder of a parent folder of the simulation.
Before we write an h5 file, we want to combine all the files.

The main idea is to use a directory walker and write everything to a single csv file.
The main trickiness is to write the header only once.

In [3]:
def combineOutputsOfFile(baseDirectoryData='.', groups=(1,2,3,4)):
    """
    Combine the outputs of all subfolders into one file per group.

    For a simulation in folder baseDirectoryData, and for every group
    number in groups (filename = fileNames[group]):
    1 - write a new file baseDirectoryData/Combine_'filename'
    2 - go through baseDirectoryData and all its subfolders and find
        the filename
    3 - Of the first file with filename, copy the header (spaces
        removed, commas replaced by tabs) and write the data
        (commas replaced by tabs) to Combine_'filename'
    4 - Of other files with filename, skip the header and copy the data
    5 - close the newly written file

    input:
    baseDirectoryData = string, folder of the simulation
    groups            = iterable with keys of fileNames to combine

    raises ValueError when a header line or a data line does not have
    the same number of columns as the first header line

    COMPAS has a strict header format which is assumed here

    1st line = type data (INT, FLOAT BOOL etc)
    2nd line = unit data (Msol, ergs,Tsol etc)
    3th line = column name
    """
    nHeaders = 3
    for group in groups:
        filename     = fileNames[group]
        combinedPath = os.path.join(baseDirectoryData, 'Combine_'+filename)
        #boolean to see if we have written the header
        headersWritten = False
        #check each line if nr of entries is the same
        #if not there is something wrong
        nColumnCheck   = None

        #1 and 5 - the with-statement also closes the file when a check raises
        with open(combinedPath, 'w') as combinedOutput:
            #2----
            for root, _dirs, files in os.walk(baseDirectoryData):
                if filename not in files:
                    continue

                #individual output file of run in subfolder
                with open(os.path.join(root, filename)) as outputFile:
                    if not headersWritten:
                        #3--------------
                        for i in range(nHeaders):
                            line  = outputFile.readline()
                            line  = line.replace(" ", "").replace(",", "\t")
                            nCols = len(line.split('\t'))
                            if i == 0: #set the column number check for first line
                                nColumnCheck = nCols
                            if nCols != nColumnCheck:
                                raise ValueError('wrong number of columns in header=%s'%(i))
                            #a last line without newline would otherwise be
                            #glued to the first line of the next file
                            if line and not line.endswith('\n'):
                                line += '\n'
                            combinedOutput.write(line)
                        headersWritten = True
                    else:
                        #skip the header by reading and not doing anything
                        for _ in range(nHeaders):
                            outputFile.readline()

                    #4 -----------
                    for line in outputFile:
                        nCols = len(line.split(','))
                        if nCols != nColumnCheck:
                            raise ValueError('wrong number of columns in data')
                        line = line.replace(",", "\t")
                        #same guard as for the header lines
                        if not line.endswith('\n'):
                            line += '\n'
                        combinedOutput.write(line)
    


# Writing the H5 file

This writes the combined files to an H5 file. Note that this also works for a
single simulation on a single core, since the combined file then simply holds the data of the one original file.

In [5]:
def addHdf5HeadersAndAttributes(hf,  groupName, filePath):
    """
    Create one (still unfilled) dataset per column of filePath in hf[groupName].

    Every dataset is named after its column, gets the dtype that belongs
    to the type string of the column, has one entry per data line of the
    file, and carries the unit string of the column in its 'units'
    attribute. The column names are printed.

    input:
    hf        = object where hf[groupName] offers create_dataset
    groupName = string, name of an existing group in hf
    filePath  = string, path of the tab-separated combined file

    raises ValueError when a type string is not INT, FLOAT or BOOL

    COMPAS has a strict header format which is assumed here

    1st line = type data (INT, FLOAT BOOL etc)
    2nd line = unit data (Msol, ergs,Tsol etc)
    3th line = column name
    """
    #get header, units names; only needed the headers here
    with open(filePath, 'r') as combinedFile:
        types   = combinedFile.readline().rstrip('\n').split('\t')
        units   = combinedFile.readline().rstrip('\n').split('\t')
        headers = combinedFile.readline().rstrip('\n').split('\t')

    #how many entries will a column in the group have?
    #get the length of the file (minus headers); this counts newline
    #characters like 'wc -l' does, but without starting a shell, so it
    #also works for paths with spaces and on systems without wc
    with open(filePath, 'rb') as rawFile:
        nNewlines = sum(piece.count(b'\n')
                        for piece in iter(lambda: rawFile.read(1 << 20), b''))
    #a file with less than three complete lines has no data at all
    fileLength = max(nNewlines - 3, 0)

    #types is strings need to replace by actual type for h5
    typeToDtype = {'INT': np.int64, 'FLOAT': np.float64, 'BOOL': bool}
    dtypes      = []
    print(headers)
    for nrt, typ in enumerate(types):
        if typ not in typeToDtype:
            raise ValueError("Unrecognised datatype typ=%s - for column %s in file%s "\
                             %(typ, headers[nrt], groupName))
        dtypes.append(typeToDtype[typ])

    #create the datasets in the group of the h5file and add the units
    group = hf[groupName]
    for header, dtype, unit in zip(headers, dtypes, units):
        dset = group.create_dataset(header, dtype=dtype, shape=(fileLength,))
        dset.attrs['units'] = unit
        #dset.attrs['comment']= columnDescriptions

def addHdf5Data(hf,  groupName, filePath, chunkSize=500000):
    """
    Fill the datasets of hf[groupName] with the data lines of filePath.

    The first three lines of the file are the header (types, units,
    column names). Every following line is split on whitespace and its
    n-th entry is written to the dataset named after the n-th column,
    converted to the dtype of that dataset. The datasets have to exist
    already (see addHdf5HeadersAndAttributes()).

    input:
    hf        = object where hf[groupName][columnName] is a dataset
    groupName = string, name of the group in hf
    filePath  = string, path of the tab-separated combined file
    chunkSize = int, number of lines that is loaded in memory at a time

    raises ValueError when chunkSize is smaller than 1
    """
    if chunkSize < 1:
        raise ValueError('chunkSize must be at least 1, got %s'%(chunkSize))

    #get the length of the file (minus headers); this counts newline
    #characters like 'wc -l' does, but without starting a shell, so it
    #also works for paths with spaces and on systems without wc
    with open(filePath, 'rb') as rawFile:
        nNewlines = sum(piece.count(b'\n')
                        for piece in iter(lambda: rawFile.read(1 << 20), b''))
    fileLength = nNewlines - 3

    group = hf[groupName]
    with open(filePath) as dataFile:
        dataFile.readline()  #types are not needed here
        dataFile.readline()  #units are not needed here
        headers = dataFile.readline().rstrip('\n').split('\t')

        #too slow to go line by line, so load in a modest
        #(in terms of memory) amount at a time
        for chunkBegin in range(0, fileLength, chunkSize):
            #dont try to load in more data than you've got
            chunkEnd = min(chunkBegin + chunkSize, fileLength)

            #read in a modest number of lines
            #split() without argument also drops the newline at the end
            rows = [dataFile.readline().split()
                    for _ in range(chunkEnd - chunkBegin)]

            #data is now a 2d array where each column is a specific variable
            data = np.array(rows)

            for nrcolumn, columnName in enumerate(headers):
                #fill in the values in the preshaped array
                #(see dset addHdf5HeadersAndAttributes() )
                dset = group[columnName]
                dset[chunkBegin:chunkEnd] = np.array(data[:, nrcolumn],
                                                     dtype=dset.dtype)

In [6]:
    
def createH5file(baseDirectoryData='.', groups=(1,2,3,4), h5Name='COMPAS_output.h5'):
    """
    Write the combined files of the given groups to a single h5 file.

    The h5 file is written (an existing file is overwritten) as h5Name
    in baseDirectoryData. For every group number in groups a group
    named groupNames[groupNumber] is created and filled from the file
    Combine_'filename' in baseDirectoryData,
    with filename = fileNames[groupNumber].

    input:
    baseDirectoryData = string, folder with the combined files
    groups            = iterable with keys of fileNames / groupNames
    h5Name            = string, name of the h5 file
    """
    #the with-statement closes the h5 file, also when a helper raises
    with h5.File(os.path.join(baseDirectoryData, h5Name), 'w') as hf:
        #use the groupNames dictionary to create
        #the H5file name group. Each of these we will fill with the header and data
        for groupNumber in groups:
            groupName = groupNames[groupNumber]
            filePath  = os.path.join(baseDirectoryData,
                                     'Combine_'+fileNames[groupNumber])
            hf.create_group(groupName)
            addHdf5HeadersAndAttributes(hf, groupName, filePath)
            addHdf5Data(hf, groupName, filePath)

In [7]:
def cleanUpInIsleNumber2Please(baseDirectoryData='.', groups=(1,2,3,4)):
    """
    Remove the Combine_'filename' files of the given groups.

    input:
    baseDirectoryData = string, folder with the combined files
    groups            = iterable with keys of fileNames

    A file that cannot be removed is reported on stderr and does not
    stop the clean up of the other files.
    """
    for groupNumber in groups:
        filePath = os.path.join(baseDirectoryData,
                                'Combine_'+fileNames[groupNumber])
        #os.remove is done when it returns and needs no shell, so paths
        #with spaces or special characters are handled as well
        try:
            os.remove(filePath)
        except OSError as error:
            print('could not remove %s: %s'%(filePath, error), file=sys.stderr)

In [9]:
# folder that holds the .csv files of the COMPAS run
pathToData =  '/Users/floorbroekgaarden/Programs/githubCOMPAS/COMPAS/src'
# group numbers (keys of fileNames) to put in the h5 file
groups = list(range(1, 6))
# step 1: one combined file per group
combineOutputsOfFile(pathToData, groups)
# step 2: combined files -> h5 file
createH5file(pathToData, groups)
# step 3: remove the combined files again
cleanUpInIsleNumber2Please(pathToData, groups)

['ID', 'SEED', 'Time', 'CE_Alpha', 'Lambda@CE_1', 'Lambda@CE_2', 'Binding_Energy<CE_1', 'Binding_Energy<CE_2', 'Eccentricity<CE', 'Eccentricity>CE', 'Separation<CE', 'Separation>CE', 'RocheLobe_1<CE', 'RocheLobe_1>CE', 'RocheLobe_2<CE', 'RocheLobe_2>CE', 'MZAMS_1', 'Mass_1<CE', 'Mass_Env_1', 'Core_Mass_1', 'Radius_1<CE', 'Radius_1>CE', 'Stellar_Type_1<CE', 'Stellar_Type_1', 'Lambda_Fixed_1', 'Lambda_Nanjing_1', 'Loveridge_1', 'Loveridge_Winds_1', 'Kruckow_1', 'BE_Fixed_1', 'BE_Nanjing_1', 'BE_Loveridge_1', 'BE_Loveridge_Winds_1', 'BE_Kruckow_1', 'MZAMS_2', 'Mass_2<CE', 'Mass_Env_2', 'Core_Mass_2', 'Radius_2<CE', 'Radius_2>CE', 'Stellar_Type_2<CE', 'Stellar_Type_2', 'Lambda_Fixed_2', 'Lambda_Nanjing_2', 'Loveridge_2', 'Loveridge_Winds_2', 'Kruckow_2', 'BE_Fixed_2', 'BE_Nanjing_2', 'BE_Loveridge_2', 'BE_Loveridge_Winds_2', 'BE_Kruckow_2', 'MT_History', 'Merger', 'Optimistic_CE', 'CE_Event_Count', 'Double_Core_CE', 'RLOF_1', 'Luminosity<CE_1', 'Teff<CE_1', 'Tau_Dynamical<CE_1', 'Tau_Therm

In [13]:
# as a test to see if the output file has been created, read in a column of the supernova data
# open read-only on purpose: older h5py releases default to append mode,
# which would create a missing file instead of raising an error
f  = h5.File(pathToData+'/COMPAS_output.h5', 'r')
print()
print(f['Supernovae']['SN_Theta_SN'][()])


[-0.4064324   0.02165689  0.4440852  ...  0.4121895   0.6409085
  0.09198198]
