In [1]:
import ast
import os,sys

import numpy as np
import pandas as pd

In [2]:
# Path of the HEPData CSV table to be converted (relative to the working directory)
inputFile = './HEPData-ins1894408-v2-csv/Signaltemplates,DMsimp,spin-1,Monojet.csv'
# Alternative input table (judging by its name, the cutflow table); uncomment to use it instead
# inputFile = './HEPData-ins1894408-v2-csv/CutflowforMonojet,DM,spin-1mediator.csv'

In [3]:
# Try to evaluate data if possible
def myeval(s):
    """
    Convert the text of a CSV field to a Python value when it is a literal.

    The '#:' tag, the surrounding whitespace and all spaces are removed first.
    The cleaned text is then parsed with ast.literal_eval, so only Python
    literals (numbers, quoted strings, tuples, ...) are converted and no code
    coming from the input file is ever executed.

    :param s: raw text of the field (string)

    :return: the parsed literal or, if the text is not a literal,
             the cleaned string
    """

    s = s.replace('#:','').strip().replace(' ','')
    try:
        return ast.literal_eval(s)
    except (ValueError,TypeError,SyntaxError,MemoryError,RecursionError):
        # Not a literal (e.g. 'Vector', 'DM+Z(qq)' or an empty field): keep the text
        return s

# Set default column names    
def fixColumnLabel(c):
    """
    Build the dataframe column name from a raw label.

    'Coupling type' is shortened to 'Coupling' and 'Production mode' to 'Mode'.
    The '#:' tag, spaces and commas are removed and any trailing underscores
    are dropped.

    :param c: raw label (string)

    :return: cleaned label (string). It is empty if nothing is left
             after the clean up.
    """

    newC = c.replace('Coupling type','Coupling')
    newC = newC.replace('Production mode','Mode')
    newC = newC.replace('#:','').replace(' ','').replace(',','')

    # rstrip (instead of testing newC[-1] in a loop) is also safe
    # for empty labels and for labels made only of underscores
    return newC.rstrip('_')

def getDictFrom(block):
    """
    Convert the text block of one parameter point to a dictionary
    which can be used to define a single-row dataframe.

    Each line with at least two comma-separated fields gives one entry:
    the last field is the value (converted with myeval) and the non-empty
    fields before it, joined by '_' and cleaned with fixColumnLabel, give the key.
    The table header lines are skipped. For the lines after the
    'Recoil (GeV),...' header only the fields 1 to 3 are used (presumably the
    lower bin edge, the upper bin edge and the bin content) and the key
    becomes 'bin_<field 1>_<field 2>'.

    :param block: text with the lines for a single parameter point (string)

    :return: dictionary with the column names as keys and
             single-element lists as values
    """

    binsHeader = 'Recoil (GeV),Recoil (GeV) LOW,Recoil (GeV) HIGH'
    cutflowHeader = 'Cut stage,Fraction of passing events'

    rowDict = {}
    inBins = False
    for row in block.split('\n'):
        # Header of the binned data: everything after it is a bin line
        if binsHeader in row:
            inBins = True
            continue
        # Header of the cutflow table: nothing to store
        if cutflowHeader in row:
            continue

        fields = row.split(',')
        if len(fields) < 2:
            continue

        if inBins:
            fields = fields[1:4]
            fields[0] = 'bin_'+fields[0]

        # Blank fields do not enter the column name
        label = '_'.join(part for part in fields[:-1] if part.strip())
        label = fixColumnLabel(label)
        entry = myeval(fields[-1])
        rowDict[label] = [entry]

    return rowDict

In [4]:
# Read the full file. The blocks for each parameter point are assumed
# to be separated by the '#: Coupling' tag
with open(inputFile,'r') as f:
    data = f.read()

# The text before the first tag is the header and is skipped.
# The 'Coupling' label removed by the split is restored in each block.
blocks = ['Coupling'+chunk for chunk in data.split('#: Coupling')[1:]]

In [5]:
# Define one single-row dataframe per block and combine all of them at once
# (calling pd.concat inside a loop copies the accumulated dataframe at every step)
df = pd.concat([pd.DataFrame(getDictFrom(b)) for b in blocks],ignore_index=True)

# Sort according to model point
# (a new dataframe is assigned instead of sorting in place, the original index labels are kept)
df = df.sort_values(['Coupling','Mode','$m_{med}$','$m_{DM}$','Data-takingperiod'],
                    ascending=[False,False,True,True,True])

In [6]:
# Display the sorted dataframe (long/wide frames are shown truncated)
df

Unnamed: 0,Coupling,Mode,$m_{med}$,$m_{DM}$,$g_{q}$,$g_{DM}$,Data-takingperiod,bin_250.0_280.0,bin_280.0_310.0,bin_310.0_340.0,...,bin_690.0_740.0,bin_740.0_790.0,bin_790.0_840.0,bin_840.0_900.0,bin_900.0_960.0,bin_960.0_1020.0,bin_1020.0_1090.0,bin_1090.0_1160.0,bin_1160.0_1250.0,bin_1250.0_1400.0
221,Vector,DM+Z(qq),100.0,1.0,0.25,1.0,2016,505.730000,298.610000,190.390000,...,6.166700,4.985100,2.990100,2.450800,2.107700,1.604500,1.226100,0.301740,0.165720,0.869460
479,Vector,DM+Z(qq),100.0,1.0,0.25,1.0,2017,520.960000,312.420000,198.100000,...,7.354400,4.489800,4.045700,2.688600,1.730300,0.833090,0.781760,1.042700,0.686150,0.792270
737,Vector,DM+Z(qq),100.0,1.0,0.25,1.0,2018,584.340000,352.630000,219.060000,...,7.947000,7.634300,6.090600,3.006000,1.424200,1.628800,0.503560,0.213380,0.782610,1.089600
223,Vector,DM+Z(qq),100.0,30.0,0.25,1.0,2016,484.910000,291.880000,183.480000,...,5.441300,4.229600,2.572800,1.491300,1.739800,0.534330,0.588580,0.292900,0.343160,0.505050
481,Vector,DM+Z(qq),100.0,30.0,0.25,1.0,2017,508.420000,312.660000,181.020000,...,5.483500,4.594600,2.756800,2.478000,1.100300,1.212800,0.337330,0.836170,0.487660,0.577270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,Axial,DM+QCDjets,2500.0,1000.0,0.25,1.0,2017,17.229000,14.166000,10.936000,...,1.828200,1.367700,1.142400,1.098500,0.783460,0.636100,0.594040,0.381990,0.322040,0.844780
546,Axial,DM+QCDjets,2500.0,1000.0,0.25,1.0,2018,20.031000,16.245000,12.895000,...,2.446600,2.094700,1.469300,1.374400,1.016400,0.956840,0.689900,0.526230,0.484880,1.129600
31,Axial,DM+QCDjets,2500.0,1500.0,0.25,1.0,2016,0.029982,0.024051,0.019718,...,0.003742,0.002977,0.002482,0.002342,0.001825,0.001302,0.001259,0.000987,0.000920,0.002347
289,Axial,DM+QCDjets,2500.0,1500.0,0.25,1.0,2017,0.032637,0.026872,0.021935,...,0.004629,0.003457,0.002529,0.002458,0.001991,0.001499,0.001502,0.001132,0.000962,0.002792


In [7]:
# Save the dataframe to a pickle file in the current folder.
# The file name is the input file name without its extension,
# with the commas replaced by underscores and the '_DF.pcl' suffix added
inputName = os.path.basename(inputFile)
pickleFile = os.path.splitext(inputName.replace(',','_'))[0]+'_DF.pcl'
print('Saving to',pickleFile)
df.to_pickle(pickleFile)

Saving to Signaltemplates_DMsimp_spin-1_Monojet_DF.pcl
