In [2]:
import numpy as np
import pandas as pd
import uproot
import csv
import os
import glob
#import doctest
#doctest.testmod(verbose=True)

#### Choose file to work with:

In [3]:
def get_file_name(fname = "analysis.root"):
    """Validate the name of the ROOT file to work with.

    Parameters
    ----------
    fname : str
        Path of the candidate file. It must end in ``.root`` and point to an
        existing regular file.

    Returns
    -------
    str or None
        ``fname`` unchanged when it passes both checks, otherwise ``None``
        (after printing the reason).
    """
    # Reject missing / non-string input and wrong extensions first; value
    # comparison is used here, never identity (`is`) on strings.
    if not isinstance(fname, str) or not fname.endswith('.root'):
        print ("\n No input or wrong file format given... Try again. \n")
        return None

    # Do not retry with the default name: the caller gave an explicit name,
    # and retrying a missing default would recurse without end.
    if not os.path.isfile(fname):
        print('\n No such file... Try again. \n')
        return None

    return fname
        


In [4]:
def test_get_file_name():
    """Placeholder test: no checks have been implemented here yet."""
    return None

#### Get the names of all the trees in the file

In [5]:
def _strip_cycle(key):
    """Turn one directory key into a plain name without its ';<cycle>' suffix."""
    # Keys may arrive as bytes (e.g. b'name;1') or as str ('name;1'),
    # presumably depending on the installed uproot release -- handle both.
    if isinstance(key, bytes):
        key = key.decode("utf-8", errors="replace")
    else:
        key = str(key)

    name, sep, cycle = key.rpartition(";")
    # Only drop the suffix when it really is a numeric cycle of any length.
    if sep and cycle.isdigit():
        return name
    return key


def get_tree_names(f_name):
    """List the names of the objects stored at the top level of a ROOT file.

    Parameters
    ----------
    f_name : str
        Path of the ROOT file, passed to ``uproot.open``.

    Returns
    -------
    list of str or None
        The key names with the trailing ``;<cycle>`` removed, in the order
        reported by the file. ``None`` is returned (after printing an error)
        when the file cannot be opened or its keys cannot be read.
    """
    try:
        f = uproot.open(f_name)
        tree_names = [_strip_cycle(key) for key in f.keys()]
    except Exception:
        # Best effort by design: report and let the caller see None.
        # `Exception` (not a bare except) keeps Ctrl-C working.
        print ("\n ERROR: Trees not found. Check your file before continuing...\n")
        return None

    # Announce success only once every name has actually been collected.
    print("\n Tree names successfully stored. They are: \n")
    for tree in tree_names:
        print(tree)
        print('\n')

    return tree_names



In [6]:
def test_get_tree_names():
    """Placeholder test: no checks have been implemented here yet."""
    return None
    

#### Unrolling the trees and writing them in separate files

In [7]:
def unroll_tree(file_name, ttree, of_name):
    """Dump one tree of a ROOT file into a CSV file.

    Parameters
    ----------
    file_name : str
        Path of the ROOT file, passed to ``uproot.open``.
    ttree : str
        Key of the tree to read from the opened file.
    of_name : str
        Path of the CSV file to create (an existing file is replaced by
        ``DataFrame.to_csv``).

    Returns
    -------
    None
    """
    data = uproot.open(file_name)[ttree]
    names = data.keys()

    # Every branch listed by keys() is read and stored as a string column.
    out = pd.DataFrame.from_dict(data.arrays(names), dtype= str)
    out.to_csv(of_name)
    print ('\nCreated csv file ' + of_name)

In [19]:
def root_tree_to_csv(overwrite = False):
    """Write every tree of the default ROOT file to ``<tree name>.csv``.

    The file name comes from ``get_file_name()`` (called with its default
    argument) and the tree list from ``get_tree_names``.

    Parameters
    ----------
    overwrite : bool
        When False (default), a CSV file that already exists is left alone.
        When True, it is regenerated.

    Returns
    -------
    None
    """
    fname = get_file_name()
    if not fname:
        # No usable file name was returned; nothing to convert.
        return

    woods = get_tree_names(fname)
    if not woods:
        # get_tree_names returns None when reading the file fails; iterating
        # over that would raise a TypeError.
        return

    for tree in woods:
        out_file_name = tree + '.csv'
        already_there = os.path.isfile(out_file_name)

        if already_there and not overwrite:
            print( out_file_name + " already exist and will not be overwritten...")
            continue

        if already_there:
            print ("\nOverwriting tree {} in file {}".format(tree, fname))
        else:
            print ("\nWriting tree {} in file {}".format(tree, fname))
        unroll_tree(fname, tree, out_file_name)

#### Adding a 'label' column


In [18]:
def label_column_writer(infile, outfile, fsignal = "signal_bbA_MA300tree.csv"):
    """Copy a CSV file, appending a ``label`` column to every row.

    Parameters
    ----------
    infile : str
        Path of the CSV file to read. Its first row is treated as the header.
    outfile : str
        Path of the CSV file to write (replaced if it exists).
    fsignal : str
        Name of the signal file. Rows are labelled ``"signal"`` when
        ``infile == fsignal`` (exact string comparison) and ``"background"``
        otherwise.

    Returns
    -------
    None

    Notes
    -----
    An empty input file produces an output containing only the ``label``
    header.
    """
    # The label depends only on the file, so decide it once, not per row.
    label = "signal" if infile == fsignal else "background"

    # newline='' is what the csv module expects for both reading and writing;
    # the `with` block guarantees both handles are flushed and closed.
    with open(infile, 'r', newline='') as src, open(outfile, 'w', newline='') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)

        # Default to an empty header so an empty input does not raise
        # StopIteration.
        headers = next(reader, [])
        headers.append("label")
        writer.writerow(headers)

        for row in reader:
            row.append(label)
            writer.writerow(row)

def select_files():
    """Ask, for each CSV file in the current directory, whether to label it.

    Every file matching ``*.csv`` is offered through ``input``. An answer of
    ``y`` or ``yes`` (any case, surrounding whitespace ignored) selects the
    file; any other answer skips it.

    Returns
    -------
    list of str
        The selected file names, in the order ``glob`` reported them.
    """
    files_to_modify = []

    for f in glob.glob('*.{}'.format('csv')):
        answer = input('\nDo you want to add label column to the file "{}"? y or n'.format(f))
        # Compare by value: `is` on strings only works by accident of
        # interning and is flagged as a SyntaxWarning by recent Pythons.
        if answer.strip().lower() in ('y', 'yes'):
            files_to_modify.append(f)

    return files_to_modify

In [49]:
def add_label_column(f_to_modify = None, overwrite = False):
    """Create a labelled copy (``l_<name>``) of each given CSV file.

    Parameters
    ----------
    f_to_modify : iterable of str, str or None
        File names to process. ``None`` (default) means no files. A single
        string is treated as one file name.
    overwrite : bool
        When False (default), an existing ``l_<name>`` file is left alone.
        When True, it is regenerated.

    Returns
    -------
    None

    Notes
    -----
    The copy is made by ``label_column_writer``. A failure on one file is
    reported and does not stop the remaining files from being processed.
    """
    # A None default avoids sharing one mutable list between calls.
    if f_to_modify is None:
        files = []
    elif isinstance(f_to_modify, str):
        files = [f_to_modify]
    else:
        files = list(f_to_modify)

    print("The following files will be modified: \n")
    print (files)
    print('\n')

    for file in files:
        try:
            out_file = "l_"+ file
            already_there = os.path.isfile(out_file)

            if already_there and not overwrite:
                print( out_file + " already exist and will not be overwritten..\n")
                continue

            if already_there:
                print( "\nOverwriting " + out_file)
            else:
                print( "\nWriting " + out_file)
            label_column_writer(file, out_file)

        except Exception as exc:
            # Still best effort, but per file and with the real cause shown.
            print('\n ERROR: No or invalid csv file found... \n')
            print(' ({!r}: {}: {})'.format(file, type(exc).__name__, exc))

#### Merging the labelled files

In [40]:
def file_merger(outfile_name, overwrite = False):
    """Concatenate every ``l_*`` file of the current directory into one CSV.

    Parameters
    ----------
    outfile_name : str
        Path of the merged CSV file. It is written without the index and
        with ``utf-8-sig`` encoding.
    overwrite : bool
        When False (default), an existing ``outfile_name`` is left alone.
        When True, it is replaced.

    Returns
    -------
    None

    Notes
    -----
    Input files are merged in sorted name order. The output file itself is
    never used as an input, even if its name matches ``l_*``. Errors while
    reading or writing are reported on stdout instead of being raised.
    """
    try:
        # Decide whether writing is allowed before doing any reading.
        if os.path.isfile(outfile_name):
            if not overwrite:
                print( outfile_name + " already exist and will not be overwritten...\n")
                return
            print("Overwriting already existing file " + outfile_name)

        target = os.path.abspath(outfile_name)
        # Sorted for a reproducible row order; the target is skipped so a
        # re-run cannot merge an earlier result into itself.
        all_filenames = sorted(
            f for f in glob.glob('l_*') if os.path.abspath(f) != target
        )
        if not all_filenames:
            print("\n ERROR: No 'l_*' files found to merge...")
            return

        combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ], sort = False)
        combined_csv.to_csv( outfile_name, index=False, encoding='utf-8-sig')

    except Exception as exc:
        # Best effort as before, but show what actually went wrong.
        print('\n ERROR: No columns to parse from file...')
        print(' ({}: {})'.format(type(exc).__name__, exc))

In [41]:
# Convert each tree of the default ROOT file to its own CSV file;
# with the default overwrite=False, existing CSV files are left untouched.
root_tree_to_csv()


 Tree names successfully stored. They are: 

signal_bbA_MA300tree


bkg_DY_nlo1tree


bkg_ttbar_nlotree


signal_bbA_MA300tree.csv already exist and will not be overwritten...
bkg_DY_nlo1tree.csv already exist and will not be overwritten...
bkg_ttbar_nlotree.csv already exist and will not be overwritten...


In [50]:
# Write an "l_"-prefixed copy of each per-tree CSV with an extra 'label' column;
# with the default overwrite=False, existing "l_" files are left untouched.
add_label_column(["signal_bbA_MA300tree.csv", "bkg_DY_nlo1tree.csv", "bkg_ttbar_nlotree.csv"])

The following files will be modified: 

['signal_bbA_MA300tree.csv', 'bkg_DY_nlo1tree.csv', 'bkg_ttbar_nlotree.csv']


l_signal_bbA_MA300tree.csv already exist and will not be overwritten..

l_bkg_DY_nlo1tree.csv already exist and will not be overwritten..

l_bkg_ttbar_nlotree.csv already exist and will not be overwritten..



In [36]:
# Merge the "l_*" files of the current directory into analysis.csv
# (skipped when analysis.csv already exists, since overwrite defaults to False).
file_merger(outfile_name="analysis.csv")

analysis.csv already exist and will not be overwritten...

