In [2]:
import numpy as np
import pandas as pd
import uproot
import csv
import os
import glob
import unittest
#import doctest
#doctest.testmod(verbose=True)

#### Choose file to work with:

In [3]:
def get_file_name(fname = "analysis.root"):
    """Validate the path of the ROOT file to work with.

    Parameters
    ----------
    fname : str
        Candidate path. It must end in ``.root`` and name an existing file.

    Returns
    -------
    str or None
        ``fname`` when both checks pass, otherwise ``None`` (after printing
        which check failed).
    """
    # Truthiness instead of `is not ''`: comparing to a literal by identity
    # is implementation-dependent and a SyntaxWarning on recent Pythons.
    if not fname or not fname.endswith('.root'):
        print ("\n No input or wrong file format given... Try again. \n")
        return None

    # The previous "retry" re-ran the check on the default name, threw the
    # result away, and recursed without bound when the default file was
    # missing. Report the failure and let the caller decide what to do.
    if not os.path.isfile(fname):
        print('\n No such file... Try again. \n')
        return None

    return fname
        


#### Get the names of all the trees in the file

In [79]:
def get_tree_names(f_name):
    """Return the names of the objects stored at the top level of a ROOT file.

    Parameters
    ----------
    f_name : str
        Path handed to ``uproot.open``.

    Returns
    -------
    list of str
        One entry per key of the file, with the trailing ``;<cycle>`` suffix
        removed. Each name is also printed.

    Raises
    ------
    ValueError, FileNotFoundError
        Re-raised unchanged after printing an error message.
    """
    try:
        root_file = uproot.open(f_name)
        tree_names = []

        print("\n Tree names successfully stored. They are: \n")
        for key in root_file.keys():
            # The old slicing of str(key) only worked for the repr of a bytes
            # key with a one-digit cycle (b'tree;1'). Decode bytes explicitly
            # and accept plain str keys as well.
            name = key.decode("utf-8") if isinstance(key, bytes) else str(key)
            # Strip the cycle suffix whatever its number of digits; a key
            # without ';' is kept whole.
            name = name.rsplit(";", 1)[0]
            tree_names.append(name)
            print(name)
            print('\n')

        return tree_names

    except (ValueError, FileNotFoundError):
        print ("\n ERROR: Trees or File not found. Check before continuing...\n")
        raise


In [80]:
# Error-path demo: the recorded output of this cell shows the call ending in
# "ValueError: cannot mmap an empty file".
get_tree_names("a.root")


 ERROR: Trees or File not found. Check before continuing...



ValueError: cannot mmap an empty file

#### Unrolling the trees and writing each one to a separate CSV file

In [83]:
def unroll_tree(file_name, ttree, of_name):
    """Write every branch of one tree to a CSV file.

    Parameters
    ----------
    file_name : str
        Path handed to ``uproot.open``.
    ttree : str
        Key of the tree inside the file.
    of_name : str
        Path of the CSV file to create (overwritten if it exists).

    Returns
    -------
    None

    Notes
    -----
    Exceptions from uproot or pandas propagate unchanged. The previous
    ``try/except`` only re-raised, so removing it does not change what
    callers see.
    """
    data = uproot.open(file_name)[ttree]
    names = data.keys()

    out = pd.DataFrame.from_dict(data.arrays(names), dtype= str)
    # Branch names may arrive as bytes; decode them so the CSV header reads
    # "pt" rather than "b'pt'". str labels pass through untouched.
    out.columns = [
        col.decode("utf-8") if isinstance(col, bytes) else col
        for col in out.columns
    ]
    out.to_csv(of_name)
    print ('\nCreated csv file ' + of_name)

In [None]:
def root_tree_to_csv(file, overwrite = False):
    """Export each tree of a ROOT file to ``<tree name>.csv``.

    Parameters
    ----------
    file : str
        Path of the ROOT file, passed to ``get_tree_names`` and ``unroll_tree``.
    overwrite : bool
        When False, a CSV that already exists is left alone.

    Returns
    -------
    list of str
        The CSV name of every tree, including the ones that were skipped.
    """
    csv_names = []

    for tree_name in get_tree_names(file):
        target = tree_name + '.csv'
        csv_names.append(target)
        already_there = os.path.isfile(target)

        # Guard clause: an existing file is only replaced on request.
        if already_there and not overwrite:
            print( target + " already exist and will not be overwritten...")
            continue

        if already_there:
            print ("\nOverwriting tree {} in file {}".format(tree_name, file))
        else:
            print ("\nWriting tree {} in file {}".format(tree_name, file))
        unroll_tree(file, tree_name, target)

    return csv_names

#### Adding 'label' column


In [None]:
def label_column_writer(infile, outfile, fsignal = "signal_bbA_MA300tree.csv"):
    """Copy a CSV file, appending a ``label`` column to every row.

    Parameters
    ----------
    infile : str
        CSV file to read; its first row is treated as the header.
    outfile : str
        CSV file to write (overwritten if it exists).
    fsignal : str
        Rows are labelled ``"signal"`` when ``infile`` equals this string
        exactly, and ``"background"`` otherwise.

    Raises
    ------
    StopIteration
        If ``infile`` has no header row (raised by ``next`` on the reader).
    """
    # The label depends only on the file name, so decide it once instead of
    # once per row.
    label = "signal" if infile == fsignal else "background"

    # `with` closes (and therefore flushes) both handles even on error; the
    # originals were never closed. newline='' is what the csv module asks for
    # so that it alone controls line endings.
    with open(infile, 'r', newline='') as src, open(outfile, 'w', newline='') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)

        headers = next(reader)
        headers.append("label")
        writer.writerow(headers)

        for row in reader:
            row.append(label)
            writer.writerow(row)

def select_files():
    """Ask, for each ``*.csv`` file in the current directory, whether to label it.

    Returns
    -------
    list of str
        The files for which the user answered ``y``. Surrounding whitespace
        and letter case are ignored.
    """
    files_to_modify = []

    # sorted(): glob gives no ordering guarantee, so fix the prompt order.
    for name in sorted(glob.glob('*.{}'.format('csv'))):
        answer = input('\nDo you want to add label column to the file "{}"? y or n'.format(name))
        # Compare by value; `answer is 'y'` tested object identity and only
        # worked through an interpreter caching detail.
        if answer.strip().lower() == 'y':
            files_to_modify.append(name)

    return files_to_modify

In [None]:
def add_label_column(f_to_modify = None, overwrite = False):
    """Create an ``l_``-prefixed, labelled copy of each given CSV file.

    Parameters
    ----------
    f_to_modify : iterable of str, optional
        CSV files to process. ``None`` or an empty iterable means nothing to do.
    overwrite : bool
        When False, an output file that already exists is left alone.

    Returns
    -------
    list of str or None
        The output path for every input (including skipped ones), or ``None``
        if labelling any file failed. The error is printed in that case.
    """
    # None default instead of a shared mutable `[]`; a None argument is
    # treated like an empty list rather than triggering the error branch.
    files = list(f_to_modify) if f_to_modify else []
    out_files = []
    print("The following files will be modified: \n")
    print (files)
    print('\n')

    try:
        for file in files:
            # Prefix the file name, not the whole path: "l_" + "dir/a.csv"
            # would point into a directory named "l_dir".
            folder, base = os.path.split(file)
            out_file = os.path.join(folder, "l_" + base)
            out_files.append(out_file)

            if os.path.isfile(out_file) and not overwrite:
                print( out_file + " already exist and will not be overwritten..\n")
                continue

            if os.path.isfile(out_file):
                print( "\nOverwriting " + out_file)
            else:
                print( "\nWriting " + out_file)
            label_column_writer(file, out_file)
    except Exception as err:
        # Still best-effort, but no longer a bare `except`: KeyboardInterrupt
        # and SystemExit pass through, and the real cause is shown.
        print('\n ERROR: No or invalid csv file found... \n')
        print(' Details: {!r}'.format(err))
        return None

    return out_files

#### Merging files

In [None]:
def file_merger(infile_names, outfile_name, overwrite = False):
    """Concatenate several CSV files into one.

    Parameters
    ----------
    infile_names : iterable of str or None
        CSV files to merge, in order. When empty or ``None``, every path
        matching ``l_*`` in the current directory is used instead.
    outfile_name : str
        Output path; ``.csv`` is appended when missing.
    overwrite : bool
        When False, an existing output file is left untouched.

    Returns
    -------
    None
        Read/parse/write failures are printed, not raised.
    """
    file_out_name = outfile_name
    if not file_out_name.endswith('.csv'):
        file_out_name = file_out_name + ".csv"

    if os.path.isfile(file_out_name):
        if not overwrite:
            print( file_out_name + " already exist and will not be overwritten...\n")
            return
        print("Overwriting already existing file " + file_out_name)
    else:
        print('Writing {} file'.format(file_out_name))

    # Both branches now merge the files the caller asked for. Previously the
    # overwrite branch used an undefined name (`infile`) and the write branch
    # ignored the argument in favour of a glob.
    sources = list(infile_names) if infile_names else sorted(glob.glob('l_*'))

    try:
        combined_csv = pd.concat([pd.read_csv(f) for f in sources], sort = False)
        combined_csv.to_csv( file_out_name, index=False, encoding='utf-8-sig')
    except (ValueError, OSError) as err:
        # ValueError covers pandas' empty-file/parser errors and an empty
        # `sources` list; OSError covers missing or unreadable files.
        print('\n ERROR: No columns to parse from file...')
        print(' Details: {!r}'.format(err))

In [None]:
# Validate the input path; with no argument get_file_name checks "analysis.root".
FILE = get_file_name()


In [None]:
# Export every tree of FILE to its own CSV and keep the list of CSV names.
CSV_FILES = root_tree_to_csv(file = FILE)

In [None]:
# Write "l_"-prefixed copies of those CSVs with an extra "label" column.
L_CSV_FILES = add_label_column(CSV_FILES)

In [None]:
# Merge the labelled CSV files into "analysis.csv".
file_merger(infile_names= L_CSV_FILES, outfile_name="analysis.csv")

In [86]:
class TestNotebook(unittest.TestCase):
    
    def test_file_merger(self):
        pass

    def test_add_label_column(self):
        pass

    def test_label_column_writer(self):
        pass
    
    def test_root_tree_to_csv(self):
        pass
    
    
    def test_unroll_tree_1(self):
        with self.assertRaises(FileNotFoundError):
            unroll_tree("b.root", "tree", "out_f")
        
        
    def test_unroll_tree(self):
        with self.assertRaises(ValueError):
            f = open("a.root")
            unroll_tree("a.root", "tree", "out_f")
            
            
    def test_get_tree_names(self):
        with self.assertRaises(FileNotFoundError):
            get_tree_names("b.root")
            
    def test_get_tree_names_1(self):
        with self.assertRaises(ValueError):
            f = open("a.root")
            get_tree_names("a.root")
            f.close()
        
    def test_get_file_name(self):
        f = open("a.root")
        self.assertEqual(get_file_name("a.root"), "a.root")
        f.close()
        
    def test_get_file_name_1(self):
        self.assertFalse(get_file_name("a"), "a")
        
    def test_get_file_name_2(self):
        self.assertFalse(get_file_name(""), "")

In [87]:
# Run the tests from inside the notebook: argv=[''] keeps unittest from parsing
# the kernel's own command-line arguments, exit=False keeps it from calling
# sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)


test_add_label_column (__main__.TestNotebook) ... ok
test_file_merger (__main__.TestNotebook) ... ok
test_get_file_name (__main__.TestNotebook) ... ok
test_get_file_name_1 (__main__.TestNotebook) ... ok
test_get_file_name_2 (__main__.TestNotebook) ... ok
test_get_tree_names (__main__.TestNotebook) ... ok
  testMethod()
ok
test_label_column_writer (__main__.TestNotebook) ... ok
test_root_tree_to_csv (__main__.TestNotebook) ... ok
test_unroll_tree (__main__.TestNotebook) ... ok
test_unroll_tree_1 (__main__.TestNotebook) ... 


 No input or wrong file format given... Try again. 


 No input or wrong file format given... Try again. 


 ERROR: Trees or File not found. Check before continuing...


 ERROR: Trees or File not found. Check before continuing...



ok

----------------------------------------------------------------------
Ran 11 tests in 0.009s

OK


<unittest.main.TestProgram at 0x7fdfa5bb11d0>