In [3]:
import numpy as np
import pandas as pd
import uproot
import csv
import os
import glob
import unittest
#import doctest
#doctest.testmod(verbose=True)

#### Choose file to work with:

In [4]:
def get_file_name(fname = "analysis.root"):
    
    file_name = fname
    if file_name is not '' and file_name.endswith('.root'):
        if os.path.isfile(file_name):
            return file_name
        else:
            print('\n No such file... Try again. \n')
            get_file_name()
    else:
        print ("\n No input or wrong file format given... Try again. \n")
        get_file_name()
        


#### Get the names of all the trees in the file

In [5]:
def get_tree_names(f_name):
    
    try:
        f = uproot.open(f_name)
        trees = f.keys()
        tree_names=[]

        print("\n Tree names successfully stored. They are: \n")
        for tree in trees:
            tree = str(tree)
            tree = tree[2:len(tree)-3]
            tree_names.append(tree)
            print(tree)
            print('\n')
            
        return tree_names

    except (ValueError, FileNotFoundError):
        print ("\n ERROR: Trees or File not found. Check before continuing...\n")
        raise


#### Unrolling the trees and writing them in separate files

In [6]:
def unroll_tree(file_name, ttree, of_name):
    files_out= [] 
    data = uproot.open(file_name)[ttree]
    names = data.keys()

    try:
        out = pd.DataFrame.from_dict(data.arrays(names), dtype= str)
        out.to_csv(of_name)
        files_out.append(of_name)
        print ('\nCreated csv file ' + of_name)
    except (ValueError, FileNotFoundError):
        raise

In [195]:
def root_tree_to_csv(file, overwrite = False):
    try:
        if not os.path.isfile(file):
            raise FileNotFoundError

        elif not file.endswith('.root'):
            raise ValueError

        fname = file
        woods = get_tree_names(fname)
        files_out = []

        for tree in woods:    
            out_file_name = tree + '.csv'
            files_out.append(out_file_name)

            if os.path.isfile(out_file_name):
                if overwrite:
                    print ("\nOverwriting tree {} in file {}".format(tree, fname))
                    unroll_tree(fname, tree, out_file_name)                
                else:
                    print( out_file_name + " already exist and will not be overwritten...")

            else: 
                print ("\nWriting tree {} in file {}".format(tree, fname))
                unroll_tree(fname, tree, out_file_name)

        return files_out
    except:
        print ('ERROR: No file or wrong file format...')
        raise

#### Adding 'label' column


In [106]:
def label_column_writer(infile, outfile, fsignal = "signal_bbA_MA300tree.csv"):
    if not infile.endswith('.csv') or not outfile.endswith('.csv'):
        raise ValueError
    reader = csv.reader(open(infile, 'r'))
    writer = csv.writer(open(outfile, 'w'))
    headers = next(reader)
    headers.append("label")

    writer.writerow(headers)
    
    for row in reader:
        if infile == fsignal:
            row.append("signal")
            writer.writerow(row)
        else:
            row.append("background")
            writer.writerow(row)

In [183]:
def add_label_column(f_to_modify = [], overwrite = False):
    try:
        if f_to_modify == []: 
            raise FileNotFoundError
        
        for file in f_to_modify:
            if not os.path.isfile(file):
                raise FileNotFoundError
            elif not file.endswith('.csv'):
                raise ValueError
        
        else:
            files = f_to_modify
            out_files = []
            print("The following files will be modified: \n")
            print (files)
            print('\n')
            if len(files) is not 0:
                for file in files:
                    out_file = "l_"+ file
                    out_files.append(out_file)
                    if os.path.isfile(out_file):
                        if overwrite:
                            print( "\nOverwriting " + out_file) 
                            label_column_writer(file, out_file)
                        else: 
                            print( out_file + " already exist and will not be overwritten..\n")
                    else:
                        print( "\nWriting " + out_file)
                        label_column_writer(file, out_file)
            return out_files
    except:
        print('\n ERROR: No or invalid csv file found... \n')
        raise

#### merging files

In [137]:
def file_merger(infile_names, outfile_name = 'output.csv', overwrite = False):
    
    file_out_name = outfile_name
    infiles = []
    
    for file in infile_names:
        if os.path.isfile(file):
            infiles.append(file)
        else:
            print (file)
            raise FileNotFoundError
        
    if not file_out_name.endswith('.csv'):
        file_out_name = file_out_name + ".csv"
        
    try:
        if os.path.isfile(file_out_name):
            if not overwrite: 
                print( file_out_name + " already exist and will not be overwritten...\n")
        
        
        
            if overwrite:
                
                print("Overwriting already existing file " + file_out_name)
                
                all_filenames = [f for f in infiles]
                combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ], sort = False)
                
                
                combined_csv.to_csv( file_out_name, index=False, encoding='utf-8-sig')
        else:
            print('Writing {} file'.format(file_out_name))
            all_filenames = [f for f in infiles]
            combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ], sort = False)
            combined_csv.to_csv( file_out_name, index=False, encoding='utf-8-sig')
    except:
        print('\n ERROR: No columns to parse from file...')
        raise

In [138]:
FILE = get_file_name('analysis.root')


In [139]:
CSV_FILES = root_tree_to_csv(file = FILE)


 Tree names successfully stored. They are: 

signal_bbA_MA300tree


bkg_DY_nlo1tree


bkg_ttbar_nlotree


signal_bbA_MA300tree.csv already exist and will not be overwritten...
bkg_DY_nlo1tree.csv already exist and will not be overwritten...
bkg_ttbar_nlotree.csv already exist and will not be overwritten...


In [53]:
L_CSV_FILES = add_label_column(CSV_FILES)

The following files will be modified: 

['signal_bbA_MA300tree.csv', 'bkg_DY_nlo1tree.csv', 'bkg_ttbar_nlotree.csv']


l_signal_bbA_MA300tree.csv already exist and will not be overwritten..

l_bkg_DY_nlo1tree.csv already exist and will not be overwritten..

l_bkg_ttbar_nlotree.csv already exist and will not be overwritten..



In [54]:
for file in L_CSV_FILES:
    print (file)

l_signal_bbA_MA300tree.csv
l_bkg_DY_nlo1tree.csv
l_bkg_ttbar_nlotree.csv


In [55]:
file_merger(infile_names= L_CSV_FILES, outfile_name="analysis.csv", overwrite = True)

Overwriting already existing file analysis.csv


In [198]:
class TestNotebook(unittest.TestCase):
    
    def test_file_merger(self):
        with self.assertRaises(ValueError):
            file_merger([])
            
    def test_file_merger_1(self):
        with self.assertRaises(ValueError):
            f = open("a.root", 'w+')
            file_merger(f)
            os.remove("a.root")

    def test_file_merger_2(self):
        with self.assertRaises(FileNotFoundError):
            file_merger('b.root')
     
    
    
    def test_add_label_column(self):
        with self.assertRaises(FileNotFoundError):
            add_label_column([])
            
    def test_add_label_column_1(self):
        with self.assertRaises(ValueError):
            f = open("a.root", 'w+')
            add_label_column(["a.root"])
            os.remove("a.root")
            
    def test_add_label_column_2(self):
        with self.assertRaises(FileNotFoundError):
            os.remove("a.csv")
            add_label_column(["a.csv"])
    
    
    def test_label_column_writer(self):
        with self.assertRaises(FileNotFoundError):
            label_column_writer('a.csv', 'b.csv')
            
    def test_label_column_writer_1(self):
        with self.assertRaises(ValueError):
            label_column_writer('a.root', 'b.csv')
            
    def test_label_column_writer_2(self):
        with self.assertRaises(ValueError):
            f = open("a.csv", 'w+')
            label_column_writer('a.csv', 'b.root')
            os.remove("a.csv")
    
    
    def test_root_tree_to_csv(self):
        with self.assertRaises(ValueError):
            f = open("a.root", 'w+')
            get_tree_names("a.root")
            os.remove("a.root")
    def test_root_tree_to_csv_1(self):
        with self.assertRaises(ValueError):
            f = open("a.csv", 'w+')
            get_tree_names("a.csv")
            os.remove("a.csv")
    def test_root_tree_to_csv_2(self):
        with self.assertRaises(FileNotFoundError):
            get_tree_names("b.root")
    

    def test_unroll_tree(self):
        with self.assertRaises(ValueError):
            f = open("a.root", 'w+')
            unroll_tree("a.root", "tree", "out_f")
            os.remove("a.root")
    def test_unroll_tree_1(self):
        with self.assertRaises(FileNotFoundError):
            unroll_tree("b.root", "tree", "out_f")

      
    
    
    def test_get_tree_names(self):
        with self.assertRaises(FileNotFoundError):
            get_tree_names("b.root")
            
    def test_get_tree_names_1(self):
        with self.assertRaises(ValueError):
            f = open("a.root", 'w+')
            get_tree_names("a.root")
            os.remove("a.root")

    def test_get_file_name(self):
        f = open("a.root", 'w+')
        self.assertEqual(get_file_name("a.root"), "a.root")
        os.remove("a.root")
    
    
    def test_get_file_name_1(self):
        self.assertFalse(get_file_name("a"), "a")
        
    def test_get_file_name_2(self):
        self.assertFalse(get_file_name(""), "")
    
    

In [199]:
def clean():
    os.remove('a.root')
    os.remove('a.csv')
    #os.remove('b.csv')


      

unittest.main(argv=[''], verbosity=2, exit=False)
clean()


test_add_label_column (__main__.TestNotebook) ... ok
  testMethod()
ok
test_add_label_column_2 (__main__.TestNotebook) ... ok
test_file_merger (__main__.TestNotebook) ... ok
test_file_merger_1 (__main__.TestNotebook) ... ok
test_file_merger_2 (__main__.TestNotebook) ... ok
test_get_file_name (__main__.TestNotebook) ... ok
test_get_file_name_1 (__main__.TestNotebook) ... ok
test_get_file_name_2 (__main__.TestNotebook) ... ok
test_get_tree_names (__main__.TestNotebook) ... ok
test_get_tree_names_1 (__main__.TestNotebook) ... ok
test_label_column_writer (__main__.TestNotebook) ... ok
test_label_column_writer_1 (__main__.TestNotebook) ... ok
  testMethod()
ok
test_root_tree_to_csv (__main__.TestNotebook) ... ok
test_root_tree_to_csv_1 (__main__.TestNotebook) ... ok
test_root_tree_to_csv_2 (__main__.TestNotebook) ... ok
test_unroll_tree (__main__.TestNotebook) ... ok
test_unroll_tree_1 (__main__.TestNotebook) ... 


 ERROR: No or invalid csv file found... 


 ERROR: No or invalid csv file found... 

Writing output.csv file

 ERROR: No columns to parse from file...
Writing output.csv file

 ERROR: No columns to parse from file...
b

 No input or wrong file format given... Try again. 


 No input or wrong file format given... Try again. 


 ERROR: Trees or File not found. Check before continuing...


 ERROR: Trees or File not found. Check before continuing...


 ERROR: Trees or File not found. Check before continuing...


 ERROR: Trees or File not found. Check before continuing...


 ERROR: Trees or File not found. Check before continuing...



ok

----------------------------------------------------------------------
Ran 19 tests in 0.018s

OK
