# How to use:

This notebook automatically downloads the following two datasets from the CERN Open Data Portal and combines them into the single dataset used in the thesis and the lab course:

- Run 2012B: https://opendata.cern.ch/record/12365
- Run 2012C: https://opendata.cern.ch/record/12366

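This notebook extracts all the necessary information from the original files in the ROOT format and saves it to a zipped CSV file for easy use.
Some basic filtering is performed: only events with exactly two muons are kept, and events in which either muon has a negative `Iso3` value (`Muon_pfRelIso03_all`) are dropped.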

Size of download: 8 GB

Size of final dataset: 1.4 GB

Name of final dataset: Run2012BC_DoubleMuons_prefiltered.zip

Set the flag `keepOriginalFiles` to `True` to keep the original .root files downloaded from the CERN Open Data Portal; by default (`False`) they are deleted once the CSV file has been written.

In [1]:
import pandas as pd
import uproot
import numpy as np
import awkward as ak
import requests as rq
import os

# Whether the downloaded original .root files are kept on disk.
# When False, both files are deleted after the zipped CSV has been written.
keepOriginalFiles = False


In [2]:
# Download the two input files from the CERN Open Data Portal.
# Total size is about 7.7 GB -> this may take a while.
# NOTE: rq.get() is called without stream=True, so each response body is
# buffered completely in memory until it is written to disk.

urlRunB = "https://opendata.cern.ch/record/12365/files/Run2012B_DoubleMuParked.root"
urlRunC = "https://opendata.cern.ch/record/12366/files/Run2012C_DoubleMuParked.root"


def _localFileName(response, url):
    """Return the local file name to use for a downloaded response.

    The name is taken from the part after the last "=" of the
    Content-Disposition header. Surrounding quotes and any directory
    components are removed so that the file always ends up in the current
    working directory. If the header is missing or yields no usable name,
    the last path component of ``url`` is used instead.
    """
    disposition = response.headers.get("content-disposition", "")
    candidate = ""
    if "=" in disposition:
        candidate = disposition.split("=")[-1].strip().strip("\"'")
    # never let a server-supplied name point outside the working directory
    candidate = os.path.basename(candidate)
    return candidate or os.path.basename(url)


fileB = rq.get(urlRunB)
# fail right away on an HTTP error instead of saving an error page as a .root file
fileB.raise_for_status()
fileNameB = _localFileName(fileB, urlRunB)

fileC = rq.get(urlRunC)
fileC.raise_for_status()
fileNameC = _localFileName(fileC, urlRunC)

In [3]:
# Write both downloaded payloads to disk, run B first and then run C,
# each under the file name stored alongside its response.
for targetName, response in ((fileNameB, fileB), (fileNameC, fileC)):
    with open(targetName, 'wb') as outFile:
        outFile.write(response.content)

In [6]:
# Combine the two-muon events of both runs into one DataFrame, apply the basic
# filtering and save the result as a zipped CSV file.
with uproot.open(fileNameB+":Events") as events1, uproot.open(fileNameC+":Events") as events2:

    # boolean masks selecting the events that contain exactly 2 muons
    mask1 = events1['nMuon'].array(library="np") == 2
    mask2 = events2['nMuon'].array(library="np") == 2

    # names of the output columns, used once per muon
    # the columns dxy, dz and Iso3 are currently only in use in the analysis behind the thesis
    colNames = ['pt','eta','phi','Q','dxy','dz','Iso3']

    # branch names in the root file, in the same order as colNames
    rootCols = ['Muon_pt','Muon_eta','Muon_phi','Muon_charge','Muon_dxy','Muon_dz','Muon_pfRelIso03_all']
    # dictionary to translate from root column names to df column names
    names = dict(zip(rootCols, colNames))

    # Collect every column first and build the DataFrame in a single step.
    # Assigning columns one by one to a pre-created empty frame would expand
    # all not-yet-filled columns to full-length NaN columns on the first
    # assignment.
    columns = {}
    for c in rootCols:
        # after masking, every selected event holds exactly two entries, so
        # the result converts to a regular array of shape (nEvents, 2)
        data1 = ak.to_numpy(events1[c].array()[mask1])
        data2 = ak.to_numpy(events2[c].array()[mask2])
        data = np.concatenate((data1, data2))
        columns["mu1", names[c]] = data[:, 0]  # first muon of each event
        columns["mu2", names[c]] = data[:, 1]  # second muon of each event

    # the tuple keys become a two-level column index: (muon, quantity)
    df = pd.DataFrame(columns)

    # basic filtering: keep only events where both muons have Iso3 >= 0
    df = df[(df["mu1", "Iso3"] >= 0) & (df["mu2", "Iso3"] >= 0)]
    df = df.sort_index(axis=1)

    # save as zipped csv
    df.to_csv("Run2012BC_DoubleMuons_prefiltered.zip", index=False)

# both root files are closed once the with-block has been left
if not keepOriginalFiles:
    os.remove(fileNameB)
    os.remove(fileNameC)