<a href="https://colab.research.google.com/github/Rdalzell12/Bayesian-Analysis-of-the-Higgs-Boson-Discovery/blob/main/Another_copy_of_8_TeV_Initial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Connecting Colab to Google Drive so the notebook can read files stored there

from google.colab import drive
#Mounts the Drive at /content/drive; later cells read the dataset zip from /content/drive/MyDrive/ULAB
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Checking that the 8 TeV data zip is present in the mounted Drive
#(this only lists the file - nothing is imported or extracted in this cell)

!ls /content/drive/MyDrive/ULAB/complete_set_of_ATLAS_open_data_samples_July_2016.zip

/content/drive/MyDrive/ULAB/complete_set_of_ATLAS_open_data_samples_July_2016.zip


In [None]:
#Unzipping the file
#
#The archive is extracted with Python's zipfile module rather than the `unzip` shell command:
#when the files already exist, `unzip` stops at an interactive "replace ...? [y]es, [n]o, ..." prompt
#and the cell waits for input. Here, entries that were already extracted are skipped instead,
#so the cell can be re-run without blocking.

import os
import zipfile

zip_path = '/content/drive/MyDrive/ULAB/complete_set_of_ATLAS_open_data_samples_July_2016.zip'
extract_path = '/content/local_data/'
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path) as archive:
    for member in archive.infolist():
        target = os.path.join(extract_path, member.filename)
        if member.is_dir():
            already_extracted = os.path.isdir(target)
        else:
            #Comparing sizes means a file left truncated by an interrupted run is extracted again.
            already_extracted = os.path.isfile(target) and os.path.getsize(target) == member.file_size
        if not already_extracted:
            archive.extract(member, extract_path)

Archive:  /content/drive/MyDrive/ULAB/complete_set_of_ATLAS_open_data_samples_July_2016.zip
replace /content/local_data/.gitignore? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
#Checking what the zip file extracted to
import os
os.listdir('/content/local_data')

#The extracted folder has three top-level entries, as listed below - Data, MC, and .gitignore.
#Data and MC are directories (Data holds .root files, as the next cell shows; MC presumably does too - its contents are not listed in this notebook),
#while .gitignore is a Git configuration file naming paths Git should not track - not important to us.

['.gitignore', 'MC', 'Data']

In [None]:
#Checking which files are in the 'Data' directory of the extracted archive
import os
os.listdir('/content/local_data/Data')

['DataEgamma.root', 'DataMuons.root']

In [None]:
#Variable descriptions of Muon data

!pip install uproot

import uproot

file_path = '/content/local_data/Data/DataMuons.root'

try:
    with uproot.open(file_path) as file:
        if 'mini;1' in file:
            tree = file['mini;1']
            print("Description of variables (branches) in TTree 'mini;1':\n")
            for branch_name in tree.keys():
                try:
                    branch_interpretation = tree[branch_name].interpretation.typename
                    print(f"  - {branch_name}: {branch_interpretation}")
                except Exception as branch_e:
                    print(f"  - {branch_name}: Could not determine type (Error: {branch_e})")
        else:
            print(f"Error: TTree 'mini;1' not found in {file_path}")
except Exception as e:
    print(f"Error opening or processing the file: {e}")

Description of variables (branches) in TTree 'mini;1':

  - runNumber: int32_t
  - eventNumber: int32_t
  - channelNumber: int32_t
  - mcWeight: float
  - pvxp_n: int32_t
  - vxp_z: float
  - scaleFactor_PILEUP: float
  - scaleFactor_ELE: float
  - scaleFactor_MUON: float
  - scaleFactor_BTAG: float
  - scaleFactor_TRIGGER: float
  - scaleFactor_JVFSF: float
  - scaleFactor_ZVERTEX: float
  - trigE: bool
  - trigM: bool
  - passGRL: bool
  - hasGoodVertex: bool
  - lep_n: uint32_t
  - lep_truthMatched: bool[]
  - lep_trigMatched: uint16_t[]
  - lep_pt: float[]
  - lep_eta: float[]
  - lep_phi: float[]
  - lep_E: float[]
  - lep_z0: float[]
  - lep_charge: float[]
  - lep_type: uint32_t[]
  - lep_flag: uint32_t[]
  - lep_ptcone30: float[]
  - lep_etcone20: float[]
  - lep_trackd0pvunbiased: float[]
  - lep_tracksigd0pvunbiased: float[]
  - met_et: float
  - met_phi: float
  - jet_n: uint32_t
  - alljet_n: uint32_t
  - jet_pt: float[]
  - jet_eta: float[]
  - jet_phi: float[]
  - jet_E: 

In [None]:
import uproot
import awkward as ak
from collections import Counter

file_path = '/content/local_data/Data/DataMuons.root'

branches_to_extract = [
    'lep_type'
]

#Seeing the shape (number of leptons/event) of the data.
#NOTE(review): this was written assuming every file describes the same set of events, so the cell would only need to run once -
#confirm that before reusing these counts (or the event indices derived from them) for other files.

try:
    with uproot.open(file_path) as file:
        tree = file['mini;1']
        extracted_data = tree.arrays(branches_to_extract, library='ak')

    #ak.num counts the leptons of every event in one vectorised call; a Python loop calling len() on each of the
    #~7 million events is what made this cell take forever. The result is converted to a plain list because the
    #cell that looks up four-lepton events enumerates row_lengths.
    row_lengths = ak.to_list(ak.num(extracted_data['lep_type'], axis=1))

except Exception as e:
    print(f"Error processing data: {e}")
    #Re-raise: otherwise the code below would fail with a NameError or silently reuse a stale row_lengths from an earlier run.
    raise

#One pass over row_lengths instead of five separate scans; a multiplicity that never occurs counts as 0.
lepton_count_frequency = Counter(row_lengths)
one_lep = lepton_count_frequency[1]
two_lep = lepton_count_frequency[2]
three_lep = lepton_count_frequency[3]
four_lep = lepton_count_frequency[4]
five_lep = lepton_count_frequency[5]

print(one_lep, two_lep, three_lep, four_lep, five_lep)

#Lepton count frequency (recorded from a previous run):
#One lepton: 6374629 events
#Two leptons: 647126
#Three leptons: 6215
#Four leptons: 111
#Five leptons: 3

6374629 647126 6215 111 3


In [None]:
#Finding the indices of the events where four leptons were observed
def get_indices_lc(my_list, target_value):
    """Return the positions in ``my_list`` whose element equals ``target_value``.

    Args:
        my_list: Any iterable; elements are compared with ``==``.
        target_value: The value to look for.

    Returns:
        A list of zero-based positions, in ascending order; empty when nothing matches.
    """
    matching_positions = []
    for position, element in enumerate(my_list):
        if element == target_value:
            matching_positions.append(position)
    return matching_positions
#Positions in row_lengths (one entry per event) where exactly four leptons were counted
four_leps = get_indices_lc(row_lengths, target_value=4)
print(four_leps)

[34389, 43437, 121558, 157944, 177527, 195062, 257924, 301500, 443960, 507361, 642379, 743820, 829867, 845981, 902863, 911298, 1014746, 1186734, 1188124, 1219039, 1331440, 1402560, 1404321, 1426778, 1440851, 1496630, 1565998, 1619915, 1765846, 2014571, 2164503, 2355723, 2417509, 2512054, 2584745, 2651872, 2829777, 2835427, 2856104, 2896298, 2977326, 2978741, 3186580, 3228734, 3256120, 3283872, 3376833, 3394641, 3499880, 3502292, 3564816, 3564974, 3611708, 3619444, 3689319, 4025043, 4028218, 4035525, 4138999, 4289323, 4296621, 4306744, 4339234, 4348409, 4353526, 4373581, 4400993, 4430147, 4441560, 4492788, 4552578, 4595009, 4620502, 4652992, 4664280, 4746014, 4766538, 5016903, 5071256, 5141389, 5346668, 5413264, 5490532, 5533557, 5563519, 5571477, 5656866, 5711158, 5716851, 5740277, 5843607, 5926121, 5943483, 5987501, 6022392, 6205081, 6384116, 6393556, 6397451, 6435272, 6527004, 6621388, 6699498, 6714255, 6787477, 6854117, 6905532, 6915570, 6916495, 6996668, 7024905]


In [None]:
#Code to extract the lepton, MET and jet branches listed below into fixed-width NumPy arrays
import importlib.util
import subprocess
import sys

#On a kernel where uproot/awkward are not installed this cell fails at the imports with
#"ModuleNotFoundError: No module named 'awkward'", so install them into the running kernel's environment if missing.
if any(importlib.util.find_spec(pkg) is None for pkg in ('uproot', 'awkward')):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'uproot', 'awkward'])
    importlib.invalidate_caches()

import numpy as np
import awkward as ak
import uproot
import matplotlib.pyplot as plt
import pandas as pd


file_path = '/content/local_data/Data/DataMuons.root'

branches_to_extract = [
    'lep_pt', 'lep_n', 'lep_truthMatched','lep_trigMatched', 'lep_type', 'lep_charge', 'lep_flag', 'lep_eta', 'lep_phi', 'lep_E', 'lep_z0', 'lep_ptcone30', 'lep_etcone20', 'lep_trackd0pvunbiased', 'lep_tracksigd0pvunbiased', 'met_et', 'met_phi', 'jet_n', 'alljet_n', 'jet_pt', 'jet_eta', 'jet_phi', 'jet_E', 'jet_m', 'jet_jvf', 'jet_trueflav', 'jet_truthMatched', 'jet_SV0', 'jet_MV1'
]

try:
    with uproot.open(file_path) as file:
        tree = file['mini;1']
        extracted_data = tree.arrays(branches_to_extract, library='ak')

    num_events = len(extracted_data)
    fixed_lepton_count = 4
    processed_data_np = {}
    print(num_events)

    for branch_name in branches_to_extract:
        ak_array = extracted_data[branch_name]
        is_jagged_array = ak_array.ndim > 1

        if is_jagged_array:
            #Per-object branches: pad each event to fixed_lepton_count entries (longer events are clipped)
            #and replace the padding with 0 so the result is a regular 2-D array.
            #NOTE(review): the jagged jet_* branches go through this same path, so they are also limited to
            #fixed_lepton_count entries per event - confirm that is intended.
            padded_ak_array = ak.fill_none(ak.pad_none(ak_array, fixed_lepton_count, clip=True), 0)
            processed_data_np[branch_name] = ak.to_numpy(padded_ak_array)
        else:
            #Per-event scalars: repeat the value across the row so every branch has the same 2-D shape.
            repeated_scalar = np.repeat(ak.to_numpy(ak_array), fixed_lepton_count).reshape(num_events, fixed_lepton_count)
            processed_data_np[branch_name] = repeated_scalar

    print(processed_data_np['lep_pt'][:10])

except Exception as e:
    print(f"Error processing data: {e}")
    #Re-raise so a failure is visible and stops a "Run All" instead of leaving stale or missing variables behind.
    raise

#TODO (not run): turn the result into a DataFrame and flatten 'lep_pt' for plotting.
#Kept as comments rather than a bare triple-quoted string, which the notebook would echo as the cell's output.
#python_list = ak.to_list(processed_data_np)
#df = pd.DataFrame(python_list)
#lep_pt_flat = df['lep_pt'].apply(lambda x: x[0])


ModuleNotFoundError: No module named 'awkward'

In [None]:
!pip install uproot

#Trying to make a dataframe with only values in Anson and Colton's existing dataframe
import numpy as np
import awkward as ak
import uproot
import matplotlib.pyplot as plt
import pandas as pd


file_path = '/content/local_data/Data/DataMuons.root'

branches_to_extract = [
    'lep_charge', 'lep_pt', 'lep_eta','lep_phi', 'lep_E', 'lep_type', 'lep. isTightlD', 'lep_ptcone30', 'lep_ptcone20', 'runNumber', 'met_et'
]
try:
    with uproot.open(file_path) as file:
        tree = file['mini;1']
        extracted_data = tree.arrays(branches_to_extract, library='ak')

    num_events = len(extracted_data)
    fixed_lepton_count = 4
    processed_data_np = {}
    print(num_events)

    for branch_name in branches_to_extract:
        ak_array = extracted_data[branch_name]
        is_jagged_array = ak_array.ndim > 1

        if is_jagged_array:
            padded_ak_array = ak.fill_none(ak.pad_none(ak_array, fixed_lepton_count, clip=True), 0)
            processed_data_np[branch_name] = ak.to_numpy(padded_ak_array)
        else:
            repeated_scalar = np.repeat(ak.to_numpy(ak_array), fixed_lepton_count).reshape(num_events, fixed_lepton_count)
            processed_data_np[branch_name] = repeated_scalar

    print(processed_data_np[:][:10])

except Exception as e:
    print(f"Error processing data: {e}")

'''#Making the awkward array into a list and then a dataframe
python_list = ak.to_list(processed_data_np)
df = pd.DataFrame(python_list)

# Flatten the 'lep_pt' data for plotting
lep_pt_flat = df['lep_pt'].apply(lambda x: x[0])'''


Collecting uproot
  Downloading uproot-5.7.1-py3-none-any.whl.metadata (35 kB)
Collecting awkward>=2.8.2 (from uproot)
  Downloading awkward-2.8.12-py3-none-any.whl.metadata (7.5 kB)
Collecting awkward-cpp==51 (from awkward>=2.8.2->uproot)
  Downloading awkward_cpp-51-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (2.2 kB)
Downloading uproot-5.7.1-py3-none-any.whl (393 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.8/393.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading awkward-2.8.12-py3-none-any.whl (913 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m913.7/913.7 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading awkward_cpp-51-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (656 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m656.7/656.7 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: awkward-cpp, awkward, uproot
Successfully 

"#Making the awkward array into a list and then a dataframe\npython_list = ak.to_list(processed_data_np)\ndf = pd.DataFrame(python_list)\n\n# Flatten the 'lep_pt' data for plotting\nlep_pt_flat = df['lep_pt'].apply(lambda x: x[0])"