# Bibliotecas

In [1]:
import sys

# Extend the module search path before the project-local imports below.
# NOTE(review): presumably this directory is where the `lgbm` package lives — confirm.
sys.path.append('/eos/user/t/thenriqu/Dark_Matter/LGBM_hhdm_analysis/')

import pprint
import json
from pathlib import Path
import pickle

# Dataset reading/joining helpers (used as `data.read_files` / `data.join_datasets`).
import hepherolib.data as data

from tqdm import tqdm
import tensorflow as tf
from statsmodels.stats.weightstats import DescrStatsW
from tensorflow.keras.models import load_model

# Project-local LightGBM wrappers.
from lgbm.controllers_lgb_v2 import LGBLearner, LGBModel

# Disable GPUs: with an empty device list TensorFlow sees no GPU and runs on the CPU.
tf.config.set_visible_devices([], 'GPU')

2024-04-12 12:26:16.348184: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-12 12:26:16.642271: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Configuração

In [2]:
# Period / year identifiers used to select the input files and name the output folders.
period = '17'
year_style = 2017
dataset_year = "2017"
basedir = '/eos/user/t/thenriqu/Dark_Matter/Amostras/hhdmAnalysis_deepJet_Regions/datasets'

# The dataset name is the folder that directly contains `basedir`.
dataset_name = Path(basedir).parent.name

# Output folders for processed data and for trained models.
data_path = f"./data/{dataset_name}/{dataset_year}"
models_path = f"./models/{dataset_name}/{dataset_year}"

# Create both folders (and any missing parents); do nothing if they already exist.
for output_dir in (data_path, models_path):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

# Lendo metadata

In [3]:
# Parse the metadata JSON file.
with open("/eos/user/t/thenriqu/Dark_Matter/metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Pull one entry per dataset group out of the "datasets" section.
# Each entry is later indexed by `period` (e.g. `ST.get(period)`).
dataset_groups = metadata.get("datasets")
ST, TT, ZZ, WZ, DY, RESIDUAL, DATA = (
    dataset_groups.get(group_key)
    for group_key in ("ST", "TT", "ZZ", "WZ", "DY", "RESIDUAL", "DATA")
)

# Lendo datasets

In [4]:
# Columns requested from every input file.
variables = [
    "RegionID",
    "evtWeight",
    "MLP_score_torch",
    "LeadingLep_pt",
    "LepLep_pt",
    "LepLep_deltaR",
    "LepLep_deltaM",
    "MET_pt",
    "MET_LepLep_Mt",
    "MET_LepLep_deltaPhi",
    "TrailingLep_pt",
    "MT2LL",
    "Nbjets",
]
ds = data.read_files(basedir, period, mode="normal", features=variables)

# Background groups: key stored in `ds` -> metadata entry listing the samples of that group.
# NOTE(review): `join_datasets` presumably merges the listed samples under the given key — confirm.
background_groups = {
    "ST": ST,
    "TT": TT,
    "ZZ": ZZ,
    "WZ": WZ,
    "DYJetsToLL": DY,
    "Residual": RESIDUAL,
}
for group_name, group_samples in background_groups.items():
    data.join_datasets(ds, group_name, group_samples.get(period), mode="normal")

# Datasets to be used: every signal sample followed by the joined background groups.
signal_names = [dt for dt in ds.keys() if dt.startswith("Signal_")]
used_datasets = signal_names + list(background_groups)

# Report the shape of each dataset that is kept.
for dt_name in used_datasets:
    print(dt_name, ds[dt_name].shape)

# Delete every other dataset. The names are collected first so that `ds`
# is not modified while it is being iterated.
datasets_to_delete = [key for key in list(ds.keys()) if key not in used_datasets]
for unused_name in datasets_to_delete:
    del ds[unused_name]


Loading datasets...


100%|██████████| 73/73 [00:36<00:00,  1.98it/s]


Signal_1000_100 (183832, 13)
Signal_1000_200 (186540, 13)
Signal_1000_300 (186162, 13)
Signal_1000_400 (183883, 13)
Signal_1000_600 (178268, 13)
Signal_1000_800 (149749, 13)
Signal_400_100 (127644, 13)
Signal_400_200 (105645, 13)
Signal_500_100 (147052, 13)
Signal_500_200 (136350, 13)
Signal_500_300 (121847, 13)
Signal_600_100 (156878, 13)
Signal_600_200 (157078, 13)
Signal_600_300 (133327, 13)
Signal_600_400 (129967, 13)
Signal_800_100 (172824, 13)
Signal_800_200 (175289, 13)
Signal_800_300 (174756, 13)
Signal_800_400 (169835, 13)
Signal_800_600 (140947, 13)
ST (74347, 13)
TT (1951619, 13)
ZZ (1400437, 13)
WZ (25638, 13)
DYJetsToLL (6003369, 13)
Residual (974059, 13)


# Modelo metadata

In [5]:
# Prefix shared by the model and statistics file names.
base_model_name = "multi_signal"

# Input columns handed to the models. The prediction cells select `dataset[features]`,
# so this list also fixes the column order of the model input.
# NOTE(review): presumably this must match the order used at training time — confirm.
features = [
    "LeadingLep_pt", "LepLep_deltaM", "LepLep_deltaR", "LepLep_pt",
    "MET_LepLep_Mt", "MET_LepLep_deltaPhi", "MET_pt",
    "MT2LL", "Nbjets", "TrailingLep_pt",
]

# Predict usando LGB

In [6]:
# Load the trained LightGBM classifier from the models folder.
lgb_model = LGBModel(model_fpath=f"{models_path}/LGB_{base_model_name}-clf.model")

# Predict each dataset and store the scores in a new "LGB_score" column.
# The loop variable is deliberately NOT called `dataset_name`: that name is set in the
# configuration cell and would be silently overwritten here.
progress = tqdm(ds.items())
for sample_name, dataset in progress:
    # Show which dataset is being processed (this loop is long-running).
    progress.set_description(sample_name)

    X_features = dataset[features]
    Y_pred = lgb_model.predict(X_features, features)
    dataset["LGB_score"] = Y_pred

100%|██████████| 26/26 [1:18:07<00:00, 180.30s/it]


# Predict usando MLP Keras

In [7]:
# Load the trained Keras MLP checkpoint from the models folder.
mlp_model = load_model(f"{models_path}/MLP_{base_model_name}-checkpoint.h5")

# Load the z-score stats: per feature, a "mean" and a "std" entry.
# The context manager closes the file instead of leaving the handle open.
with open(f"{data_path}/MLP_{base_model_name}-weighted_stats.json", "r") as stats_file:
    zscore = json.load(stats_file)

# Predict each dataset and store the scores in a new "MLP_score_keras" column.
# The loop variable is deliberately NOT called `dataset_name`: that name is set in the
# configuration cell and would be silently overwritten here.
progress = tqdm(ds.items())
for sample_name, dataset in progress:
    # Show which dataset is being processed (this loop is long-running).
    progress.set_description(sample_name)

    # Work on a copy so the original feature columns of `dataset` stay untouched.
    X_features = dataset[features].copy()

    # Since the model was trained under processed data, we need to preprocess it to predict.
    # Whole-column assignment replaces the column, so an integer-typed feature can become
    # float; `.loc[:, feature] = ...` writes in place and warns/raises on such an upcast
    # in recent pandas versions.
    for feature in features:
        X_features[feature] = (X_features[feature] - zscore[feature]["mean"]) / zscore[feature]["std"]

    Y_pred = mlp_model.predict(X_features, batch_size=256)
    dataset["MLP_score_keras"] = Y_pred

  0%|          | 0/26 [00:00<?, ?it/s]



  4%|▍         | 1/26 [00:22<09:10, 22.01s/it]



  8%|▊         | 2/26 [00:46<09:23, 23.48s/it]



 12%|█▏        | 3/26 [01:09<08:52, 23.17s/it]



 15%|█▌        | 4/26 [01:30<08:08, 22.22s/it]



 19%|█▉        | 5/26 [01:50<07:33, 21.61s/it]



 23%|██▎       | 6/26 [02:10<07:02, 21.13s/it]



 27%|██▋       | 7/26 [02:25<06:02, 19.08s/it]



 31%|███       | 8/26 [02:39<05:14, 17.49s/it]



 35%|███▍      | 9/26 [02:56<04:55, 17.37s/it]



 38%|███▊      | 10/26 [03:13<04:35, 17.22s/it]



 42%|████▏     | 11/26 [03:28<04:08, 16.59s/it]



 46%|████▌     | 12/26 [03:46<03:58, 17.00s/it]



 50%|█████     | 13/26 [04:04<03:44, 17.27s/it]



 54%|█████▍    | 14/26 [04:21<03:25, 17.10s/it]



 58%|█████▊    | 15/26 [04:37<03:05, 16.83s/it]



 62%|██████▏   | 16/26 [04:58<03:01, 18.18s/it]



 65%|██████▌   | 17/26 [05:21<02:55, 19.45s/it]



 69%|██████▉   | 18/26 [05:42<02:39, 19.88s/it]



 73%|███████▎  | 19/26 [06:01<02:18, 19.73s/it]



 77%|███████▋  | 20/26 [06:18<01:54, 19.00s/it]



 81%|████████  | 21/26 [06:27<01:19, 15.87s/it]



 85%|████████▍ | 22/26 [11:07<06:20, 95.04s/it]



 88%|████████▊ | 23/26 [14:37<06:29, 129.71s/it]



 92%|█████████▏| 24/26 [14:40<03:03, 91.67s/it] 



 96%|█████████▌| 25/26 [32:26<06:23, 383.91s/it]



100%|██████████| 26/26 [34:51<00:00, 80.45s/it] 


# Salvar predict datasets

Os plots da previsão serão feitos em outro código, pois a previsão do Keras é lenta

In [8]:
# Serialize all datasets (now including the prediction columns) into a single pickle file.
predicted_data_fpath = f"{data_path}/{base_model_name}-predicted-data.pickle"
with open(predicted_data_fpath, "wb") as pickle_file:
    pickle.dump(ds, pickle_file)

# Notificar quando o Notebook acabar

In [9]:
from IPython.display import clear_output, display, HTML, Javascript

# Browser-side snippet: speak the completion message and then show an alert dialog.
notification_script = """
  var msg = new SpeechSynthesisUtterance();
  msg.text = "Processo completo!";
  window.speechSynthesis.speak(msg);
  alert("Processo completo!")
"""

# Rendering the Javascript object makes the notebook front end execute the snippet.
display(Javascript(notification_script))

<IPython.core.display.Javascript object>