In [1]:
import os
from pathlib import Path

In [2]:
import pandas as pd

In [3]:
from ROOT import TFile , TH1F , TCanvas

Welcome to JupyROOT 6.26/06


In [4]:
from test_known_histos import best_fit_distribution
from test_known_histos import make_pdf

In [5]:
def get_pdf(data,bins=100):
    """Return the PDF built from the "invgauss" fit of `data`.

    Calls `best_fit_distribution(data, bins)` and scans its results in order for
    the first entry whose distribution (`entry[0]`) has `name == "invgauss"`.
    That entry's two leading items are passed to `make_pdf`.

    Args:
        data: sample to fit; forwarded unchanged to `best_fit_distribution`.
        bins: number of bins forwarded to `best_fit_distribution` (default 100).

    Returns:
        Whatever `make_pdf` returns for the matching entry.
        NOTE(review): callers in this notebook index it via `.index`, so it is
        presumably a pandas Series - confirm in `test_known_histos`.

    Raises:
        ValueError: if no "invgauss" entry is present in the fit results.
            (Previously this surfaced as an UnboundLocalError on `pdf`.)
    """
    best_distibutions = best_fit_distribution(data, bins)
    for dis in best_distibutions:
        # dis[0] is the distribution object; dis[1] is presumably its fitted
        # parameters (TODO confirm against best_fit_distribution).
        if dis[0].name == "invgauss":
            return make_pdf(dis[0],dis[1])
    raise ValueError(
        'best_fit_distribution returned no "invgauss" fit for the given data'
    )

In [6]:
def get_pdf_geninv(data,bins=100):
    """Return the PDF built from the "geninvgauss" fit of `data`.

    Calls `best_fit_distribution(data, bins)` and scans its results in order for
    the first entry whose distribution (`entry[0]`) has `name == "geninvgauss"`.
    That entry's two leading items are passed to `make_pdf`.

    Args:
        data: sample to fit; forwarded unchanged to `best_fit_distribution`.
        bins: number of bins forwarded to `best_fit_distribution` (default 100).

    Returns:
        Whatever `make_pdf` returns for the matching entry.

    Raises:
        ValueError: if no "geninvgauss" entry is present in the fit results.
            (Previously this surfaced as an UnboundLocalError on `pdf`.)
    """
    best_distibutions = best_fit_distribution(data, bins)
    for dis in best_distibutions:
        # dis[0] is the distribution object; dis[1] is presumably its fitted
        # parameters (TODO confirm against best_fit_distribution).
        if dis[0].name == "geninvgauss":
            return make_pdf(dis[0],dis[1])
    raise ValueError(
        'best_fit_distribution returned no "geninvgauss" fit for the given data'
    )

In [7]:
def get_pdf_gengamma(data,bins=100):
    """Return the PDF built from the "gengamma" fit of `data`.

    Calls `best_fit_distribution(data, bins)` and scans its results in order for
    the first entry whose distribution (`entry[0]`) has `name == "gengamma"`.
    That entry's two leading items are passed to `make_pdf`.

    Args:
        data: sample to fit; forwarded unchanged to `best_fit_distribution`.
        bins: number of bins forwarded to `best_fit_distribution` (default 100).

    Returns:
        Whatever `make_pdf` returns for the matching entry.

    Raises:
        ValueError: if no "gengamma" entry is present in the fit results.
            (Previously this surfaced as an UnboundLocalError on `pdf`.)
    """
    best_distibutions = best_fit_distribution(data, bins)
    for dis in best_distibutions:
        # dis[0] is the distribution object; dis[1] is presumably its fitted
        # parameters (TODO confirm against best_fit_distribution).
        if dis[0].name == "gengamma":
            return make_pdf(dis[0],dis[1])
    raise ValueError(
        'best_fit_distribution returned no "gengamma" fit for the given data'
    )

In [8]:
def Fill_h_inv(h,pdf):
    """Fill `h` with the mirrored PDF: each label x goes in at 1 - x.

    For every label in `pdf.index`, calls `h.Fill(1 - x, weight)` where the
    weight is the value stored in `pdf` under that label. Returns None.
    """
    for label in pdf.index:
        mirrored = 1-float(label)
        weight = float(pdf[label])
        h.Fill(mirrored, weight)

In [9]:
def Fill_h(h,pdf):
    """Fill `h` with the PDF: each label x goes in at x with its PDF value as weight.

    For every label in `pdf.index`, calls `h.Fill(x, weight)` where the weight
    is the value stored in `pdf` under that label. Returns None.
    """
    for label in pdf.index:
        position = float(label)
        weight = float(pdf[label])
        h.Fill(position, weight)

In [10]:
def getall(d):
    """Recursively yield every non-folder object stored under directory `d`.

    Walks `d.GetListOfKeys()` in order. Keys reporting `IsFolder()` are fetched
    with `d.Get(name)` and descended into; every other key's object is yielded
    directly. Only the objects are yielded, not their paths.
    """
    for entry in d.GetListOfKeys():
        obj_name = entry.GetName()
        if not entry.IsFolder():
            yield d.Get(obj_name)
            continue
        # Folder: descend depth-first before moving on to the next key.
        yield from getall(d.Get(obj_name))

In [11]:
def get_all_dict(file_path):
    """Load every object of a ROOT file into a dict keyed by object name.

    Opens `file_path` with `TFile`, walks it with `getall`, calls
    `SetDirectory(0)` on each object and stores it under `GetName()`.
    Objects sharing a name overwrite each other; the last one wins.

    Args:
        file_path: path of the ROOT file to read.

    Returns:
        dict mapping object name -> object.
        NOTE(review): every object must support `SetDirectory`, so the file is
        presumably expected to contain only histograms - confirm.
    """
    hist_dict={}
    f=TFile(file_path)
    try:
        for hh in getall(f):
            # Detach from the file so the object stays valid after f.Close().
            hh.SetDirectory(0)
            hist_dict[hh.GetName()]=hh
    finally:
        # The original never closed the file, leaking one handle per call.
        f.Close()
    return hist_dict

In [12]:
def sum_histos(histo_list):
    """Sum 1D histograms bin by bin into a new detached TH1F named "sum".

    The result spans [0, 1] and takes its bin count from the first histogram in
    `histo_list`. For each input, bins 1..GetNbinsX() are added by bin index:
    contents are summed and errors are combined in quadrature.

    Two fixes over the previous version:
      * Per-bin error storage is allocated before any content is set. Without
        it, the first error read after `SetBinContent` returned sqrt(content)
        and leaked into the summed error.
      * Errors are added in quadrature instead of linearly.

    Args:
        histo_list: non-empty sequence of histograms exposing `GetNbinsX`,
            `GetBinContent` and `GetBinError`.

    Returns:
        The summed TH1F, not attached to any directory.
    """
    result=TH1F("sum","sum",histo_list[0].GetNbinsX(),0.,1.)
    result.SetDirectory(0)
    # Allocate explicit error storage while the histogram is still empty, so
    # GetBinError reflects only what SetBinError stored below.
    if not result.GetSumw2N():
        result.Sumw2()
    for histo in histo_list:
        for i in range(1, histo.GetNbinsX()+1):
            err_=result.GetBinError(i)
            add_err=histo.GetBinError(i)
            result.SetBinContent(i, result.GetBinContent(i)+histo.GetBinContent(i))
            result.SetBinError(i, (err_*err_ + add_err*add_err) ** 0.5)
    return result

In [13]:
def refine_histos(file):
    """Build a smoothed score histogram for one classifier-output CSV.

    Steps, as implemented below:
      1. Derive the output directory from the CSV's parent directory by
         replacing the basename of the global `folder_out` with
         "05_refined_ML_output", made relative to the current directory.
      2. Find the first key of the global `refine_rules` that occurs in the
         path `file`. The sibling ROOT file (path up to "_<key>", plus ".root")
         supplies a reference histogram named "<sample>_<channel>"; its
         integral is the normalisation target.
      3. Fit the 'scores' column according to the rule's method:
           1 - "invgauss" fit of the first 20000 scores, filled as is.
           4 - "invgauss" fit of 1 - score for the first 15000 scores, filled mirrored.
           2 - scores < 0.5 fitted with "invgauss"; 1 - score for scores >= 0.5
               fitted with "gengamma" and filled mirrored.
           3 - as method 2, with the two distributions swapped.
         For methods 2 and 3 each half is scaled to its share of the entries
         before the halves are summed.
      4. Rename the histogram, scale it to the reference integral, draw it on a
         log-y canvas and save a PNG into the output directory.

    Args:
        file: path of a CSV file containing a 'scores' column.

    Returns:
        The refined TH1F, detached from any directory.

    Raises:
        ValueError: if no key of `refine_rules` occurs in `file`, or if the
            matched rule is not one of the methods 1-4.
        KeyError: if the reference histogram is missing from the ROOT file.
    """
    c1=TCanvas("c1","",800,600)
    c1.SetLogy()
    parent_dir=os.path.dirname(file)

    save_dir=parent_dir.replace(
        os.path.basename( folder_out ),
        "05_refined_ML_output"
    )
    save_dir=os.path.relpath(save_dir, os.getcwd())

    # Pure-Python equivalent of `mkdir -p`: no shell involved, so paths with
    # spaces or shell metacharacters are safe; concurrent workers are fine too.
    os.makedirs(save_dir, exist_ok=True)
    df = pd.read_csv(file)
    p=Path(file)
    channel=os.path.basename(parent_dir)

    # First matching key wins, so the order of `refine_rules` matters.
    for key in refine_rules.keys():
        if key in file:
            hh=get_all_dict(
                file.split(f"_{key}")[0]+".root"
            )
            alghoname=p.stem.split(key)[0]
            name=p.stem.removeprefix(alghoname)+"_"+channel
            reference=hh.get(name)
            if reference is None:
                raise KeyError(
                    f"histogram {name!r} not found in the ROOT file for {file!r}"
                )
            integral=reference.Integral()

            method=refine_rules.get(key)
            break
    else:
        # Without a match, `method`, `name` and `integral` would be undefined.
        raise ValueError(f"no refine rule matches {file!r}")

    if (method == 1):
        data = pd.Series(df['scores'][:20000])
        pdf=get_pdf(data)

        h=TH1F("h1","",100,0.0,1.0)
        h.SetDirectory(0)
        Fill_h(h,pdf)
    elif (method == 4):
        # Fit the mirrored scores, then mirror back while filling.
        data = pd.Series([1.0-score for score in df['scores'][:15000] ])
        pdf=get_pdf(data)
        h=TH1F("h1","",100,0.0,1.0)
        h.SetDirectory(0)
        Fill_h_inv(h,pdf)
    elif method in (2, 3):
        # Split at 0.5; the upper half is mirrored so both fits see [0, 0.5).
        data1=[score for score in df['scores'] if score < 0.5]
        data2=[1.-score for score in df['scores'] if score >= 0.5]

        # Share of entries in each half, computed before truncation.
        ewgt1=float( len(data1) )/float( len(data1) + len(data2) )
        ewgt2=1.0-ewgt1

        data1=pd.Series(data1[:20000])
        data2=pd.Series(data2[:20000])

        if (method == 2):
            pdf1=get_pdf(data1)
            pdf2=get_pdf_gengamma(data2)
        else:
            pdf1=get_pdf_gengamma(data1)
            pdf2=get_pdf(data2)

        h1=TH1F("h1","",100,0.0,1.0)
        h2=TH1F("h2","",100,0.0,1.0)

        h1.SetDirectory(0)
        h2.SetDirectory(0)

        Fill_h(h1,pdf1)
        Fill_h_inv(h2,pdf2)

        # Bins 1-50 cover [0, 0.5) and bins 51-100 cover [0.5, 1).
        h1.Scale(integral*ewgt1/h1.Integral(1,50))
        h2.Scale(integral*ewgt2/h2.Integral(51,100))

        h=sum_histos([h1,h2])
    else:
        raise ValueError(f"unsupported refine method {method!r} for key {key!r}")
    h.SetName(name)
    try:
        h.Scale(integral/h.Integral())
    except ZeroDivisionError:
        # Empty refined histogram: leave it unscaled and flag it.
        print("check the ", name, " histogram")
    h.Draw()
    c1.SaveAs(
        os.path.join(
            save_dir,
            alghoname+"_"+key+".png"
        )
    )
    print(name, integral)
    return h


In [14]:
# Classifier outputs live in "04_ML_classification", next to the directory this
# notebook runs from; every CSV below it is a candidate input.
folder_out=os.path.join(os.path.dirname(os.getcwd()), "04_ML_classification")
path_root = Path(folder_out)
files=list(map(Path.as_posix, path_root.glob('**/*.csv')))

In [15]:
# Sample-name fragment -> refinement method used by refine_histos.
# Insertion order matters: refine_histos takes the first key found in a path.
refine_rules={}
refine_rules.update({"Tau_LQ":3, "LQ_LQ":4})
refine_rules.update(dict.fromkeys(["ttbar", "stop"], 2))
refine_rules.update(dict.fromkeys(["ww", "wz", "zz", "w_jets", "z_jets"], 1))

# The leading "/" keeps "/b_tau_tau_*" from also matching "b_b_tau_tau_*" paths.
channels=[
    "/b_tau_tau_hadronic", "/b_b_tau_tau_hadronic",
    "/b_tau_tau_semileptonic", "/b_b_tau_tau_semileptonic",
]
# 1000 to 2500 inclusive, in steps of 250.
masses=range(1000, 2500 + 1, 250)
algorithms=["Gradient_Boosting"]


In [16]:
from multiprocessing import Pool

# For every (algorithm, channel, mass) combination: refine all matching CSV
# files in parallel and store the histograms in <save_dir>/<algorithm>.root.
for a in algorithms:
    for ch in channels:
        for mass in masses:
            # A file belongs to this combination when its path contains the
            # mass tag, the channel directory and the algorithm name.
            newfiles=[
                file for file in files
                if f"MLQ_{mass}" in file and ch in file and a in file
            ]
            if not newfiles:
                # Previously this crashed with IndexError on newfiles[0].
                print(f"no CSV inputs for {a}{ch} MLQ_{mass}; skipping")
                continue
            parent_dir=os.path.dirname(newfiles[0])
            save_dir=parent_dir.replace(
                os.path.basename( folder_out ),
                "05_refined_ML_output"
            )

            # Pure-Python equivalent of `mkdir -p`, without going through a shell.
            os.makedirs(save_dir, exist_ok=True)
            with Pool(16) as p:
                histos=p.map(refine_histos,newfiles)
            out_path=os.path.join(save_dir,a+".root")
            f = TFile(out_path,"RECREATE")
            try:
                for h in histos:
                    h.Write(h.GetName())
            finally:
                # Close explicitly so the output is flushed and the handle is
                # released; the original left every file open.
                f.Close()
            print(out_path)


w_jets_b_tau_tau_hadronic 88197.03344726562
ww_b_tau_tau_hadronic 436.520094871521
wz_b_tau_tau_hadronic 452.2968470752239
z_jets_b_tau_tau_hadronic 98751.5025062561
zz_b_tau_tau_hadronic 518.6491822898388
stop_b_tau_tau_hadronic 18809.981190681458
LQ_LQ_1000_b_tau_tau_hadronic 962.1404032148421
ttbar_b_tau_tau_hadronic 98299.34584999084
Tau_LQ_1000_b_tau_tau_hadronic 833.8082914948463
/home/crisfer2694/Desktop/Pheno_BSM/Leptoquarks_searches/05_refined_ML_output/MLQ_1000/b_tau_tau_hadronic/Gradient_Boosting.root
w_jets_b_tau_tau_hadronic 88197.02899169922
ww_b_tau_tau_hadronic 436.5200833082199
wz_b_tau_tau_hadronic 452.2968417108059
z_jets_b_tau_tau_hadronic 98751.50508117676
zz_b_tau_tau_hadronic 518.6491952240467
stop_b_tau_tau_hadronic 18809.980874061584
LQ_LQ_1250_b_tau_tau_hadronic 160.13214745931327
Tau_LQ_1250_b_tau_tau_hadronic 184.31315745785832
ttbar_b_tau_tau_hadronic 98299.34972190857
/home/crisfer2694/Desktop/Pheno_BSM/Leptoquarks_searches/05_refined_ML_output/MLQ_1250/b_

Info in <TCanvas::Print>: png file ../05_refined_ML_output/MLQ_1000/b_tau_tau_hadronic/Gradient_Boosting__w_jets.png has been created
Info in <TCanvas::Print>: png file ../05_refined_ML_output/MLQ_1000/b_tau_tau_hadronic/Gradient_Boosting__ww.png has been created
Info in <TCanvas::Print>: png file ../05_refined_ML_output/MLQ_1000/b_tau_tau_hadronic/Gradient_Boosting__wz.png has been created
Info in <TCanvas::Print>: png file ../05_refined_ML_output/MLQ_1000/b_tau_tau_hadronic/Gradient_Boosting__z_jets.png has been created
Info in <TCanvas::Print>: png file ../05_refined_ML_output/MLQ_1000/b_tau_tau_hadronic/Gradient_Boosting__zz.png has been created
Info in <TCanvas::Print>: png file ../05_refined_ML_output/MLQ_1000/b_tau_tau_hadronic/Gradient_Boosting__stop.png has been created
Info in <TCanvas::Print>: png file ../05_refined_ML_output/MLQ_1000/b_tau_tau_hadronic/Gradient_Boosting__LQ_LQ.png has been created
Info in <TCanvas::Print>: png file ../05_refined_ML_output/MLQ_1000/b_tau_tau