# Código de preprocesamiento de la base de datos 'Run2012C_DoubleElectron' y entrenamiento de Árboles de Decisión para la clasificación de leptones dependiendo de si son señales de interés o ruido de fondo.

### Extración de la base de datos de un archivo root

In [None]:
import numpy as np
import h5py
import uproot

treename = 'Events;1'
filename = {}
upfile = {}
params = {}

filename['2012C'] = 'Run2012C_DoubleElectron.root'

upfile['2012C'] = uproot.open(filename['2012C']) #devuelve un AwkwardArray

params['2012C'] = upfile['2012C'][treename].arrays() # devuelve un diccionario de arrays


In [378]:
print(len(params['2012C']))

32537541


### Aplicación de filtrado de características para obtener los eventos físicos provenientes de la desintegración $ H \rightarrow ZZ \rightarrow 4l $ cuyos estados finales son 'FourElectrons' y 'TwoMuonsTwoElectrons'

In [4]:
import awkward as ak

def filter_electrons(array):
    # Filtro: Al menos cuatro electrones
    array = array[array['nElectron'] == 4]

    # Filtro: Buen aislamiento (pfRelIso03_all < 0.40)
    array = array[ak.all(np.absolute(array['Electron_pfRelIso03_all']) < 0.40, axis=1)]

    # Filtro: Buena cinemática para electrones (pt > 7 y |eta| < 2.5)
    array = array[ak.all(array['Electron_pt'] > 7, axis=1)]
    array = array[ak.all(np.absolute(array['Electron_eta']) < 2.5, axis=1)]

    # Definición de nueva columna: Electron_ip3d
    array['Electron_ip3d'] = [[(a**2 + b**2)**0.5 for a, b in zip(x, y)] for x, y in zip(array['Electron_dxy'], array['Electron_dz'])]

    # Definición de nueva columna: Electron_sip3d
    # Itera simultáneamente sobre las tres listas y calcula el resultado componente a componente
    Electron_sip3d_list = []
    for ip3d, dxy_err, dz_err in zip(array['Electron_ip3d'], array['Electron_dxyErr'], array['Electron_dzErr']):
        sip3d_components = [ip3d_val / np.sqrt(dxy_err_val**2 + dz_err_val**2) for ip3d_val, dxy_err_val, dz_err_val in zip(ip3d, dxy_err, dz_err)]
        Electron_sip3d_list.append(sip3d_components)
    array['Electron_sip3d'] = Electron_sip3d_list

    # Filtro: Seguimiento cercano al vértice primario con pequeña incertidumbre
    array = array[ak.all(array['Electron_sip3d'] < 4, axis=1)]
    array = array[ak.all(np.absolute(array['Electron_dxy']) < 0.5, axis=1)]
    array = array[ak.all(np.absolute(array['Electron_dz']) < 1.0, axis=1)]

    # Filtro: Cuatro electrones. Dos positivos y dos negativos
    array = array[(ak.count(array['Electron_charge'], axis=1) == 4) &
                  (ak.sum(array['Electron_charge'] == 1, axis=1) == 2) &
                  (ak.sum(array['Electron_charge'] == -1, axis=1) == 2)]

    return array


In [5]:
array_four_e = filter_electrons(params['2012C'])

In [6]:
len(array_four_e)

49

In [7]:
def pt_cuts(mu_pt, el_pt):
    results = []
    for mu_pt_entry, el_pt_entry in zip(mu_pt, el_pt):
        if len(mu_pt_entry) >= 2 and len(el_pt_entry) >= 2:  # Verificar que haya al menos dos elementos
            mu_pt_sorted = np.sort(mu_pt_entry)[::-1]  # Reverse sort
            el_pt_sorted = np.sort(el_pt_entry)[::-1]  # Reverse sort
            if mu_pt_sorted[0] > 20 and mu_pt_sorted[1] > 10 and el_pt_sorted[0] > 20 and el_pt_sorted[1] > 10:
                results.append(True)
            else:
                results.append(False)
        else:
            results.append(False)  # Si el arreglo está vacío o tiene menos de dos elementos, no se cumplen los cortes
    return results

def dr_cuts(mu_eta, mu_phi, el_eta, el_phi):
    results = []
    for mu_eta_entry, mu_phi_entry, el_eta_entry, el_phi_entry in zip(mu_eta, mu_phi, el_eta, el_phi):
        # Verificar que hay al menos dos elementos en cada arreglo
        if len(mu_eta_entry) < 2 or len(mu_phi_entry) < 2 or len(el_eta_entry) < 2 or len(el_phi_entry) < 2:
            results.append(False)   
        else:
            # Calcular las distancias delta R para cada par de elementos
            mu_dr = np.sqrt((mu_eta_entry[0] - mu_eta_entry[1])**2 + (mu_phi_entry[0] - mu_phi_entry[1])**2)
            el_dr = np.sqrt((el_eta_entry[0] - el_eta_entry[1])**2 + (el_phi_entry[0] - el_phi_entry[1])**2)
            
            # Aplicar la condición de corte a cada par por separado
            mu_pass = mu_dr <= 0.02
            el_pass = el_dr <= 0.02
            
            # Verificar si al menos uno de los pares satisface la condición
            if mu_pass or el_pass:
                results.append(False)
            else:
                results.append(True)
    return results


def filter_array(array):
    array_filtered = array[(array['nElectron'] == 2) & (array['nMuon'] == 2)]
    array_filtered = array_filtered[(ak.all(np.absolute(array_filtered['Electron_eta']) < 2.5, axis=1) 
                                     & ak.all(np.absolute(array_filtered['Muon_eta']) < 2.4, axis=1))]    
    
    array_filtered = array_filtered[pt_cuts(array_filtered['Muon_pt'], array_filtered['Electron_pt'])]
    array_filtered = array_filtered[dr_cuts(array_filtered['Muon_eta'], array_filtered['Muon_phi'],
                                            array_filtered['Electron_eta'], array_filtered['Electron_phi'])]
    
    array_filtered = array_filtered[ak.all(array_filtered['Electron_pfRelIso03_all'] < 0.40, axis=1)]
    array_filtered = array_filtered[ak.all(array_filtered['Muon_pfRelIso04_all'] < 0.40, axis=1)]

    # Definición de nueva columna: Electron_ip3d
    array_filtered['Electron_ip3d'] = [[(a**2 + b**2)**0.5 for a, b in zip(x, y)] 
                                       for x, y in zip(array_filtered['Electron_dxy'], array_filtered['Electron_dz'])]
    Electron_sip3d_list = []
    for ip3d, dxy_err, dz_err in zip(array_filtered['Electron_ip3d'], array_filtered['Electron_dxyErr'], array_filtered['Electron_dzErr']):
        sip3d_components = [ip3d_val / np.sqrt(dxy_err_val**2 + dz_err_val**2) 
                            for ip3d_val, dxy_err_val, dz_err_val in zip(ip3d, dxy_err, dz_err)]
        Electron_sip3d_list.append(sip3d_components)
    array_filtered['Electron_sip3d'] = Electron_sip3d_list
    
    # Definición de nueva columna: Muon_ip3d
    array_filtered['Muon_ip3d'] = [[(a**2 + b**2)**0.5 for a, b in zip(x, y)] 
                                   for x, y in zip(array_filtered['Muon_dxy'], array_filtered['Muon_dz'])]
    Muon_sip3d_list = []
    for ip3d, dxy_err, dz_err in zip(array_filtered['Muon_ip3d'], array_filtered['Muon_dxyErr'], array_filtered['Muon_dzErr']):
        sip3d_components = [ip3d_val / np.sqrt(dxy_err_val**2 + dz_err_val**2) 
                            for ip3d_val, dxy_err_val, dz_err_val in zip(ip3d, dxy_err, dz_err)]
        Muon_sip3d_list.append(sip3d_components)
    array_filtered['Muon_sip3d'] = Muon_sip3d_list
    
    array_filtered = array_filtered[ak.all(array_filtered['Electron_sip3d'] < 4, axis=1)]
    array_filtered = array_filtered[ak.all(array_filtered['Muon_sip3d'] < 4, axis=1)]
    array_filtered = array_filtered[ak.all(np.absolute(array_filtered['Electron_dxy']) < 0.5, axis=1)]
    array_filtered = array_filtered[ak.all(np.absolute(array_filtered['Electron_dz']) < 1.0, axis=1)]
    array_filtered = array_filtered[ak.all(np.absolute(array_filtered['Muon_dxy']) < 0.5, axis=1)]
    array_filtered = array_filtered[ak.all(np.absolute(array_filtered['Muon_dz']) < 1.0, axis=1)]

    condicion = []
    for val_e, val_m in zip(array_filtered['Electron_charge'], array_filtered['Muon_charge']):
        charge_e = 0
        charge_m = 0
        for a in val_e:
            charge_e += a
        for b in val_m:
            charge_m += b
        if charge_e == 0 and charge_m == 0:
            condicion.append(True)
        else:
            condicion.append(False)   
    array_filtered = array_filtered[condicion]
    return array_filtered


In [8]:
array_twotwo = filter_array(params['2012C'])

In [9]:
len(array_twotwo)

74

In [10]:
from scipy.spatial.distance import pdist, squareform
from itertools import combinations

#Reconstrucción de los dos candidatos a bosón Z a partir de los 4 leptones del mismo tipo
def reconstruct_samekind(array):    
    z_mass = 91.2
    idx = np.zeros((2, 2), dtype=int)
    #Genera todas los pares de combinaciones posibles con el número de electrones que hay que serán 4
    pairs = list(combinations(range(len(array['Electron_pt'][0])), 2)) 
    z_idx = [] #lista donde guardar los índices de cada fila para crear una nueva columna en el dataframe
    for pt, eta, phi, mass, charge in zip(array['Electron_pt'],array['Electron_eta'],array['Electron_phi'],
                                          array['Electron_mass'],array['Electron_charge']): 
        # Encuentra el primer par de leptones con la masa invariante más cercana a la masa del bosón Z
        best_mass = -1
        for i1, i2 in pairs:
            if charge[i1] != charge[i2]:
               # Calcular componentes de energía y momento lineal para cada lepton
                energy1 = np.sqrt((pt[i1]*np.cosh(eta[i1]))**2 + mass[i1]**2)
                energy2 = np.sqrt((pt[i2]*np.cosh(eta[i2]))**2 + mass[i2]**2)
                px1 = pt[i1] * np.cos(phi[i1])
                py1 = pt[i1] * np.sin(phi[i1])
                pz1 = pt[i1] * np.sinh(eta[i1])
                px2 = pt[i2] * np.cos(phi[i2])
                py2 = pt[i2] * np.sin(phi[i2])
                pz2 = pt[i2] * np.sinh(eta[i2])

                # Calcular la masa invariante
                this_mass = np.sqrt((energy1 + energy2)**2 - (px1 + px2)**2 - (py1 + py2)**2 - (pz1 + pz2)**2)
                if np.abs(z_mass - this_mass) < np.abs(z_mass - best_mass):
                    best_mass = this_mass
                    best_i1, best_i2 = i1, i2
        
        idx[0] = [best_i1, best_i2]
        # Reconstrucción del segundo bosón Z a partir del par de leptones restantes
        remaining_indices = [i for i in range(4) if i != best_i1 and i != best_i2]
        idx[1] = remaining_indices
        z_idx.append(idx.copy())
    return z_idx

In [11]:
z_idx = reconstruct_samekind(array_four_e)

In [12]:
# Función para calcular la energía y el momento de una partícula a partir de sus componentes
def calcular_energia_momento(pt, eta, phi, mass):
    px = pt * np.cos(phi)
    py = pt * np.sin(phi)
    pz = pt * np.sinh(eta)
    energy = np.sqrt(px**2 + py**2 + pz**2 + mass**2)
    return energy, px, py, pz

# Función para crear los cuadrivectores a partir de las componentes de energía y momento
def crear_cuadrivector(energy, px, py, pz):
    return np.array([energy, px, py, pz])

def calcular_masa_invariante(cuadrivector):
    energy = cuadrivector[0]
    momentum_modulus_squared = np.sum(cuadrivector[1:]**2)
    invariant_mass_squared = energy**2 - momentum_modulus_squared
    invariant_mass = np.sqrt(invariant_mass_squared)
    return invariant_mass

# Función principal para calcular los cuadrivectores de las partículas
def z_fourvectors_samekind(z_idx, array):
    lista_z_fourvecs = []
    z_mass = 91.2
    for fila in range(len(z_idx)):
        pt = array['Electron_pt'][fila]
        eta = array['Electron_eta'][fila]
        phi = array['Electron_phi'][fila]
        mass = array['Electron_mass'][fila]
        z_fourvecs = np.zeros((2, 4))  # Crear arreglo para almacenar cuadrivectores
        for i in range(2):
            i1 = z_idx[fila][i][0]
            i2 = z_idx[fila][i][1]
            energy1, px1, py1, pz1 = calcular_energia_momento(pt[i1], eta[i1], phi[i1], mass[i1])
            energy2, px2, py2, pz2 = calcular_energia_momento(pt[i2], eta[i2], phi[i2], mass[i2])
            cuadrivector = crear_cuadrivector(energy1 + energy2, px1 + px2, py1 + py2, pz1 + pz2)
            z_fourvecs[i] = cuadrivector

        # Ordenar cuadrivectores según proximidad a la masa del bosón Z
        if abs(calcular_masa_invariante(z_fourvecs[0]) - z_mass) < abs(calcular_masa_invariante(z_fourvecs[1]) - z_mass):
            lista_z_fourvecs.append(z_fourvecs)
        else:
            z_fourvecs = z_fourvecs[::-1]
            lista_z_fourvecs.append(z_fourvecs)

    return lista_z_fourvecs

In [13]:
lista_z_fourvecs = z_fourvectors_samekind(z_idx, array_four_e)

In [14]:
# Función para calcular la energía y el momento de una partícula a partir de sus componentes
def calcular_energia_momento(pt, eta, phi, mass):
    px = pt * np.cos(phi)
    py = pt * np.sin(phi)
    pz = pt * np.sinh(eta)
    energy = np.sqrt(px**2 + py**2 + pz**2 + mass**2)
    return energy, px, py, pz

# Función para crear los cuadrivectores a partir de las componentes de energía y momento
def crear_cuadrivector(energy, px, py, pz):
    return np.array([energy, px, py, pz])

def calcular_masa_invariante(cuadrivector):
    energy = cuadrivector[0]
    momentum_modulus_squared = np.sum(cuadrivector[1:]**2)
    invariant_mass_squared = energy**2 - momentum_modulus_squared
    invariant_mass = np.sqrt(invariant_mass_squared)
    return invariant_mass

def z_fourvectors_2el2mu(array):
    lista_z_fourvecs = [] #lista para meter cada cuadruvetcor de cada fila. Habrá dos por fila por cada par de electrones
    z_mass = 91.2
    for fila in range(0,len(array)):
        el_pt = array['Electron_pt'][fila]   
        el_eta = array['Electron_eta'][fila]   
        el_phi = array['Electron_phi'][fila]   
        el_mass = array['Electron_mass'][fila]   
        mu_pt = array['Muon_pt'][fila]   
        mu_eta = array['Muon_eta'][fila]   
        mu_phi = array['Muon_phi'][fila]   
        mu_mass = array['Muon_mass'][fila]  
        z_fourvecs = np.zeros((2,), dtype=object)
        i1 = 0 #índice primer electrón
        i2 = 1 #índice segundo electrón
        i3 = 0 #índice primer muón
        i4 = 1 #índice segundo muón
        energy_e1, px_e1, py_e1, pz_e1 = calcular_energia_momento(el_pt[i1], el_eta[i1], el_phi[i1], el_mass[i1]) 
        energy_e2, px_e2, py_e2, pz_e2 = calcular_energia_momento(el_pt[i2], el_eta[i2], el_phi[i2], el_mass[i2])
        energy_m1, px_m1, py_m1, pz_m1 = calcular_energia_momento(mu_pt[i3], mu_eta[i3], mu_phi[i3], mu_mass[i3])
        energy_m2, px_m2, py_m2, pz_m2 = calcular_energia_momento(mu_pt[i4], mu_eta[i4], mu_phi[i4], mu_mass[i4])
    
        p1 = crear_cuadrivector(energy_e1, px_e1, py_e1, pz_e1)
        p2 = crear_cuadrivector(energy_e2, px_e2, py_e2, pz_e2)
        p3 = crear_cuadrivector(energy_m1, px_m1, py_m1, pz_m1)
        p4 = crear_cuadrivector(energy_m2, px_m2, py_m2, pz_m2)
        
        z_fourvecs = [p1 + p2, p3 + p4]
    
        if abs(calcular_masa_invariante(z_fourvecs[0]) - z_mass) < abs(calcular_masa_invariante(z_fourvecs[1]) - z_mass):
            lista_z_fourvecs.append(z_fourvecs)
        else:
            z_fourvecs = z_fourvecs[::-1]
            lista_z_fourvecs.append(z_fourvecs)
    return lista_z_fourvecs

In [15]:
z_fourvectors_2el2mu(array_twotwo)

[[array([108.36318375,   3.4246363 ,  60.97706795,  13.0512085 ]),
  array([ 40.44518729,  -2.72546488, -32.82552624, -23.45178795])],
 [array([169.10444747, -35.9159317 , -41.90971947, 111.03004837]),
  array([221.80713034,  33.50637817,   2.87303162, 117.90025139])],
 [array([239.5325309 , 111.35190964, 101.17073441, 165.13313675]),
  array([152.62833087, -77.48020744, -65.12147141, -97.03979111])],
 [array([ 41.12975605,   1.31774786, -11.11233425, -14.88933778]),
  array([230.00702284,   8.51214707, -28.39655113, 137.19937897])],
 [array([118.15534522,  61.12328148, -22.87919044,  34.23385239]),
  array([125.50988892, -29.41338444,  67.27787018, -50.50575733])],
 [array([ 419.98440737,   99.76788235,   95.39460373, -366.21911621]),
  array([ 64.94620249, -44.36943245, -29.75972462, -33.67890167])],
 [array([246.7610134 , -23.44664669,  61.28129387, 230.87184143]),
  array([ 143.95972044,  -41.69729614, -117.57784271,  -71.82531548])],
 [array([215.10065306, -94.29147148,   5.646743

In [16]:
lista_z_fourvecs2 = z_fourvectors_2el2mu(array_twotwo)

In [17]:
def filter_deltar(z_idx, array):
    filtro = []
    for fila in range(0, len(z_idx)):
        eta = array['Electron_eta'][fila]
        phi = array['Electron_phi'][fila]
        found_pair = False  # Variable para rastrear si se encontró un par que cumple con la condición
        for i in range(2):
            i1 = z_idx[fila][i][0]
            i2 = z_idx[fila][i][1]
            dr = np.sqrt((eta[i1] - eta[i2])**2 + (phi[i1] - phi[i2])**2)
            if dr < 0.02 :    
                found_pair = True
        if found_pair:
            filtro.append(False)
        else:
            filtro.append(True)
    return filtro

In [18]:
filtro = filter_deltar(z_idx, array_four_e)

In [19]:
# Aplico esas funciones a mis arrays
array_four_e['z_fourvecs'] = lista_z_fourvecs
array_four_e = array_four_e[filtro]

In [20]:
len(array_four_e)

45

In [21]:
array_twotwo['z_fourvecs'] = lista_z_fourvecs2

In [22]:
len(array_twotwo)

74

In [23]:
def calcular_masa_invariante(cuadrivector):
    energy = cuadrivector[0]
    momentum_modulus_squared = np.sum(cuadrivector[1:]**2)
    invariant_mass_squared = energy**2 - momentum_modulus_squared
    invariant_mass = np.sqrt(invariant_mass_squared)
    return invariant_mass

# Aplicar corte en las masas reconstruidas de Z
def cut(array):
    cut = []
    for i in range(0, len(array)):
        if (calcular_masa_invariante(array['z_fourvecs'][i][0]) > 40) and (calcular_masa_invariante(array['z_fourvecs'][i][0]) < 120) and (calcular_masa_invariante(array['z_fourvecs'][i][1])> 12) and (calcular_masa_invariante(array['z_fourvecs'][i][1]) < 120):
            cut.append(True)
        else:
            cut.append(False)
    return cut

In [24]:
cut_four_e = cut(array_four_e)

In [25]:
prueba1 = array_four_e[cut_four_e]

In [26]:
len(prueba1)

25

In [27]:
cut_twotwo = cut(array_twotwo)

In [28]:
prueba2 = array_twotwo[cut_twotwo]

In [29]:
len(prueba2)

44

In [30]:
len(prueba1.fields)

35

In [31]:
len(prueba2.fields)

37

In [32]:
# Campos que quiero conservar
campos = prueba1.fields[0:32]
array1 = prueba1[campos]
array2 = prueba2[campos]

In [33]:
len(array1.fields)

32

In [34]:
len(array2.fields)

32

# Modelo de IA

## Transformo cada array a un dataframe

In [35]:
import pandas as pd
records_list = []
for i in range(0, len(array1)):
    records_list.append(array1[i])

# Obtener el DataFrame de Pandas
df_F_e = pd.DataFrame(records_list)

#Esto ha creado una sola columna en el dataframe con cada array Record.

In [36]:
len(df_F_e)

25

In [37]:
fields = []
for campo in df_F_e.iloc[0][0].fields:
    fields.append(campo)
print(fields)

['run', 'luminosityBlock', 'event', 'PV_npvs', 'PV_x', 'PV_y', 'PV_z', 'nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_pfRelIso03_all', 'Muon_pfRelIso04_all', 'Muon_dxy', 'Muon_dxyErr', 'Muon_dz', 'Muon_dzErr', 'nElectron', 'Electron_pt', 'Electron_eta', 'Electron_phi', 'Electron_mass', 'Electron_charge', 'Electron_pfRelIso03_all', 'Electron_dxy', 'Electron_dxyErr', 'Electron_dz', 'Electron_dzErr', 'MET_pt', 'MET_phi']


In [38]:
lista_filas = [] #para cada evento, los diccionarios van en componentes distintas de la lista
for i in range(0, len(df_F_e)):
    values = {} #guardo en forma de diccionario cada campo con su valor
    for campo in fields:
        valor_campo = df_F_e.iloc[i][0][campo]
        values[campo]=valor_campo
    lista_filas.append(values)
df_F_e = pd.DataFrame(lista_filas)

In [39]:
df_F_e

Unnamed: 0,run,luminosityBlock,event,PV_npvs,PV_x,PV_y,PV_z,nMuon,Muon_pt,Muon_eta,...,Electron_phi,Electron_mass,Electron_charge,Electron_pfRelIso03_all,Electron_dxy,Electron_dxyErr,Electron_dz,Electron_dzErr,MET_pt,MET_phi
0,199569,90,111437792,19,0.072789,0.05694,-5.848635,0,[],[],...,"[0.96156156, 1.8035637, 2.663666, -1.0629964]","[0.011734562, 0.007448883, -0.0039281095, 0.00...","[-1, 1, 1, -1]","[0.12949017, 0.08337985, 0.07604551, 0.03476996]","[-0.0015448913, -0.0033373355, 0.0037292824, -...","[0.0020910976, 0.0032436887, 0.0034847362, 0.0...","[0.0016729842, 0.0010503718, -0.0019625416, -0...","[0.0027115957, 0.0017110553, 0.0033536742, 0.0...",26.754982,-0.765865
1,198485,164,174781164,19,0.071224,0.063993,1.627592,0,[],[],...,"[1.9966652, 1.6679968, 1.9920027, -1.6065717]","[-0.001556001, -0.027434915, -0.011324021, 0.0...","[1, 1, -1, -1]","[0.07580596, 0.2913002, 0.010462297, 0.23648414]","[-0.010870595, -0.0010292383, 0.008548134, 0.0...","[0.0064725457, 0.0029752105, 0.003899235, 0.01...","[0.0073316013, -0.0015844578, 0.0014440937, -0...","[0.005192332, 0.008086061, 0.0047288244, 0.009...",39.329945,-0.408865
2,200991,63,98100215,20,0.070069,0.064844,4.01304,0,[],[],...,"[2.962019, -1.1258898, -0.59245396, 2.366343]","[0.015610503, -0.010785984, -0.008179897, 0.03...","[1, -1, 1, -1]","[0.029544499, 0.24546444, 0.09472476, 0.017060...","[0.0011224358, 0.00013478998, -0.0029456594, 0...","[0.0037127524, 0.0034437005, 0.0030963647, 0.0...","[-0.0007582277, -0.004073264, -0.0006949953, 0...","[0.0021406028, 0.004801067, 0.001996092, 0.004...",13.257597,1.632409
3,202016,173,191382222,17,0.068171,0.06373,-2.194838,0,[],[],...,"[-0.5493524, -1.8318202, -2.6454604, 0.9509678]","[-0.006848459, -0.07272212, -0.018416516, -0.0...","[1, 1, -1, -1]","[0.20863907, 0.09663167, 0.02042871, 0.017112002]","[0.0016890017, 0.00015105699, 0.004154351, -0....","[0.0035683562, 0.0033460893, 0.004749284, 0.00...","[-0.0004386099, -0.0049993764, -0.013777835, -...","[0.002464336, 0.004241985, 0.0036847752, 0.001...",20.016258,-0.630477
4,202299,304,421267699,16,0.067006,0.066837,-5.399827,0,[],[],...,"[2.7275105, -2.8577743, -0.69263506, -0.23799235]","[0.018055923, 0.006589511, -0.006802098, 0.011...","[-1, 1, 1, -1]","[0.12592168, 0.13950656, 0.033174485, 0.09225648]","[-0.000945105, 0.0022477407, 0.0017914214, 0.0...","[0.0030123924, 0.0047502513, 0.0029534218, 0.0...","[-0.005683544, -0.0029475142, -0.00069543015, ...","[0.004509489, 0.0051081534, 0.0024546776, 0.00...",26.170612,0.563151
5,199752,185,232018566,15,0.068317,0.058723,-0.647474,0,[],[],...,"[2.362671, 0.522768, 0.35006812, 3.092115]","[0.009236022, 0.021395657, 0.021423178, -0.040...","[-1, -1, 1, 1]","[0.023637341, 0.017293852, 0.08889687, 0.09120...","[0.00030241068, -0.000556906, -0.009526097, -0...","[0.0017024195, 0.0011160973, 0.009064844, 0.00...","[-0.0013319541, 0.0016889393, -0.0173012, -0.0...","[0.0019042864, 0.001877018, 0.016339207, 0.005...",26.009043,-2.029784
6,198271,181,232701194,19,0.073388,0.058999,-4.442595,0,[],[],...,"[-3.04194, -0.9165002, 1.8280592, -0.1549728]","[-0.012981186, -0.0038572242, 0.0041090366, -0...","[-1, 1, -1, 1]","[0.31222993, 0.14818723, 0.092487894, 0.03989743]","[-0.0059970305, 0.003746134, -0.0051013245, 0....","[0.0033614219, 0.0029056193, 0.0029958927, 0.0...","[-0.0005580287, 0.00074859476, 0.0005526357, -...","[0.0021534073, 0.004257845, 0.0036210786, 0.00...",8.041843,3.058743
7,201727,83,43697222,28,0.071895,0.065388,-2.626129,0,[],[],...,"[2.2064042, 3.0056956, -0.109743804, -1.3270572]","[0.026867729, -0.024600595, 0.034623925, 0.033...","[-1, 1, 1, -1]","[0.14243177, 0.02956421, 0.057202604, 0.06375828]","[-0.0011487455, 0.0013219655, 0.004112672, 0.0...","[0.0023638287, 0.0022839992, 0.004743793, 0.00...","[-0.0003336253, 0.00035378238, 0.008607806, -0...","[0.0026682334, 0.003032682, 0.0036503796, 0.00...",9.244429,1.56781
8,202016,505,559670208,9,0.072179,0.063655,-5.42869,0,[],[],...,"[-2.2693105, -1.6489106, 1.5470772, -0.1387016]","[0.0025421015, -0.09956186, -0.008273011, 0.01...","[-1, -1, 1, 1]","[0.0787406, 0.03108487, 0.03385407, 0.010874049]","[7.6891185e-05, 0.0002609892, 0.0020354565, -0...","[0.005813858, 0.0045771324, 0.0012113326, 0.01...","[0.00072367245, -0.001670503, 0.0020320895, 0....","[0.003799094, 0.0046813837, 0.0034233038, 0.01...",12.871797,-0.694919
9,199754,933,873005082,6,0.074131,0.062614,0.655014,0,[],[],...,"[-2.560593, 2.7572381, -1.4300259, 2.3567648]","[-0.008313646, -0.0029151845, 0.047894314, -0....","[1, 1, -1, -1]","[0.030325737, 0.033246607, 0.034936085, 0.0314...","[0.003248945, 0.0006134446, -0.00023499104, -0...","[0.003275009, 0.0027124432, 0.0036452336, 0.00...","[-0.0007522463, 0.000975339, -0.0016014628, -0...","[0.0030698609, 0.0024401208, 0.004395149, 0.00...",22.634716,0.480767


In [40]:
import pandas as pd
records_list = []
for i in range(0, len(array2)):
    records_list.append(array2[i])

# Obtener el DataFrame de Pandas
df_twotwo = pd.DataFrame(records_list)

In [41]:
len(df_twotwo)

44

In [42]:
lista_filas = [] #para cada evento, los diccionarios van en componentes distintas de la lista
for i in range(0, len(df_twotwo)):
    values = {} #guardo en forma de diccionario cada campo con su valor
    for campo in fields:
        valor_campo = df_twotwo.iloc[i][0][campo]
        values[campo]=valor_campo
    lista_filas.append(values)
df_twotwo = pd.DataFrame(lista_filas)

In [43]:
df_twotwo

Unnamed: 0,run,luminosityBlock,event,PV_npvs,PV_x,PV_y,PV_z,nMuon,Muon_pt,Muon_eta,...,Electron_phi,Electron_mass,Electron_charge,Electron_pfRelIso03_all,Electron_dxy,Electron_dxyErr,Electron_dz,Electron_dzErr,MET_pt,MET_phi
0,201824,229,201387532,10,0.07123,0.065763,0.241662,2,"[61.416035, 94.8288]","[1.4180712, 0.46566793]",...,"[-1.8024496, -2.72887]","[0.009410903, -0.03955649]","[1, -1]","[0.16025461, 0.023299519]","[0.002170998, -0.00170906]","[0.0024090442, 0.0047590593]","[0.0013728273, 0.0005160501]","[0.0015431739, 0.003474196]",20.880842,0.2456
1,202504,1191,1394353502,10,0.069873,0.065985,5.496422,2,"[20.24906, 81.04507]","[-0.6158583, 0.5569468]",...,"[-2.200956, 1.7666718]","[-0.006864649, 0.014243599]","[-1, 1]","[0.2689218, 0.009415728]","[-0.00044696903, 0.0011483773]","[0.004571324, 0.0010675259]","[0.0010996618, -0.0024221228]","[0.0021489884, 0.0017066037]",15.25055,1.270668
2,202237,973,1294338754,26,0.069027,0.063396,-4.145526,2,"[96.17201, 38.112934]","[1.0451275, 1.0566844]",...,"[1.9838408, -1.3968074]","[-0.013536485, 0.025465673]","[1, -1]","[0.092702, 0.1310139]","[0.0013310182, -0.00043659454]","[0.0032536734, 0.0063734828]","[-0.001251074, -0.009112619]","[0.0031144354, 0.0041969595]",30.70727,-1.457306
3,202973,839,779714361,6,0.070262,0.061147,-1.260335,2,"[46.851303, 91.316635]","[1.8607856, 2.3781846]",...,"[-2.685153, 2.5815997]","[-0.002770614, 0.016541647]","[1, -1]","[0.011016267, 0.20453137]","[0.00010899053, -0.0009430673]","[0.0030177026, 0.0016887793]","[0.0033147645, 0.0013885021]","[0.0026343178, 0.0016960044]",16.878271,-0.230254
4,200075,377,451096187,25,0.072787,0.063041,-1.004149,2,"[21.980883, 106.27521]","[-1.0905871, -0.7602692]",...,"[-1.7233747, -2.049987]","[0.03460502, 0.056507867]","[-1, 1]","[0.14359775, 0.21280064]","[-0.0050730356, 0.0030114169]","[0.0028250145, 0.0040832735]","[-0.0024024306, -0.0020519847]","[0.0073669436, 0.005048519]",26.620871,-2.009348
5,199834,655,453575003,4,0.067421,0.066282,-0.459162,2,"[14.486348, 78.47348]","[1.4511503, 1.1662259]",...,"[1.346947, -1.7775408]","[-0.0041965204, -0.032870207]","[-1, 1]","[-999.0, -999.0]","[-0.0043439893, 0.004064063]","[0.0036197272, 0.0039941785]","[0.0010041865, -0.0008349159]","[0.0029324284, 0.0038874738]",46.753452,-1.793173
6,202054,394,319813054,7,0.07414,0.062088,0.758021,2,"[50.956432, 26.73977]","[-1.6636244, -0.5267443]",...,"[1.0148101, -1.6047208]","[-0.04205323, -0.01037994]","[1, -1]","[-999.0, 0.04035059]","[-0.015638506, 0.024262665]","[0.009277824, 0.013018387]","[-0.07029707, 0.0016132771]","[0.020361451, 0.006489364]",15.272008,-0.123964
7,198063,241,167209826,9,0.075359,0.063571,-7.735391,2,"[80.36255, 32.31814]","[1.580275, 1.0471685]",...,"[2.1010349, -0.9034737]","[-0.048710514, -0.009869043]","[1, -1]","[0.05735381, -999.0]","[-0.001542772, -0.0035114519]","[0.0027428176, 0.006699236]","[-0.0026041379, 0.012483935]","[0.003145845, 0.005346726]",22.886925,-1.110411
8,203002,469,630169854,13,0.072415,0.062341,0.872079,2,"[60.373962, 30.462318]","[-1.6371706, -0.47975442]",...,"[-1.2987359, 2.960809]","[-0.011964623, -0.024189595]","[-1, 1]","[0.12001982, 0.14256804]","[-0.0013662032, 0.004244038]","[0.005234805, 0.0016130313]","[-0.0051145265, -0.0027803078]","[0.0043687215, 0.0020079145]",58.059853,-0.659696
9,202299,449,627007942,10,0.072742,0.067406,-2.138841,2,"[26.889238, 12.578334]","[-2.0818765, -2.3475618]",...,"[1.1890678, -2.4629617]","[-0.0071562794, 0.00548877]","[-1, 1]","[0.0545241, 0.34863025]","[0.0032784368, 0.003944135]","[0.0023382718, 0.0058323992]","[-0.0024082367, -0.003180158]","[0.0012402851, 0.00482027]",12.418715,-1.421616


In [44]:
df_F_e.to_csv('FourElectrons.csv',index=False)

In [45]:
df_twotwo.to_csv('TwoMuonsTwoElectrons.csv', index=False)

## Selecciono datos que no pertenecen a estos grupos de forma aleatoria y etiqueto los datos

### Primero elijo eventos de fondo de forma aletaoria para el caso de FourElectrons

In [237]:
import random
random.seed(7)
# Elegir de forma aleatoria 100 índices únicos dentro del rango de la longitud del array awkward
indices_aleatorios = random.sample(range(len(params['2012C'])), 100)

# Obtener los 76 arrays record correspondientes a los índices aleatorios
arrays_aleatorios1 = [params['2012C'][idx] for idx in indices_aleatorios]

In [238]:
len(arrays_aleatorios1)

100

In [245]:
import pandas as pd
records_list = []
for i in range(0, len(arrays_aleatorios1)):
    records_list.append(arrays_aleatorios1[i])

# Obtener el DataFrame de Pandas
df_fondo1 = pd.DataFrame(records_list)

In [246]:
lista_filas = [] 
for i in range(0, len(df_fondo1)):
    values = {} #guardo en forma de diccionario cada campo con su valor
    for campo in fields:
        valor_campo = df_fondo1.iloc[i][0][campo]
        values[campo]=valor_campo
    lista_filas.append(values)
df_fondo1 = pd.DataFrame(lista_filas)

### Ahora elijo eventos de fondo de forma aleatoria para el caso TwoMuonsTwoElectrons

In [235]:
import random
random.seed(5)
# Elegir de forma aleatoria 100 índices únicos dentro del rango de la longitud del array awkward
indices_aleatorios = random.sample(range(len(params['2012C'])), 100)

# Obtener los 56 arrays record correspondientes a los índices aleatorios
arrays_aleatorios2 = [params['2012C'][idx] for idx in indices_aleatorios]

In [236]:
len(arrays_aleatorios2)

100

In [243]:
import pandas as pd
records_list = []
for i in range(0, len(arrays_aleatorios2)):
    records_list.append(arrays_aleatorios2[i])

# Obtener el DataFrame de Pandas
df_fondo2 = pd.DataFrame(records_list)

In [244]:
lista_filas = [] 
for i in range(0, len(df_fondo2)):
    values = {} #guardo en forma de diccionario cada campo con su valor
    for campo in fields:
        valor_campo = df_fondo2.iloc[i][0][campo]
        values[campo]=valor_campo
    lista_filas.append(values)
df_fondo2 = pd.DataFrame(lista_filas)

### Ahora uno df_F_e con df_fondo1 y df_twotwo con df_fondo2 etiquetando con 0 a los eventos de señal y con 1 a los de fondo

In [204]:
#Etiqueto ambos df: 0 para Higgs y 1 para fondo y lo guardo todo en un mismo df
def unir_dataframes(df_Higgs, df_fondo):
    # Agregar la columna 'origin' a cada DataFrame con el valor correspondiente
    df_Higgs['origin'] = 0
    df_fondo['origin'] = 1
    
    # Unir los DataFrames en uno solo
    df = pd.concat([df_Higgs, df_fondo], ignore_index=True)
    
    return df

In [205]:
df1 = unir_dataframes(df_F_e, df_fondo1)

df2 = unir_dataframes(df_twotwo, df_fondo2)

In [207]:
df1.to_csv('FourElectrons_con_ruido.csv', index=False)
df2.to_csv('TwoMuonsTwoElectrons_con_ruido.csv', index=False)

## Compruebo cuál es el mejor modelo de machine learning para cada caso usando RapidMiner Studio y entreno el modelo seleccionado (Árbol de Decisión)

### Primer intento (fallido): Entreno el modelo mediante dataframes con tantas columnas como el número máximo de leptones y muones registrados en todas las filas. Se evita así trabajar con arrays.

¡Predicciones poco eficientes en otra base de datos!

### Caso FourElectrons

In [208]:
columns_to_process = ['Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_dxy', 'Muon_dxyErr', 'Muon_dz', 'Muon_dzErr', 'Muon_pfRelIso03_all', 'Muon_pfRelIso04_all', 'Electron_pt', 'Electron_eta', 'Electron_phi', 'Electron_mass', 'Electron_charge', 'Electron_dxy', 'Electron_dxyErr', 'Electron_dz', 'Electron_dzErr', 'Electron_pfRelIso03_all', 'MET_pt', 'MET_phi']

for index, row in df1.iterrows():
    if row['nMuon'] > 0:
        for i in range(1, row['nMuon'] + 1):
            for column in columns_to_process:
                if column.startswith('Muon'):
                    new_column_name = f"{column}_{i}"
                    values = row[column][i-1]  # Acceder al i-ésimo elemento de la lista
                    df1.at[index, new_column_name] = float(values)
    if row['nElectron'] > 0:
        for i in range(1, row['nElectron'] + 1):
            for column in columns_to_process:
                if column.startswith('Electron'):
                    new_column_name = f"{column}_{i}"
                    values = row[column][i-1]  # Acceder al i-ésimo elemento de la lista
                    df1.at[index, new_column_name] = float(values)

# Eliminar las columnas originales que ya han sido divididas
df1.drop(columns=columns_to_process, inplace=True)

  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_column_name] = float(values)
  df1.at[index, new_

In [209]:
#Los valores faltantes son transformados a 0. No se pueden eliminar porque todas las filas tienen algún valor faltante.
df1 = df1.fillna(0)

In [211]:
# Se deben elegir unas columnas en concreto para que al aplicar el modelo entrenado al nuevo dataframe, este cuente con las mismas. 
# Esto limita mucho la capacidad de aprendizaje.
X_columns = ['run', 'luminosityBlock', 'event', 'PV_npvs', 'PV_x', 'PV_y', 'PV_z',
                       'nMuon', 'nElectron','Electron_pt_1', 'Electron_eta_1',
                       'Electron_phi_1', 'Electron_mass_1', 'Electron_charge_1',
                       'Electron_dxy_1', 'Electron_dxyErr_1', 'Electron_dz_1',
                       'Electron_dzErr_1', 'Electron_pfRelIso03_all_1', 'Electron_pt_2',
                       'Electron_eta_2', 'Electron_phi_2', 'Electron_mass_2',
                       'Electron_charge_2', 'Electron_dxy_2', 'Electron_dxyErr_2',
                       'Electron_dz_2', 'Electron_dzErr_2', 'Electron_pfRelIso03_all_2',
                       'Electron_pt_3', 'Electron_eta_3', 'Electron_phi_3', 'Electron_mass_3',
                       'Electron_charge_3', 'Electron_dxy_3', 'Electron_dxyErr_3',
                       'Electron_dz_3', 'Electron_dzErr_3', 'Electron_pfRelIso03_all_3',
                       'Electron_pt_4', 'Electron_eta_4', 'Electron_phi_4', 'Electron_mass_4',
                       'Electron_charge_4', 'Electron_dxy_4', 'Electron_dxyErr_4',
                       'Electron_dz_4', 'Electron_dzErr_4', 'Electron_pfRelIso03_all_4',
                       'Muon_pt_1', 'Muon_eta_1', 'Muon_phi_1', 'Muon_mass_1', 'Muon_charge_1',
                       'Muon_dxy_1', 'Muon_dxyErr_1', 'Muon_dz_1', 'Muon_dzErr_1',
                       'Muon_pfRelIso03_all_1', 'Muon_pfRelIso04_all_1', 'Muon_pt_2',
                       'Muon_eta_2', 'Muon_phi_2', 'Muon_mass_2', 'Muon_charge_2',
                       'Muon_dxy_2', 'Muon_dxyErr_2', 'Muon_dz_2', 'Muon_dzErr_2',
                       'Muon_pfRelIso03_all_2', 'Muon_pfRelIso04_all_2', 'Muon_pt_3',
                       'Muon_eta_3', 'Muon_phi_3', 'Muon_mass_3', 'Muon_charge_3',
                       'Muon_dxy_3', 'Muon_dxyErr_3', 'Muon_dz_3', 'Muon_dzErr_3',
                       'Muon_pfRelIso03_all_3', 'Muon_pfRelIso04_all_3', 'Muon_pt_4',
                       'Muon_eta_4', 'Muon_phi_4', 'Muon_mass_4', 'Muon_charge_4',
                       'Muon_dxy_4', 'Muon_dxyErr_4', 'Muon_dz_4', 'Muon_dzErr_4',
                       'Muon_pfRelIso03_all_4', 'Muon_pfRelIso04_all_4']


In [214]:
len(df1.keys())

137

In [215]:
len(X_columns)

93

In [216]:
# Definir las características (X) y las etiquetas (y)

# Usar todas las columnas excepto 'Origin' como características
X = df1[X_columns].values

# Usar la columna 'Origin' como la clase original (etiquetas)
y = df1['origin'].values

In [217]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
import graphviz
from sklearn.metrics import classification_report, confusion_matrix
import joblib


# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el clasificador de árbol de decisión
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X_train, y_train)
# Guardar el modelo entrenado en un archivo
joblib.dump(tree_clf, 'modelo_arbol_decision.pkl')

# Predicciones en el conjunto de prueba
y_pred = tree_clf.predict(X_test)

# Métricas de evaluación
print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))

Matriz de Confusión:
[[  3   0]
 [  0 202]]

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00       202

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205



### Caso TwoMuonsTwoElectrons

In [218]:
columns_to_process = ['Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_dxy', 'Muon_dxyErr', 'Muon_dz', 'Muon_dzErr', 'Muon_pfRelIso03_all', 'Muon_pfRelIso04_all', 'Electron_pt', 'Electron_eta', 'Electron_phi', 'Electron_mass', 'Electron_charge', 'Electron_dxy', 'Electron_dxyErr', 'Electron_dz', 'Electron_dzErr', 'Electron_pfRelIso03_all', 'MET_pt', 'MET_phi']

for index, row in df2.iterrows():
    if row['nMuon'] > 0:
        for i in range(1, row['nMuon'] + 1):
            for column in columns_to_process:
                if column.startswith('Muon'):
                    new_column_name = f"{column}_{i}"
                    values = row[column][i-1]  # Acceder al i-ésimo elemento de la lista
                    df2.at[index, new_column_name] = float(values)
    if row['nElectron'] > 0:
        for i in range(1, row['nElectron'] + 1):
            for column in columns_to_process:
                if column.startswith('Electron'):
                    new_column_name = f"{column}_{i}"
                    values = row[column][i-1]  # Acceder al i-ésimo elemento de la lista
                    df2.at[index, new_column_name] = float(values)

# Eliminar las columnas originales que ya han sido divididas
df2.drop(columns=columns_to_process, inplace=True)

In [219]:
#Los valores faltantes los transformo a 0. No puedo eliminarlos porque todas las filas tienen
df2 = df2.fillna(0)

In [220]:
len(df2.keys())

94

In [221]:
X_columns2 = ['run', 'luminosityBlock', 'event', 'PV_npvs', 'PV_x', 'PV_y', 'PV_z',
           'nMuon', 'nElectron', 'Muon_pt_1', 'Muon_eta_1', 'Muon_phi_1',
           'Muon_mass_1', 'Muon_charge_1', 'Muon_dxy_1', 'Muon_dxyErr_1',
           'Muon_dz_1', 'Muon_dzErr_1', 'Muon_pfRelIso03_all_1',
           'Muon_pfRelIso04_all_1', 'Muon_pt_2', 'Muon_eta_2', 'Muon_phi_2',
           'Muon_mass_2', 'Muon_charge_2', 'Muon_dxy_2', 'Muon_dxyErr_2',
           'Muon_dz_2', 'Muon_dzErr_2', 'Muon_pfRelIso03_all_2',
           'Muon_pfRelIso04_all_2', 
            'Electron_pt_1', 'Electron_eta_1',
           'Electron_phi_1', 'Electron_mass_1', 'Electron_charge_1',
           'Electron_dxy_1', 'Electron_dxyErr_1', 'Electron_dz_1',
           'Electron_dzErr_1', 'Electron_pfRelIso03_all_1', 'Electron_pt_2',
           'Electron_eta_2', 'Electron_phi_2', 'Electron_mass_2',
           'Electron_charge_2', 'Electron_dxy_2', 'Electron_dxyErr_2',
           'Electron_dz_2', 'Electron_dzErr_2', 'Electron_pfRelIso03_all_2',
            'Electron_pt_3', 'Electron_eta_3', 'Electron_phi_3', 'Electron_mass_3',
            'Electron_charge_3', 'Electron_dxy_3', 'Electron_dxyErr_3',
            'Electron_dz_3', 'Electron_dzErr_3', 'Electron_pfRelIso03_all_3',
            'Electron_pt_4', 'Electron_eta_4', 'Electron_phi_4', 'Electron_mass_4',
            'Electron_charge_4', 'Electron_dxy_4', 'Electron_dxyErr_4',
            'Electron_dz_4', 'Electron_dzErr_4', 'Electron_pfRelIso03_all_4']


In [224]:
# Definir las características (X) y las etiquetas (y)

# Usar todas las columnas excepto 'E' como características
X = df2[X_columns2].values

# Usar la columna 'E' como la clase original (etiquetas)
y = df2['origin'].values

In [225]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
import graphviz
from sklearn.metrics import classification_report, confusion_matrix
import joblib


# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el clasificador de árbol de decisión
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X_train, y_train)
# Guardar el modelo entrenado en un archivo
joblib.dump(tree_clf, 'modelo_arbol_decision_2.pkl')

# Predicciones en el conjunto de prueba
y_pred = tree_clf.predict(X_test)

# Métricas de evaluación
print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))


Matriz de Confusión:
[[  8   0]
 [  0 201]]

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00       201

    accuracy                           1.00       209
   macro avg       1.00      1.00      1.00       209
weighted avg       1.00      1.00      1.00       209



### Segundo intento (fallido): ENTRENAMIENTO DEL MODELO MEDIANTE NORMALIZACIÓN DE LA LONGITUD DE LAS LISTAS.

In [247]:
def unir_dataframes(df_Higgs, df_fondo):
    # Agregar la columna 'origin' a cada DataFrame con el valor correspondiente
    df_Higgs['origin'] = 0
    df_fondo['origin'] = 1
    
    # Unir los DataFrames en uno solo
    df = pd.concat([df_Higgs, df_fondo], ignore_index=True)
    
    return df

In [248]:
df_norm1 = unir_dataframes(df_F_e, df_fondo1)

In [343]:
# Función para ajustar la longitud de las listas a 4 elementos
def ajustar_longitud(lista):
    if len(lista) >= 4:
        return lista[:4].to_list()  # Si la lista tiene más de 4 elementos, se corta a los primeros 4 elementos
    else:
        return lista.to_list() + [0] * (4 - len(lista))  # Si la lista tiene menos de 4 elementos, se agregan ceros al final

# Nombres de las columnas con listas
columnas_con_listas = ['Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_dxy', 
                       'Muon_dxyErr', 'Muon_dz', 'Muon_dzErr', 'Muon_pfRelIso03_all', 
                       'Muon_pfRelIso04_all', 'Electron_pt', 'Electron_eta', 'Electron_phi', 
                       'Electron_mass', 'Electron_charge', 'Electron_dxy', 'Electron_dxyErr', 
                       'Electron_dz', 'Electron_dzErr', 'Electron_pfRelIso03_all']

# Aplicar la función de ajuste de longitud a cada columna con listas
for columna in columnas_con_listas:
    df_norm1[columna] = df_norm1[columna].apply(ajustar_longitud)

# Ahora el dataframe tendrá las listas ajustadas a una longitud de 4 elementos

In [348]:
df_norm1['Muon_pt'][0]

[0, 0, 0, 0]

In [373]:
# Lista para almacenar las filas aplanadas
fila_aplanada = []
filas_aplanadas = []

# Aplicar la función de aplanado a cada fila del dataframe y guardar los resultados
for row in df_norm1.values:
    for item in row:
        if isinstance(item, list):
            for i in item:
             fila_aplanada.append(i)
        else:
            fila_aplanada.append(item)
    filas_aplanadas.append(fila_aplanada)
    print(filas_aplanadas)


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



KeyboardInterrupt: 

In [370]:
df_aplanado1.iloc[0][0]

  df_aplanado1.iloc[0][0]


array([199569, 90, 111437792, 19, 0.07278885692358017,
       0.056939758360385895, -5.848634719848633, 0, list([0, 0, 0, 0]),
       list([0, 0, 0, 0]), list([0, 0, 0, 0]), list([0, 0, 0, 0]),
       list([0, 0, 0, 0]), list([0, 0, 0, 0]), list([0, 0, 0, 0]),
       list([0, 0, 0, 0]), list([0, 0, 0, 0]), list([0, 0, 0, 0]),
       list([0, 0, 0, 0]), 4,
       list([31.268129348754883, 34.2927360534668, 18.056156158447266, 46.20489501953125]),
       list([-1.0150065422058105, 0.26551052927970886, -0.815973162651062, 0.4194360077381134]),
       list([0.9615615606307983, 1.8035637140274048, 2.663666009902954, -1.0629963874816895]),
       list([0.011734561994671822, 0.007448882795870304, -0.003928109537810087, 0.002506058197468519]),
       list([-1, 1, 1, -1]),
       list([0.12949016690254211, 0.0833798497915268, 0.07604551315307617, 0.03476995974779129]),
       list([-0.0015448912745341659, -0.003337335539981723, 0.0037292824126780033, -0.0007209290633909404]),
       list([0.002

In [358]:
len(df_aplanado1.iloc[0][0])

  len(df_aplanado1.iloc[0][0])


33

In [359]:
X = []
for i in range(0, len(df_aplanado)):
    X.append(df_aplanado.iloc[i][0][0:32])

  X.append(df_aplanado.iloc[i][0][0:32])


In [361]:
y = []
for i in range(0, len(df_aplanado)):
    y.append(df_aplanado.iloc[i][0][32])

  y.append(df_aplanado.iloc[i][0][32])


In [362]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el clasificador de árbol de decisión
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X_train, y_train)
# Guardar el modelo entrenado en un archivo
joblib.dump(tree_clf, 'modelo_arbol_decision3.pkl')

# Predicciones en el conjunto de prueba
y_pred = tree_clf.predict(X_test)

# Métricas de evaluación
print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))



ValueError: setting an array element with a sequence.

In [264]:
print(X[0])

[199569 90 111437792 19 0.07278885692358017 0.056939758360385895
 -5.848634719848633 0 <Array [] type='0 * float64'>
 <Array [] type='0 * float64'> <Array [] type='0 * float64'>
 <Array [] type='0 * float64'> <Array [] type='0 * int64'>
 <Array [] type='0 * float64'> <Array [] type='0 * float64'>
 <Array [] type='0 * float64'> <Array [] type='0 * float64'>
 <Array [] type='0 * float64'> <Array [] type='0 * float64'> 4
 <Array [31.3, 34.3, 18.1, 46.2] type='4 * float32'>
 <Array [-1.02, 0.266, -0.816, 0.419] type='4 * float32'>
 <Array [0.962, 1.8, 2.66, -1.06] type='4 * float32'>
 <Array [0.0117, 0.00745, -0.00393, 0.00251] type='4 * float32'>
 <Array [-1, 1, 1, -1] type='4 * int32'>
 <Array [0.129, 0.0834, 0.076, 0.0348] type='4 * float32'>
 <Array [-0.00154, -0.00334, 0.00373, -0.000721] type='4 * float32'>
 <Array [0.00209, 0.00324, 0.00348, 0.00312] type='4 * float32'>
 <Array [0.00167, 0.00105, -0.00196, -0.00197] type='4 * float32'>
 <Array [0.00271, 0.00171, 0.00335, 0.00259] ty

In [265]:
df_norm1

Unnamed: 0,run,luminosityBlock,event,PV_npvs,PV_x,PV_y,PV_z,nMuon,Muon_pt,Muon_eta,...,Electron_mass,Electron_charge,Electron_pfRelIso03_all,Electron_dxy,Electron_dxyErr,Electron_dz,Electron_dzErr,MET_pt,MET_phi,origin
0,199569,90,111437792,19,0.072789,0.056940,-5.848635,0,[],[],...,"[0.011734562, 0.007448883, -0.0039281095, 0.00...","[-1, 1, 1, -1]","[0.12949017, 0.08337985, 0.07604551, 0.03476996]","[-0.0015448913, -0.0033373355, 0.0037292824, -...","[0.0020910976, 0.0032436887, 0.0034847362, 0.0...","[0.0016729842, 0.0010503718, -0.0019625416, -0...","[0.0027115957, 0.0017110553, 0.0033536742, 0.0...",26.754982,-0.765865,0
1,198485,164,174781164,19,0.071224,0.063993,1.627592,0,[],[],...,"[-0.001556001, -0.027434915, -0.011324021, 0.0...","[1, 1, -1, -1]","[0.07580596, 0.2913002, 0.010462297, 0.23648414]","[-0.010870595, -0.0010292383, 0.008548134, 0.0...","[0.0064725457, 0.0029752105, 0.003899235, 0.01...","[0.0073316013, -0.0015844578, 0.0014440937, -0...","[0.005192332, 0.008086061, 0.0047288244, 0.009...",39.329945,-0.408865,0
2,200991,63,98100215,20,0.070069,0.064844,4.013040,0,[],[],...,"[0.015610503, -0.010785984, -0.008179897, 0.03...","[1, -1, 1, -1]","[0.029544499, 0.24546444, 0.09472476, 0.017060...","[0.0011224358, 0.00013478998, -0.0029456594, 0...","[0.0037127524, 0.0034437005, 0.0030963647, 0.0...","[-0.0007582277, -0.004073264, -0.0006949953, 0...","[0.0021406028, 0.004801067, 0.001996092, 0.004...",13.257597,1.632409,0
3,202016,173,191382222,17,0.068171,0.063730,-2.194838,0,[],[],...,"[-0.006848459, -0.07272212, -0.018416516, -0.0...","[1, 1, -1, -1]","[0.20863907, 0.09663167, 0.02042871, 0.017112002]","[0.0016890017, 0.00015105699, 0.004154351, -0....","[0.0035683562, 0.0033460893, 0.004749284, 0.00...","[-0.0004386099, -0.0049993764, -0.013777835, -...","[0.002464336, 0.004241985, 0.0036847752, 0.001...",20.016258,-0.630477,0
4,202299,304,421267699,16,0.067006,0.066837,-5.399827,0,[],[],...,"[0.018055923, 0.006589511, -0.006802098, 0.011...","[-1, 1, 1, -1]","[0.12592168, 0.13950656, 0.033174485, 0.09225648]","[-0.000945105, 0.0022477407, 0.0017914214, 0.0...","[0.0030123924, 0.0047502513, 0.0029534218, 0.0...","[-0.005683544, -0.0029475142, -0.00069543015, ...","[0.004509489, 0.0051081534, 0.0024546776, 0.00...",26.170612,0.563151,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,202075,81,127491992,14,0.067720,0.062621,1.982996,1,"[25.183866500854492, 25.183866500854492, 25.18...","[-0.5268416404724121, -0.5268416404724121, -0....",...,"[-0.0037961283, -0.0073733353, -0.0015408192, ...","[1, 1, -1, -1]","[0.6659388, 0.4619229, 1.7026792, 0.6708367]","[0.0007677617, -0.009147868, -0.0021321564, 0....","[0.0037523548, 0.0035305638, 0.013656805, 0.20...","[0.00280542, -5.816164, -5.817299, 0.8773416]","[0.0053295507, 0.004663713, 0.0049694865, 0.60...",30.459444,0.797589,1
121,201229,29,53160065,22,0.070426,0.068908,4.303934,1,"[5.109374046325684, 5.109374046325684, 5.10937...","[-0.9392731785774231, -0.9392731785774231, -0....",...,"[0.04697158560156822, 0.04697158560156822, 0.0...","[1, 1, 1]","[-999.0, -999.0, -999.0]","[0.0008445215644314885, 0.0008445215644314885,...","[0.0029827747493982315, 0.0029827747493982315,...","[5.803748354082927e-05, 5.803748354082927e-05,...","[0.006459023803472519, 0.006459023803472519, 0...",42.552536,-1.747689,1
122,200600,146,232822626,21,0.069416,0.066354,-7.433079,1,"[10.9365234375, 10.9365234375, 10.9365234375]","[1.0261904001235962, 1.0261904001235962, 1.026...",...,"[0.04455084726214409, 0.004139335826039314]","[1, 1]","[0.32095760107040405, -999.0]","[-0.0005260871257632971, 0.008726943284273148]","[0.006249656435102224, 0.0050197867676615715]","[-2.438697099685669, -2.4553022384643555]","[0.00933433510363102, 0.005163001827895641]",73.867386,-1.556057,1
123,199336,941,797385047,5,0.073056,0.058434,5.900356,0,[],[],...,"[-0.010397237725555897, -0.010397237725555897,...","[1, 1, 1]","[0.988757312297821, 0.988757312297821, 0.98875...","[0.0027649637777358294, 0.0027649637777358294,...","[0.003994310274720192, 0.003994310274720192, 0...","[-0.001313318032771349, -0.001313318032771349,...","[0.005385301541537046, 0.005385301541537046, 0...",12.117842,2.255863,1


### Tercer intento (exitoso): Entrenamiento del modelo mediante media de los datos de las listas.

### Caso FourElectrons

In [417]:
def unir_dataframes(df_Higgs, df_fondo):
    # Agregar la columna 'origin' a cada DataFrame con el valor correspondiente
    df_Higgs['origin'] = 0
    df_fondo['origin'] = 1
    
    # Unir los DataFrames en uno solo
    df = pd.concat([df_Higgs, df_fondo], ignore_index=True)
    
    return df

In [418]:
df_media1 = unir_dataframes(df_F_e, df_fondo1)

In [413]:
type(df_media1['Muon_pt'].iloc[0])

awkward.highlevel.Array

In [420]:
# Función para trasnformar los arrays en listas e introducir valores 0 a aquellos arrays vacíos.
def transformar_lista(lista):
    if len(lista) > 0:
        return lista.to_list()  
    else:
        return lista.to_list() + [0]   

# Nombres de las columnas con listas
columnas_con_listas = ['Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_dxy', 
                       'Muon_dxyErr', 'Muon_dz', 'Muon_dzErr', 'Muon_pfRelIso03_all', 
                       'Muon_pfRelIso04_all', 'Electron_pt', 'Electron_eta', 'Electron_phi', 
                       'Electron_mass', 'Electron_charge', 'Electron_dxy', 'Electron_dxyErr', 
                       'Electron_dz', 'Electron_dzErr', 'Electron_pfRelIso03_all']

# Aplicar la función de ajuste de longitud a cada columna con listas
for columna in columnas_con_listas:
    df_media1[columna] = df_media1[columna].apply(transformar_lista)


In [424]:
type(df_media1['Muon_pt'].iloc[0])

list

In [422]:
df_media1

Unnamed: 0,run,luminosityBlock,event,PV_npvs,PV_x,PV_y,PV_z,nMuon,Muon_pt,Muon_eta,...,Electron_mass,Electron_charge,Electron_pfRelIso03_all,Electron_dxy,Electron_dxyErr,Electron_dz,Electron_dzErr,MET_pt,MET_phi,origin
0,199569,90,111437792,19,0.072789,0.056940,-5.848635,0,[0],[0],...,"[0.011734561994671822, 0.007448882795870304, -...","[-1, 1, 1, -1]","[0.12949016690254211, 0.0833798497915268, 0.07...","[-0.0015448912745341659, -0.003337335539981723...","[0.0020910976454615593, 0.0032436887267977, 0....","[0.0016729842172935605, 0.0010503717930987477,...","[0.0027115957345813513, 0.0017110552871599793,...",26.754982,-0.765865,0
1,198485,164,174781164,19,0.071224,0.063993,1.627592,0,[0],[0],...,"[-0.001556001021526754, -0.02743491530418396, ...","[1, 1, -1, -1]","[0.07580596208572388, 0.2913002073764801, 0.01...","[-0.010870595462620258, -0.0010292383376508951...","[0.006472545675933361, 0.0029752105474472046, ...","[0.007331601344048977, -0.0015844578156247735,...","[0.005192331969738007, 0.008086061105132103, 0...",39.329945,-0.408865,0
2,200991,63,98100215,20,0.070069,0.064844,4.013040,0,[0],[0],...,"[0.015610503032803535, -0.010785983875393867, ...","[1, -1, 1, -1]","[0.029544498771429062, 0.24546444416046143, 0....","[0.001122435787692666, 0.00013478998153004795,...","[0.003712752368301153, 0.0034437004942446947, ...","[-0.0007582277175970376, -0.004073264077305794...","[0.002140602795407176, 0.004801067057996988, 0...",13.257597,1.632409,0
3,202016,173,191382222,17,0.068171,0.063730,-2.194838,0,[0],[0],...,"[-0.006848459132015705, -0.07272212207317352, ...","[1, 1, -1, -1]","[0.20863907039165497, 0.09663166850805283, 0.0...","[0.0016890016850084066, 0.00015105698548723012...","[0.0035683561582118273, 0.0033460892736911774,...","[-0.00043860988807864487, -0.00499937636777758...","[0.002464336110278964, 0.00424198480322957, 0....",20.016258,-0.630477,0
4,202299,304,421267699,16,0.067006,0.066837,-5.399827,0,[0],[0],...,"[0.018055923283100128, 0.006589510943740606, -...","[-1, 1, 1, -1]","[0.12592168152332306, 0.13950656354427338, 0.0...","[-0.0009451049845665693, 0.0022477407474070787...","[0.0030123924370855093, 0.004750251304358244, ...","[-0.0056835440918803215, -0.002947514178231358...","[0.004509489051997662, 0.005108153447508812, 0...",26.170612,0.563151,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,202075,81,127491992,14,0.067720,0.062621,1.982996,1,[25.183866500854492],[-0.5268416404724121],...,"[-0.003796128323301673, -0.007373335305601358,...","[1, 1, -1, -1]","[0.6659387946128845, 0.46192291378974915, 1.70...","[0.0007677617250010371, -0.009147867560386658,...","[0.0037523547653108835, 0.003530563786625862, ...","[0.0028054199647158384, -5.816164016723633, -5...","[0.005329550709575415, 0.004663712810724974, 0...",30.459444,0.797589,1
121,201229,29,53160065,22,0.070426,0.068908,4.303934,1,[5.109374046325684],[-0.9392731785774231],...,[0.04697158560156822],[1],[-999.0],[0.0008445215644314885],[0.0029827747493982315],[5.803748354082927e-05],[0.006459023803472519],42.552536,-1.747689,1
122,200600,146,232822626,21,0.069416,0.066354,-7.433079,1,[10.9365234375],[1.0261904001235962],...,"[0.04455084726214409, 0.004139335826039314]","[1, 1]","[0.32095760107040405, -999.0]","[-0.0005260871257632971, 0.008726943284273148]","[0.006249656435102224, 0.0050197867676615715]","[-2.438697099685669, -2.4553022384643555]","[0.00933433510363102, 0.005163001827895641]",73.867386,-1.556057,1
123,199336,941,797385047,5,0.073056,0.058434,5.900356,0,[0],[0],...,[-0.010397237725555897],[1],[0.988757312297821],[0.0027649637777358294],[0.003994310274720192],[-0.001313318032771349],[0.005385301541537046],12.117842,2.255863,1


In [436]:
columns_with_lists = ['Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_dxy', 
                       'Muon_dxyErr', 'Muon_dz', 'Muon_dzErr', 'Muon_pfRelIso03_all', 
                       'Muon_pfRelIso04_all', 'Electron_pt', 'Electron_eta', 'Electron_phi', 
                       'Electron_mass', 'Electron_charge', 'Electron_dxy', 'Electron_dxyErr', 
                       'Electron_dz', 'Electron_dzErr', 'Electron_pfRelIso03_all']
# Iterar sobre las filas del DataFrame original
for index, row in df_media1.iterrows():
    for column in columns_with_lists:
        if isinstance(row[column], list):  # Verificar si la columna contiene una lista
            # Calcular la media de los valores en la lista
            mean_value = sum(row[column]) / len(row[column])
            # Actualizar el valor en el DataFrame original con la media
            df_media1.at[index, column] = mean_value

# Imprimir el DataFrame de medias
df_media1

Unnamed: 0,run,luminosityBlock,event,PV_npvs,PV_x,PV_y,PV_z,nMuon,Muon_pt,Muon_eta,...,Electron_mass,Electron_charge,Electron_pfRelIso03_all,Electron_dxy,Electron_dxyErr,Electron_dz,Electron_dzErr,MET_pt,MET_phi,origin
0,199569,90,111437792,19,0.072789,0.056940,-5.848635,0,0.0,0.0,...,0.00444,0.0,0.080921,-0.000468,0.002985,-0.000302,0.002592,26.754982,-0.765865,0
1,198485,164,174781164,19,0.071224,0.063993,1.627592,0,0.0,0.0,...,-0.000979,0.0,0.153513,0.002693,0.006339,-0.000308,0.006834,39.329945,-0.408865,0
2,200991,63,98100215,20,0.070069,0.064844,4.013040,0,0.0,0.0,...,0.00738,0.0,0.096699,-0.000372,0.003616,-0.001194,0.003258,13.257597,1.632409,0
3,202016,173,191382222,17,0.068171,0.063730,-2.194838,0,0.0,0.0,...,-0.042149,0.0,0.085703,0.001344,0.003345,-0.004843,0.002949,20.016258,-0.630477,0
4,202299,304,421267699,16,0.067006,0.066837,-5.399827,0,0.0,0.0,...,0.007289,0.0,0.097715,0.001567,0.004357,0.000766,0.004542,26.170612,0.563151,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,202075,81,127491992,14,0.067720,0.062621,1.982996,1,25.183867,-0.526842,...,0.000326,0.0,0.875344,0.080511,0.057714,-2.688329,0.15384,30.459444,0.797589,1
121,201229,29,53160065,22,0.070426,0.068908,4.303934,1,5.109374,-0.939273,...,0.046972,1.0,-999.0,0.000845,0.002983,0.000058,0.006459,42.552536,-1.747689,1
122,200600,146,232822626,21,0.069416,0.066354,-7.433079,1,10.936523,1.02619,...,0.024345,1.0,-499.339521,0.0041,0.005635,-2.447,0.007249,73.867386,-1.556057,1
123,199336,941,797385047,5,0.073056,0.058434,5.900356,0,0.0,0.0,...,-0.010397,1.0,0.988757,0.002765,0.003994,-0.001313,0.005385,12.117842,2.255863,1


In [439]:
X_columns = df_media1.keys()[0:32]

In [442]:
X = df_media1[X_columns].values
y = df_media1['origin']

In [443]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el clasificador de árbol de decisión
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X_train, y_train)
# Guardar el modelo entrenado en un archivo
joblib.dump(tree_clf, 'modelo_arbol_decision3.pkl')

# Predicciones en el conjunto de prueba
y_pred = tree_clf.predict(X_test)

# Métricas de evaluación
print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))


Matriz de Confusión:
[[ 6  0]
 [ 0 19]]

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        19

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25



### Caso TwoMuonsTwoElectrons

In [444]:
df_media2 = unir_dataframes(df_twotwo, df_fondo2)

In [447]:
len(df_media2)

144

In [448]:
# Función para ajustar la longitud de las listas a 4 elementos
def transformar_lista(lista):
    if len(lista) > 0:
        return lista.to_list()  
    else:
        return lista.to_list() + [0]   

# Nombres de las columnas con listas
columnas_con_listas = ['Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_dxy', 
                       'Muon_dxyErr', 'Muon_dz', 'Muon_dzErr', 'Muon_pfRelIso03_all', 
                       'Muon_pfRelIso04_all', 'Electron_pt', 'Electron_eta', 'Electron_phi', 
                       'Electron_mass', 'Electron_charge', 'Electron_dxy', 'Electron_dxyErr', 
                       'Electron_dz', 'Electron_dzErr', 'Electron_pfRelIso03_all']

# Aplicar la función de ajuste de longitud a cada columna con listas
for columna in columnas_con_listas:
    df_media2[columna] = df_media2[columna].apply(transformar_lista)

In [449]:
columns_with_lists = ['Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_dxy', 
                       'Muon_dxyErr', 'Muon_dz', 'Muon_dzErr', 'Muon_pfRelIso03_all', 
                       'Muon_pfRelIso04_all', 'Electron_pt', 'Electron_eta', 'Electron_phi', 
                       'Electron_mass', 'Electron_charge', 'Electron_dxy', 'Electron_dxyErr', 
                       'Electron_dz', 'Electron_dzErr', 'Electron_pfRelIso03_all']
# Iterar sobre las filas del DataFrame original
for index, row in df_media2.iterrows():
    for column in columns_with_lists:
        if isinstance(row[column], list):  # Verificar si la columna contiene una lista
            # Calcular la media de los valores en la lista
            mean_value = sum(row[column]) / len(row[column])
            # Actualizar el valor en el DataFrame original con la media
            df_media2.at[index, column] = mean_value

# Imprimir el DataFrame de medias
df_media2

Unnamed: 0,run,luminosityBlock,event,PV_npvs,PV_x,PV_y,PV_z,nMuon,Muon_pt,Muon_eta,...,Electron_mass,Electron_charge,Electron_pfRelIso03_all,Electron_dxy,Electron_dxyErr,Electron_dz,Electron_dzErr,MET_pt,MET_phi,origin
0,201824,229,201387532,10,0.071230,0.065763,0.241662,2,78.122416,0.94187,...,-0.015073,0.0,0.091777,0.000231,0.003584,0.000944,0.002509,20.880842,0.245600,0
1,202504,1191,1394353502,10,0.069873,0.065985,5.496422,2,50.647063,-0.029456,...,0.003689,0.0,0.139169,0.000351,0.002819,-0.000661,0.001928,15.250550,1.270668,0
2,202237,973,1294338754,26,0.069027,0.063396,-4.145526,2,67.142473,1.050906,...,0.005965,0.0,0.111858,0.000447,0.004814,-0.005182,0.003656,30.707270,-1.457306,0
3,202973,839,779714361,6,0.070262,0.061147,-1.260335,2,69.083969,2.119485,...,0.006886,0.0,0.107774,-0.000417,0.002353,0.002352,0.002165,16.878271,-0.230254,0
4,200075,377,451096187,25,0.072787,0.063041,-1.004149,2,64.128045,-0.925428,...,0.045556,0.0,0.178199,-0.001031,0.003454,-0.002227,0.006208,26.620871,-2.009348,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,201191,709,1018911677,18,0.068617,0.065635,3.457624,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.407484,0.496572,1
140,199832,238,293201611,9,0.066765,0.062052,2.480196,0,0.0,0.0,...,-0.016668,1.0,0.412228,-0.018433,0.008987,0.012393,0.01337,3.732740,-1.639454,1
141,201625,378,557707429,17,0.070173,0.061559,-1.244230,0,0.0,0.0,...,-0.010687,1.0,0.577731,0.001517,0.00306,0.00192,0.004534,25.617329,-2.245232,1
142,199608,1881,1853810054,11,0.071694,0.058108,0.567230,0,0.0,0.0,...,0.010955,1.0,-999.0,0.001457,0.002906,-0.00264,0.003062,33.986168,0.902040,1


In [452]:
X_columns = df_media2.keys()[0:32]
X = df_media2[X_columns].values
y = df_media2['origin']

In [453]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el clasificador de árbol de decisión
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X_train, y_train)
# Guardar el modelo entrenado en un archivo
joblib.dump(tree_clf, 'modelo_arbol_decision4.pkl')

# Predicciones en el conjunto de prueba
y_pred = tree_clf.predict(X_test)

# Métricas de evaluación
print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))


Matriz de Confusión:
[[ 8  0]
 [ 1 20]]

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.95      0.98        21

    accuracy                           0.97        29
   macro avg       0.94      0.98      0.96        29
weighted avg       0.97      0.97      0.97        29

