In [1]:
# Preprocessing
import pandas as pd
import tables
import time
import numpy as np
import gc
from tqdm import tqdm_notebook
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
import sys
import matplotlib.pyplot as plt
sys.path.append('/home/iprovilkov/data/JUNO/notebooks/')
from data_utils.data_generator import DataGenerator
from data_utils.data_processing import get_data_2dprojection

Using TensorFlow backend.


In [2]:
LTRAIN = '/mnt/iprovilkov/data_dir/npdata/'
#rg = np.arange(0,100000,20000)
MAXR=17200

In [3]:
import uproot
import numpy as np
import pandas as pd
from tqdm import tqdm
import tables

In [4]:
ROOT_DIR = '/mnt/ymalyshkin/J17v1r1_hits/'

In [5]:
def read_hits(hits):
    nHits = hits.get('nHits').array()
    pmtID = hits.get('pmtID').array()
    hitTime = hits.get('hitTime').array()
    isDN = hits.get('isDN').array()

    return nHits, pmtID, hitTime, isDN


def read_pos(pos):
    pmt_id = pos.get('pmt_id').array() 
    pmt_x = pos.get('pmt_x').array() 
    pmt_y = pos.get('pmt_y').array() 
    pmt_z = pos.get('pmt_z').array() 

    return pmt_id, pmt_x, pmt_y, pmt_z


def read_true_info(true_info):
    evtID = true_info.get('evtID').array()  
    E = true_info.get('E').array() 
    x = true_info.get('x').array() 
    y = true_info.get('y').array() 
    z = true_info.get('z').array() 
    R = true_info.get('R').array() 

    return evtID, E, x, y, z, R

In [6]:
def hits_to_df(r):
    nHits, pmtID, hitTime, isDN = read_hits(r)
    
    dt_list = []
    for i in tqdm(range(len(nHits))):
        n = nHits[i]
        
        df2 = pd.DataFrame({
            'event': i,
            'pmtID': pmtID[i], 
            'hitTime': hitTime[i], 
            'isDN': isDN[i]
                             
        }) 
        dt_list.append(df2)
    
    return pd.concat(dt_list)

def pos_to_df(r):
    pmt_id, pmt_x, pmt_y, pmt_z = read_pos(r)
    df = pd.DataFrame({ 'pmt_id': pmt_id, 'pmt_x': pmt_x, 
                        'pmt_y': pmt_y, 'pmt_z': pmt_z
                     })
    return df

def true_info_to_df(r):
    evtID, E, x, y, z, R = read_true_info(r)
    df = pd.DataFrame({ 'evtID': evtID, 'E': E, 'x': x,
                        'y': y, 'z': z, 'R': R
                     })
    return df

In [7]:
def hits_to_hdf(t, name):
    nHits, pmtID, hitTime, isDN = read_hits(t)
    
    dt_list = []
    for i in tqdm(range(len(nHits))):
        n = nHits[i]
        
        df2 = pd.DataFrame({
            'event': i,
            'pmtID': pmtID[i], 
            'hitTime': hitTime[i], 
            'isDN': isDN[i]
                             
        }) 
        dt_list.append(df2)
    
    pd.concat(dt_list).to_hdf(name, index=False, key='df', mode='w')


def pos_to_csv(t, name):
    pmt_id, pmt_x, pmt_y, pmt_z = read_pos(t)
    df = pd.DataFrame({ 'pmt_id': pmt_id, 'pmt_x': pmt_x, 
                        'pmt_y': pmt_y, 'pmt_z': pmt_z
                     })
    df.to_csv(name, index=False)


def true_info_to_csv(t, name):
    evtID, E, x, y, z, R = read_true_info(t)
    df = pd.DataFrame({ 'evtID': evtID, 'E': E, 'x': x,
                        'y': y, 'z': z, 'R': R
                     })
    df.to_csv(name, index=False)

In [8]:
root_names = [l for l in os.listdir(ROOT_DIR) if '_hits_' in l]

In [10]:
root_names

['eplus_hits_dn_7.root',
 'eplus_hits_dn_6.root',
 'eplus_hits_dn_3.root',
 'eplus_hits_dn_9.root',
 'eplus_hits_dn_0.root',
 'eplus_hits_dn_8.root',
 'eplus_hits_dn_4.root',
 'eplus_hits_dn_2.root',
 'eplus_hits_dn_1.root',
 'eplus_hits_dn_5.root']

In [9]:
for i, root_file in enumerate(root_names):
    print(i)
    r = uproot.open(ROOT_DIR + root_file)
    tinfo = r.get("true_info")
    true_info = true_info_to_df(tinfo)
    true_info.to_csv(LTRAIN + '_' + str(i) + '_true_info.csv',index=False)

0
1
2
3
4
5
6
7
8
9


**Noise**

In [11]:
from data_utils.data_processing_noise import get_data_2dprojection_noise

In [12]:
for i, root_file in enumerate(root_names):
    print(i)
    if i > 1:
        continue
    r = uproot.open(ROOT_DIR + root_file)
    lhits = r.get("lpmt_hits")
    #shits = r.get("spmt_hits")
    tinfo = r.get("true_info")
    lpos = r.get("lpmt_pos")
    spos = r.get("spmt_pos")
    
    lpmt_hits = hits_to_df(lhits)
    pos1 = pos_to_df(lpos)
    pos2 = pos_to_df(spos)
    pos = pd.concat([pos1,pos2])
    del pos1
    del pos2
    true_info = true_info_to_df(tinfo)
    
    rg = np.arange(0,100001,20000)
    for start, end  in zip(rg[:-1], rg[1:]):
        data_lpmt, event_to_id = get_data_2dprojection_noise(lpmt_hits[(lpmt_hits['event'] >= start) 
                                                                 & (lpmt_hits['event'] < end)], 
                                                None, 
                                                pos, 
                                                true_info[(true_info.R<=MAXR) 
                                                             & (true_info['evtID'] >= start) 
                                                             & (true_info['evtID'] < end)],
                                                edge_size0=226,
                                                edge_size1=112,
                                                use_spmt=False,
                                                time='min',
                                                )
        np.save(LTRAIN + '_' + str(i) + '_' + str(start) + 'with_noise' + '_sin', data_lpmt.astype('float32'))
        #np.save(LTRAIN + '_' + str(i) + '_mask', (lpmt_hits['isDN'] == False).values)
        
    del r
    del lhits
    del true_info
    del lpos
    del spos
    
    gc.collect()

0


100%|██████████| 100000/100000 [01:52<00:00, 886.24it/s]


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


1


100%|██████████| 100000/100000 [01:49<00:00, 914.14it/s]


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


2
3
4
5
6
7
8
9


**Simple**

In [None]:
for i, root_file in enumerate(root_names):
    print(i)
    if i in [9]:
        continue
    r = uproot.open(ROOT_DIR + root_file)
    lhits = r.get("lpmt_hits")
    #shits = r.get("spmt_hits")
    tinfo = r.get("true_info")
    lpos = r.get("lpmt_pos")
    spos = r.get("spmt_pos")
    
    lpmt_hits = hits_to_df(lhits)
    pos1 = pos_to_df(lpos)
    pos2 = pos_to_df(spos)
    pos = pd.concat([pos1,pos2])
    del pos1
    del pos2
    true_info = true_info_to_df(tinfo)
    
    rg = np.arange(0,100001,20000)
    for start, end  in zip(rg[:-1], rg[1:]):
        data_lpmt, event_to_id = get_data_2dprojection(lpmt_hits[(lpmt_hits['event'] >= start) 
                                                                 & (lpmt_hits['event'] < end)
                                                     & (lpmt_hits['isDN'] == False)], 
                                                None, 
                                                pos, 
                                                true_info[(true_info.R<=MAXR) 
                                                             & (true_info['evtID'] >= start) 
                                                             & (true_info['evtID'] < end)],
                                                edge_size0=226,
                                                edge_size1=112,
                                                use_spmt=False,
                                                time='min',
                                                )
        np.save(LTRAIN + '_' + str(i) + '_' + str(start) + 'noise' + '_sin', data_lpmt.astype('float32'))
        #np.save(LTRAIN + '_' + str(i) + '_mask', (lpmt_hits['isDN'] == False).values)
        
    del r
    del lhits
    del true_info
    del lpos
    del spos
    
    gc.collect()

0


100%|██████████| 100000/100000 [01:39<00:00, 1002.20it/s]


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


1


100%|██████████| 100000/100000 [01:39<00:00, 1000.17it/s]


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


2


100%|██████████| 100000/100000 [01:43<00:00, 963.95it/s]


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


3


100%|██████████| 100000/100000 [01:40<00:00, 991.00it/s]


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Starting cycle...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))