# IceCube: Creating a dataset

This notebook creates a dataset on Google Drive that is used to train the model.
The original competition training batches are also expected to be stored on Google Drive; they are copied from there to the local runtime.

Every 5 training batches are combined into one pack of about 1M examples (events).<br>
Within a pack, the examples are grouped by sequence length (number of pulses), so each group contains only sequences of the same length.

## Load data from disk

In [None]:
%%time
# ----------------------------- run switches -----------------------------------
CREATE_DATASET      = True          # run the dataset generation cell
LOAD_FROM_DISK      = True          # copy the Kaggle source batches from Google Drive to the local runtime

# ----------------------------- pulse preprocessing ----------------------------
DROP_AUX            = False         # throw out pulses with aux==True
DOMS_AGG            = False         # aggregate pulses per sensor (mean aux, summed charge, time of the first pulse)
T_MAX               = 512           # maximum number of pulses kept per event
IS_AGG              = True          # generate aggregated event features

# ----------------------------- optional event filter --------------------------
LINES_FILTER        = False         # filter events by the number of strings hit in the event
LINES_MIN           = 0             # minimum number of strings (if LINES_FILTER==True)
LINES_MAX           = 100           # maximum number of strings (if LINES_FILTER==True)
#-------------------------------------------------------------------------------

FIRST_BATCH_ID   = 1                # id of the first source batch to process
NUM_BATCHES      = 10               # total number of source batches to process
BATCHES_IN_PACK  = 5                # number of source batches combined into one saved pack

# folder for the resulting dataset (doms.pt and pack_XX.pt are written here):
DATASET_FOLDER = "/content/drive/MyDrive/IceCube/IceCube-Dataset/ALL_512"

#===============================================================================

import os, gc, sys, time, datetime, math, random,  psutil
import numpy as np
import matplotlib.pyplot as plt
from   pathlib   import Path        
from   tqdm.auto import tqdm
import pandas as pd
import pyarrow, pyarrow.parquet as pq     # read by chunks
import torch

from psutil import virtual_memory
print(f'Your runtime has {(virtual_memory().total / 1024**3):.1f} gigabytes of available RAM\n')
#===============================================================================
# Copying competition batches from Google Drive to the local /content folder
# NOTE(review): assumes Google Drive is already mounted at /content/drive - confirm
if LOAD_FROM_DISK:
    # per-batch meta files; the archive is extracted into the current working directory
    !cp /content/drive/MyDrive/IceCube/IceCube-Dataset/train_meta_splitted.zip /content/
    !unzip -q /content/train_meta_splitted.zip
    !rm       /content/train_meta_splitted.zip

    # sensor geometry and ice properties tables
    !cp /content/drive/MyDrive/IceCube/IceCube-Dataset/sensor_geometry.csv /content/
    !cp /content/drive/MyDrive/IceCube/IceCube-Dataset/scattering_and_absorption.csv /content/
            
    # pulse batches are copied straight into /content/ as batch_<id>.parquet
    for batch_id in tqdm(range(FIRST_BATCH_ID, FIRST_BATCH_ID + NUM_BATCHES)):
        !cp    /content/drive/MyDrive/IceCube/IceCube-Dataset/train/batch_{batch_id}.parquet /content/

Your runtime has 12.7 gigabytes of available RAM



  0%|          | 0/10 [00:00<?, ?it/s]

CPU times: user 3.72 s, sys: 697 ms, total: 4.42 s
Wall time: 2min 29s


## Prepare Data Functions

In [None]:
#===============================================================================
PATH      = Path("/content")                  # local folder with the copied batches and sensor_geometry.csv
PATH_PHYS = Path("/content")                  # local folder with scattering_and_absorption.csv
PATH_META = Path("/content/content/icecube-neutrinos-in-deep-ice/train_meta")  # per-batch meta (target) files
# The batches are copied straight into PATH as batch_<id>.parquet (this is also where
# get_files() reads them from); there is no "train" subfolder, so glob PATH itself.
files_trn = sorted(PATH.glob("batch_*.parquet"))  # all local train files
print(f"{len(files_trn):3d} train files")
#===============================================================================

def info(text, pref="", end="\n"):
    """
    Print one progress line with timing and memory usage.

    The line contains: minutes elapsed since `info.beg`, seconds elapsed since the
    previous call (`info.last`), the RAM currently in use (in GiB) and `text`.
    A garbage collection is forced first so the reported memory is up to date.

    text : message to print
    pref : prefix printed before the timing information
    end  : line terminator passed to print()
    """
    gc.collect()
    used_gb = psutil.virtual_memory().used / 1024**3
    now     = time.time()
    minutes_total = (now - info.beg) / 60
    seconds_step  = now - info.last
    print(f"{pref}{minutes_total:5.1f}m[{seconds_step:+5.1f}s] {used_gb:6.3f}Gb > {text}", end=end)
    info.last = time.time()    # the next call measures its step from here

# timers are stored as function attributes; both start now
info.beg = info.last = time.time()

#-------------------------------------------------------------------------------

def get_sensors():
    """
    Load the sensor table and add derived per-sensor columns.

    Reads PATH / "sensor_geometry.csv" and PATH_PHYS / "scattering_and_absorption.csv".

    Returns a DataFrame with the columns
        sensor_id, line_id, core, x, y, z, a, r
    where
        line_id : string id, sensor_id // 60 + 1
        core    : 1.0 for sensors with line_id > 78 (DeepCore), else 0.0
        x, y, z : coordinates scaled by 1e-3 (kilometers), float32
        a       : absorption value interpolated at the sensor depth z
        r       : distance from the z axis, sqrt(x^2 + y^2)
    """
    from scipy.interpolate import interp1d

    sensors = pd.read_csv(PATH / "sensor_geometry.csv")
    sensors['line_id'] = sensors.sensor_id // 60 + 1                   # string id
    sensors['core']    = (sensors.line_id > 78).astype(np.float32)     # sensor from DeepCore
    for axis in ('x', 'y', 'z'):                                       # distances in kilometers
        sensors[axis] = (sensors[axis] * 1e-3).astype(np.float32)

    # absorption as a function of depth, linearly interpolated to each sensor
    ice = pd.read_csv(PATH_PHYS / "scattering_and_absorption.csv")
    ice['z'] = (ice.z * 1e-3).astype(np.float32)
    ice['a'] = (ice.a * 1e-2).astype(np.float32)
    absorption_at = interp1d(ice.z, ice.a)
    sensors['a'] = absorption_at(sensors.z)

    sensors['r'] = np.sqrt(sensors.x**2 + sensors.y**2)

    out_cols = ['sensor_id', 'line_id', 'core', 'x', 'y', 'z', 'a', 'r']
    return sensors[out_cols]

#-------------------------------------------------------------------------------

def get_target_angles(batch_id=1):
    """
    Load the target angles of one batch.

    batch_id : batch number, must satisfy 0 < batch_id < 661 (AssertionError otherwise)

    Reads PATH_META / "batch_<batch_id>_meta.parquet" and returns a DataFrame with
    the columns event_id (int64), azimuth (float32) and zenith (float32).
    """
    assert 0 < batch_id < 661, "Wrong batch_id"
    meta = pd.read_parquet(PATH_META / f"batch_{batch_id}_meta.parquet")
    dtypes = {'event_id': np.int64, 'azimuth': np.float32, 'zenith': np.float32}
    return meta[list(dtypes)].astype(dtypes)

#-------------------------------------------------------------------------------

def prepare_batch(df, verbose=True, drop_aux = DROP_AUX, doms_agg = DOMS_AGG):
    """
    Prepare a freshly loaded batch of pulses.

    df       : pulse frame indexed by event id with the columns
               sensor_id, time, charge, auxiliary.
               Side effect: an 'event_id' column is added to this frame.
    verbose  : report progress through info()
    drop_aux : remove pulses with aux == True
    doms_agg : collapse the pulses of every (event_id, sensor_id) pair into one row
               (mean aux, summed charge, time of the first pulse)

    Returns a frame with the columns event_id, sensor_id, aux, q, t, where t is
    measured from the first pulse of the event and multiplied by 0.299792458e-3
    (presumably ns -> km at the speed of light - TODO confirm the input unit).
    """
    df['event_id'] = df.index.astype(np.int64)
    pulses = df.reset_index(drop=True)          # sensor_id, t, charge, aux, event_id
    pulses.rename(columns={"time": "t", "auxiliary": "aux", 'charge': 'q'}, inplace=True)
    pulses['q'] = pulses['q'].astype(np.float32)

    if drop_aux:
        pulses = pulses[ ~pulses.aux ]

    if doms_agg:
        per_dom = pulses.groupby(['event_id', 'sensor_id'])
        pulses  = per_dom.agg(
            aux = ( 'aux', "mean"),
            q   = ( 'q',   "sum"),
            t   = ( 't',   "min"),
        ).reset_index()

    if verbose:
        info(f"load_batch: loaded  {pulses.shape}")

    # time of the first pulse of every event, attached to each pulse of that event
    first_hit = pulses.groupby('event_id').agg( t_min = ('t', 'min') )
    pulses = pulses.merge(first_hit, left_on='event_id', right_index=True, how='left')
    pulses['t'] = (( pulses.t - pulses.t_min ) * 0.299792458e-3 ).astype(np.float32)

    if verbose:
        info("load_batch: shift_times")

    out_cols = ['event_id', 'sensor_id', 'aux', 'q', 't' ]
    return pulses[out_cols]
        
#-------------------------------------------------------------------------------

def cut_pulses(df, max_pulses = 128, verbose=True):
    """
    Keep at most max_pulses pulses per event, dropping the unreliable and late ones first.

    Pulses are ordered by (event_id, aux, t), so within an event the pulses with
    aux == False come first and, inside each aux group, the earliest come first.
    The first max_pulses rows of every event are kept.

    df         : pulse frame with at least the columns event_id, aux, t
    max_pulses : maximum number of pulses kept per event
    verbose    : report the percentage of removed pulses through info()

    Returns a new frame with the same columns as df and a fresh 0..N-1 index.
    Unless the global DROP_AUX is set, the result is re-sorted by (event_id, t).
    """
    tot = len(df)
    df = df.sort_values(['event_id','aux','t']).reset_index(drop=True)

    # drop=True: the old positional index must not leak into the result as an 'index' column
    df = df.groupby('event_id').head(max_pulses).reset_index(drop=True)

    if not DROP_AUX:
        # aux and non-aux pulses are interleaved in time, so restore the time order
        df = df.sort_values(['event_id','t']).reset_index(drop=True)

    if verbose:
        removed = 100*(tot-len(df))/tot if tot else 0.0   # an empty batch removes nothing
        info(f"cut_pulses (max={max_pulses}): removed {removed:.2f}%")
    return df

#-------------------------------------------------------------------------------

def lines_filter(df, lines_min=1, lines_max=1):
    """
    Keep only the events whose number of hit strings lies in [lines_min, lines_max].

    The number of strings of an event is the number of distinct line_id values
    among its pulses with aux == 0; events without such pulses are dropped.

    df : pulse frame with the columns event_id, aux, line_id

    Returns (df, agg):
        df  : the pulses of the accepted events, with a fresh 0..N-1 index
        agg : one row per accepted event with the columns event_id, lines0
    """
    reliable = df[df.aux == 0]
    agg = reliable.groupby('event_id').agg( lines0 = ( 'line_id',   'nunique') ).reset_index()

    in_range = (agg.lines0 >= lines_min) & (agg.lines0 <= lines_max)
    agg = agg[in_range]

    accepted = df.event_id.isin(agg.event_id)
    df = df[accepted].reset_index(drop=True)
    return df, agg
#-------------------------------------------------------------------------------

def angles2vector(df):
    """
    Add the unit direction vector (nx, ny, nz) computed from the columns
    azimuth and zenith. The frame is modified in place and also returned.
    """
    sin_zenith = np.sin(df.zenith)
    df['nx'] = sin_zenith * np.cos(df.azimuth)
    df['ny'] = sin_zenith * np.sin(df.azimuth)
    df['nz'] = np.cos(df.zenith)
    return df

#-------------------------------------------------------------------------------

def delta_angle(n1, n2, eps=1e-8):
    """
    Angle (in radians) between corresponding rows of two arrays of vectors.

    n1, n2 : arrays of shape (B,3); they do not have to be normalized
    eps    : added to the norms to avoid a division by zero
    return : array of shape (B,)
    """
    def _unit(v):
        # normalize every row to (almost) unit length
        return v / (np.linalg.norm(v, axis=1, keepdims=True) + eps)

    cos = np.sum(_unit(n1) * _unit(n2), axis=1)
    cos = np.clip(cos, -1, 1)          # protect arccos from rounding errors
    return np.arccos(cos)

#-------------------------------------------------------------------------------

def get_event_features(df, target_df, suf = "", aux=True):    
    """ 
    Aggregated features characterizing the entire event.

    df        : pulse frame with the columns
                event_id, sensor_id, line_id, core, aux, q, t, x, y, z.
                The frame is NOT modified (the function works on a copy).
    target_df : not used; kept so that existing calls keep working
    suf       : suffix appended to the name of every output column (event_id included)
    aux       : if True, every column except event_id, sensor_id and line_id is
                multiplied by (1 - aux) before the aggregation, so pulses with
                aux == 1 contribute zeros

    Returns one row per event with the columns
        event_id, tot, t_med, t, x, y, z, stdT, stdX, stdY, stdZ, xt, yt, zt, tt,
        q, q_min, q_max, q_med, aux, core, lines, doms, p_lines, p_doms, pulses
    (plus the suffix). NaN values, e.g. the std of a single pulse, are replaced by 0.
    """
    # assign() returns a copy: the caller's frame gets neither the helper columns
    # nor the masked values written below
    pulses = df.assign(xt = df.x*df.t, yt = df.y*df.t, zt = df.z*df.t, tt = df.t**2)

    if aux:
        # The weight is computed once. Re-reading pulses.aux inside the loop would
        # use the already overwritten aux column for every column that follows it.
        weight = 1 - pulses.aux
        for col in pulses.columns:
            if col not in ['event_id', 'sensor_id', 'line_id']:
                pulses[col] = pulses[col] * weight

    events = pulses.groupby('event_id').agg(  # this is for all pulses with any aux
        tot      = ('t',        'count'),            
        t_med    = ('t',        'median'),  
        t        = ('t',        'mean'),  
        x        = ('x',        'mean'),  
        y        = ('y',        'mean'),  
        z        = ('z',        'mean'),  
        stdT     = ('t',        'std'),
        stdX     = ('x',        'std'),
        stdY     = ('y',        'std'),
        stdZ     = ('z',        'std'),
        xt       = ('xt',       'mean'),
        yt       = ('yt',       'mean'),
        zt       = ('zt',       'mean'),
        tt       = ('tt',       'mean'),
        q        = ('q',        'mean' ),
        q_min    = ('q',        'min' ),
        q_max    = ('q',        'max' ),
        q_med    = ('q',        'median' ),
        aux      = ('aux',      'mean' ),       
        core     = ('core',     'mean' ),        
        lines    = ('line_id',  'nunique' ),
        doms     = ('sensor_id','nunique' ),                
    )
    events = events.reset_index()

    for col in ['aux', 'lines', 'doms', 'stdT', 'stdX', 'stdY', 'stdZ']:
        events[col] = events[col].astype(np.float32)

    # average number of pulses per string / per sensor (log scale);
    # must be computed before lines and doms are rescaled below
    events['p_lines'] = np.log10(events.tot / events.lines).astype(np.float32)
    events['p_doms']  = np.log10(events.tot / events.doms ).astype(np.float32)

    for col in ['q', 'q_med', 'q_min', 'q_max']:
        events[col] = np.log(1 + events[col])
    events['lines']  = np.log10(events.lines)  / 10
    events['doms']   = np.log10(events.doms)   / 10
    events['pulses'] =(np.log10(events.tot)    / 10).astype(np.float32)

    events = events.fillna(0.0)   # e.g. std is NaN for an event with a single pulse

    if len(suf):          # add a suffix to the column name
        events.columns = [ col + suf for col in events.columns]

    return events

#-------------------------------------------------------------------------------

def get_pulse_features(df):
    """
    Final per-pulse features. The frame is modified in place and also returned:
      - the column line_id is dropped
      - the charge q is replaced by log(1 + q)
      - every column except the id/count columns is cast to float32
    """
    df.drop(columns=['line_id'], inplace=True)   # !!!! (embedding ?)

    df['q'] = np.log(1 + df['q'])

    keep_dtype = ['sensor_id', 'event_id', 'line_id', 'tot']
    to_float = [col for col in df.columns if col not in keep_dtype]
    for col in to_float:
        df[col] = df[col].astype(np.float32)

    return df

#===============================================================================
#                     Create dataset for train and validation
#===============================================================================

def get_files(batch_ids):
    """
    Build the paths of the local batch files, PATH / "batch_<id>.parquet".
    Returns (files, batch_ids); batch_ids is passed through unchanged.
    """
    files = []
    for batch_id in batch_ids:
        files.append(PATH / f"batch_{batch_id}.parquet")
    return files, batch_ids

#-------------------------------------------------------------------------------

def append_dict(data, T, df, agg_df):
    """ 
    Add the events that have exactly T pulses to the dictionary `data` (key = T).

    df:     event_id, sensor_id, <pulse features...>, tot   -- T consecutive rows per event
    agg_df: event_id, nx, ny, nz, tot, <aggregated features...>  -- one row per event
    Both frames are expected to list the events in the same order.

    data[T] is the list [ID, SENS, FEAT, AGG, Y] of tensors:
        ID   (B,1)    long    event ids
        SENS (B,T)    long    sensor id of every pulse
        FEAT (B,T,F)  float32 pulse features (df without event_id, sensor_id, tot)
        AGG  (B,A)    float32 aggregated features (agg_df columns from position 5 on)
        Y    (B,3)    float32 target direction (nx, ny, nz)
    If the key already exists, the new events are appended along the first axis.
    """
    assert len(df) % T == 0,  f"wait len(df) = T*B, got len={len(df)}, T={T}"    
    B = len(df) // T
    F = df.shape[-1] - 3                                    # drop: event_id, sensor_id, tot

    ID   = agg_df[['event_id']].to_numpy()
    Y    = agg_df[['nx','ny','nz']].to_numpy()
    AGG  = agg_df.iloc[:, 5:].to_numpy()
    SENS = df.sensor_id.to_numpy().reshape(B,T)
    FEAT = df.iloc[:, 2: -1].to_numpy().reshape(B,T,F)      # (B*T, F) -> (B, T, F); drop tot !

    assert len(ID)==len(Y) and len(ID)==len(AGG) and len(ID)==len(SENS) and len(ID)==len(FEAT), \
           f"{ID.shape}, {Y.shape}, {AGG.shape}, {SENS.shape} {FEAT.shape} from df={df.shape} agg_df={agg_df.shape} (T={T},F={F})"

    # the order of this list is the storage layout: ID, SENS, FEAT, AGG, Y
    fresh = [torch.tensor(ID,   dtype=torch.long   ),
             torch.tensor(SENS, dtype=torch.long   ),
             torch.tensor(FEAT, dtype=torch.float32),
             torch.tensor(AGG,  dtype=torch.float32),
             torch.tensor(Y,    dtype=torch.float32) ]

    if T not in data:
        data[T] = fresh
        return

    stored = data[T]
    for k, part in enumerate(fresh):
        stored[k] = torch.cat((stored[k], part), dim=0)
                       
#-------------------------------------------------------------------------------

def create_dataset(batch_ids, sensors_df, verbose):
    """ 
    Build the dataset tensors from the given source batches.

    batch_ids  : ids of the batches to read (files PATH / "batch_<id>.parquet")
    sensors_df : sensor table returned by get_sensors()
    verbose    : show the statistics of the first batch with show_stats()

    Uses the global settings T_MAX, LINES_FILTER, LINES_MIN, LINES_MAX, IS_AGG,
    DROP_AUX and DOMS_AGG.

    Returns (data, events_df, cols_df, cols_agg_df):
        data        : dict {number of pulses T: [ID, SENS, FEAT, AGG, Y]}, see append_dict()
        events_df   : empty frame with an 'event_id' column (never filled here;
                      kept so the return signature does not change)
        cols_df     : columns of the last pulse frame   (None if there were no batches)
        cols_agg_df : columns of the last event frame   (None if there were no batches)
    """
    files, batch_ids = get_files(batch_ids)
    data, events_df  = {}, pd.DataFrame({'event_id': []})
    cols_df = cols_agg_df = None                 # stay defined when batch_ids is empty
    # tqdm wraps the zip (with an explicit total) so that the progress bar knows its length
    for i, (batch_id, fname) in enumerate(tqdm(zip(batch_ids, files), total=len(files))):
        info(f"******  batch_id: {batch_id:3d}")
        df = pd.read_parquet(fname)            

        df = prepare_batch(df)
        df = cut_pulses(df, max_pulses=T_MAX)        

        df = df.merge(sensors_df, left_on="sensor_id", right_on="sensor_id", how="left")
        df = df[['event_id', 'line_id', 'sensor_id', 'core', 'aux', 'q', 't', 'x', 'y', 'z']]
        info(f"merged batch with sensors {df.shape}")    

        target_df = get_target_angles(batch_id=batch_id)
        target_df = angles2vector(target_df).drop(columns=['azimuth','zenith'])
        info("loaded target angles")

        if LINES_FILTER:
            df, agg = lines_filter(df, lines_min = LINES_MIN, lines_max = LINES_MAX)  
            target_df = target_df[target_df.event_id.isin(agg.event_id)]    
            del agg        
            info(f"lines filter done: {df.shape}")    

        if IS_AGG:
            agg_df = get_event_features(df, target_df, suf="", aux=False)
            if not DROP_AUX:            
                if DOMS_AGG:  # after the per-sensor aggregation some sensors have a non-integer aux (features are multiplied by 1-aux)!
                    agg2_df = get_event_features(df, target_df, suf="_aux", aux=True)            
                else:         
                    agg2_df = get_event_features(df[ ~df.aux ].copy(), target_df, suf="_aux", aux=False)            
                agg_df = agg_df.merge(agg2_df, left_on='event_id',  right_on='event_id_aux', how='left')
                agg_df = agg_df.drop(columns = ['event_id_aux','tot_aux'] )
                # events without any non-aux pulse have no row in agg2_df: the left merge
                # leaves NaN there, replace it by 0 like get_event_features() does
                agg_df = agg_df.fillna(0.0)

            agg_df = target_df.merge(agg_df, left_on="event_id", right_on="event_id", how="left")        
            info('get_event_features done')
        else:
            # without aggregated features only the pulse count of every event is needed
            counts = df.groupby('event_id').size().rename('tot').reset_index()
            agg_df = target_df.merge(counts, on='event_id', how='left')

        df = get_pulse_features(df)                
        df = df[['event_id', 'sensor_id', 'aux', 'q', 't']]  #   'core', 'x', 'y', 'z'

        info('get_pulse_features done')

        # every pulse gets the total number of pulses of its event (the grouping key)
        df = df.merge(agg_df[['event_id', 'tot']], left_on='event_id', right_on='event_id', how='left')
        if verbose and i == 0: show_stats(df, agg_df)

        tots = df.tot.unique()
        info(f"count pulses:  {tots.mean():.0f} [{tots.min()} ... {tots.max()}]")                                    
        for n in tqdm(tots): 
            # first pulse will be last (for RNN)
            d1 = df    [df.    tot == n].sort_values(['event_id','t'], ascending=[True,False])           
            d2 = agg_df[agg_df.tot == n].sort_values(['event_id'])
            append_dict(data, n, d1, d2)
        cols_df, cols_agg_df = df.columns, agg_df.columns
        del df, agg_df
    info("collected data for dataset")                
        
    return data, events_df.reset_index(drop=True), cols_df, cols_agg_df

#===============================================================================
#                                Diagnostic
#===============================================================================

def show_stats(df, agg_df):
    """ 
    Display information about the pulse frame df and the event frame agg_df:
    the first rows, summary statistics and the column types.

    Note: the pandas float display format is changed globally to two decimals.
    """
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    display(df.head(5))
    display(df.describe(percentiles=[]).transpose())    
    df.info()          # info() prints by itself and returns None, so it must not be wrapped in display()
    display(agg_df.head(2))
    display(agg_df.describe(percentiles=[]).transpose())                                        
    agg_df.info()

#-------------------------------------------------------------------------------

def plot_metric(err, prefix="", bins = 200):    
    """ 
    Build a histogram of errors; calculate the statistics and the share w of 'bad examples'.

    err    : angular errors in radians, expected in [0, pi]
    prefix : text placed at the beginning of the plot title
    bins   : number of histogram bins on [0, pi]

    w is twice the probability mass found in the upper half of the bins.
    Two curves are drawn over the histogram: w*0.5*sin(x) (dark red) and the
    histogram density minus that curve (dark blue).
    """
    plt.figure(figsize=(6,4), facecolor ='w') 
    plt.axes().set_facecolor("ivory"); plt.autoscale(tight=True)
    p, edges, _ = plt.hist(err, bins=bins, range=(0,np.pi), fc="lightblue", density=True, alpha=0.5)
    w = 2*p[len(p)//2: ].sum()*np.pi/bins    
    # p[i] is the density of bin i, so the curves are evaluated at the bin centers
    x = 0.5*(edges[:-1] + edges[1:])
    plt.plot(x, w * 0.5*np.sin(x),   c="darkred")
    plt.plot(x, p-w * 0.5*np.sin(x), c="darkblue")
    plt.title(f"{prefix}mean={np.mean(err):.3f}, median={np.median(err):.3f}, w={w:.3f}")    
    plt.ylabel("Density"); plt.xlabel(r"$\Delta \Psi$ (rad)"); plt.grid()
    plt.show()


  0 train files


## Prepare Data and save to disk

In [None]:
%%time
info.beg = info.last = time.time()
info("begin")

if CREATE_DATASET:
    sensors_df = get_sensors()    
    info(f"loaded sensors pos: tot={len(sensors_df)}")
    display(sensors_df.head(3))
    doms = torch.tensor(sensors_df[['x','y','z','core','a','r']].astype(np.float32).to_numpy())
    torch.save( { 'cols': ['x','y','z','core','a','r'], 'data': doms }, f"{DATASET_FOLDER}/doms.pt")
    
    for i,batch_id in tqdm(enumerate(range(FIRST_BATCH_ID, FIRST_BATCH_ID+NUM_BATCHES,  BATCHES_IN_PACK)), total=NUM_BATCHES//BATCHES_IN_PACK):
        pack_id = batch_id // BATCHES_IN_PACK + 1
        data, _, cols_df, cols_agg_df = create_dataset(range(batch_id, batch_id + BATCHES_IN_PACK), sensors_df, i==0)
        torch.save({'cols_df':      cols_df, 
                    'cols_agg_df':  cols_agg_df, 
                    'data': data },   f"{DATASET_FOLDER}/pack_{pack_id:02d}.pt")        
        del data; gc.collect()
        info(f"created pack {pack_id:2d}")   

  0.0m[ +0.3s]  2.254Gb > begin
  0.0m[ +0.2s]  2.254Gb > loaded sensors pos: tot=5160


Unnamed: 0,sensor_id,line_id,core,x,y,z,a,r
0,0,1,0.0,-0.26,-0.52,0.5,0.57,0.58
1,1,1,0.0,-0.26,-0.52,0.48,0.8,0.58
2,2,1,0.0,-0.26,-0.52,0.46,1.1,0.58


  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0.0m[ +0.3s]  2.254Gb > ******  batch_id:   1
  0.1m[ +4.6s]  4.028Gb > load_batch: loaded  (32792416, 5)
  0.2m[ +4.1s]  4.180Gb > load_batch: shift_times
  0.7m[+30.0s]  4.342Gb > cut_pulses (max=512): removed 47.08%
  0.8m[ +5.9s]  3.095Gb > merged batch with sensors (17353867, 10)
  0.8m[ +0.3s]  3.118Gb > loaded target angles
  1.2m[+27.3s]  3.453Gb > get_event_features done
  1.3m[ +3.9s]  2.868Gb > get_pulse_features done


Unnamed: 0,event_id,sensor_id,aux,q,t,tot
0,24,3918,1.0,0.84,0.0,61
1,24,4157,1.0,0.78,0.06,61
2,24,3520,1.0,0.65,0.17,61
3,24,5041,1.0,0.2,0.22,61
4,24,2948,1.0,0.95,0.64,61


Unnamed: 0,count,mean,std,min,50%,max
event_id,17353867.0,1632018.51,942951.34,24.0,1629936.0,3266196.0
sensor_id,17353867.0,2643.34,1522.97,0.0,2662.0,5159.0
aux,17353867.0,0.51,0.5,0.0,1.0,1.0
q,17353867.0,0.72,0.48,0.02,0.65,7.38
t,17353867.0,1.67,0.91,0.0,1.44,11.02
tot,17353867.0,161.83,152.33,15.0,91.0,512.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 17353867 entries, 0 to 17353866
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   event_id   int64  
 1   sensor_id  int16  
 2   aux        float32
 3   q          float32
 4   t          float32
 5   tot        int64  
dtypes: float32(3), int16(1), int64(2)
memory usage: 628.9 MB


None

Unnamed: 0,event_id,nx,ny,nz,tot,t_med,t,x,y,z,...,q_min_aux,q_max_aux,q_med_aux,aux_aux,core_aux,lines_aux,doms_aux,p_lines_aux,p_doms_aux,pulses_aux
0,24,0.27,-0.83,-0.49,61,2.0,2.05,0.01,-0.03,0.03,...,0.16,0.86,0.55,0.0,0.08,0.05,0.1,0.64,0.16,0.11
1,41,0.91,0.41,0.02,51,1.47,1.67,-0.09,0.09,0.08,...,0.28,0.86,0.65,0.0,0.0,0.05,0.1,0.64,0.11,0.11


Unnamed: 0,count,mean,std,min,50%,max
event_id,200000.0,1630014.72,942630.1,24.0,1628979.0,3266196.0
nx,200000.0,-0.0,0.57,-1.0,0.0,1.0
ny,200000.0,0.0,0.58,-1.0,0.0,1.0
nz,200000.0,0.03,0.58,-1.0,0.05,1.0
tot,200000.0,86.77,80.7,15.0,63.0,512.0
t_med,200000.0,1.47,0.37,0.32,1.4,6.84
t,200000.0,1.6,0.33,0.5,1.54,6.16
x,200000.0,0.0,0.14,-0.57,0.0,0.58
y,200000.0,-0.0,0.13,-0.52,-0.0,0.51
z,200000.0,0.0,0.14,-0.49,0.02,0.49


<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Data columns (total 53 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   event_id     200000 non-null  int64  
 1   nx           200000 non-null  float32
 2   ny           200000 non-null  float32
 3   nz           200000 non-null  float32
 4   tot          200000 non-null  int64  
 5   t_med        200000 non-null  float32
 6   t            200000 non-null  float32
 7   x            200000 non-null  float32
 8   y            200000 non-null  float32
 9   z            200000 non-null  float32
 10  stdT         200000 non-null  float32
 11  stdX         200000 non-null  float32
 12  stdY         200000 non-null  float32
 13  stdZ         200000 non-null  float32
 14  xt           200000 non-null  float32
 15  yt           200000 non-null  float32
 16  zt           200000 non-null  float32
 17  tt           200000 non-null  float32
 18  q            200000 non-

None

  1.4m[ +7.7s]  3.045Gb > count pulses:  264 [15 ... 512]


  0%|          | 0/498 [00:00<?, ?it/s]

  1.9m[+26.6s]  2.585Gb > ******  batch_id:   2
  1.9m[ +4.3s]  4.299Gb > load_batch: loaded  (31948000, 5)
  2.0m[ +5.5s]  4.448Gb > load_batch: shift_times
  2.5m[+30.8s]  3.828Gb > cut_pulses (max=512): removed 45.57%
  2.6m[ +6.3s]  3.415Gb > merged batch with sensors (17389087, 10)
  2.6m[ +0.3s]  3.437Gb > loaded target angles
  3.1m[+28.1s]  3.838Gb > get_event_features done
  3.2m[ +5.1s]  3.221Gb > get_pulse_features done
  3.2m[ +3.2s]  3.385Gb > count pulses:  263 [14 ... 512]


  0%|          | 0/498 [00:00<?, ?it/s]

  3.8m[+32.2s]  2.989Gb > ******  batch_id:   3
  3.9m[ +7.0s]  4.689Gb > load_batch: loaded  (31697016, 5)
  4.0m[ +5.6s]  4.834Gb > load_batch: shift_times
  4.5m[+31.4s]  4.226Gb > cut_pulses (max=512): removed 45.11%
  4.6m[ +5.6s]  3.820Gb > merged batch with sensors (17398050, 10)
  4.6m[ +0.3s]  3.842Gb > loaded target angles
  5.0m[+25.5s]  4.268Gb > get_event_features done
  5.1m[ +4.3s]  4.494Gb > get_pulse_features done
  5.2m[ +3.2s]  3.815Gb > count pulses:  263 [14 ... 512]


  0%|          | 0/499 [00:00<?, ?it/s]

  5.6m[+27.5s]  3.405Gb > ******  batch_id:   4
  5.7m[ +4.9s]  5.185Gb > load_batch: loaded  (33187067, 5)
  5.8m[ +4.3s]  5.341Gb > load_batch: shift_times
  6.3m[+32.7s]  4.680Gb > cut_pulses (max=512): removed 47.48%
  6.4m[ +6.0s]  4.234Gb > merged batch with sensors (17430926, 10)
  6.4m[ +0.2s]  4.255Gb > loaded target angles
  6.9m[+27.3s]  4.654Gb > get_event_features done
  7.0m[ +4.5s]  4.036Gb > get_pulse_features done
  7.0m[ +3.8s]  4.202Gb > count pulses:  264 [17 ... 512]


  0%|          | 0/496 [00:00<?, ?it/s]

  7.4m[+25.7s]  3.849Gb > ******  batch_id:   5
  7.5m[ +4.9s]  5.648Gb > load_batch: loaded  (32075852, 5)
  7.6m[ +4.4s]  5.696Gb > load_batch: shift_times
  8.1m[+30.2s]  5.076Gb > cut_pulses (max=512): removed 45.66%
  8.2m[ +7.3s]  4.659Gb > merged batch with sensors (17431486, 10)
  8.2m[ +0.2s]  4.679Gb > loaded target angles
  8.7m[+26.3s]  5.059Gb > get_event_features done
  8.7m[ +4.3s]  4.439Gb > get_pulse_features done
  8.8m[ +3.0s]  4.603Gb > count pulses:  263 [12 ... 512]


  0%|          | 0/499 [00:00<?, ?it/s]

  9.2m[+27.3s]  4.264Gb > collected data for dataset
  9.9m[+42.3s]  4.288Gb > created pack  1


0it [00:00, ?it/s]

 10.0m[ +0.3s]  4.161Gb > ******  batch_id:   6
 10.1m[ +6.0s]  5.362Gb > load_batch: loaded  (32645356, 5)
 10.2m[ +6.4s]  5.356Gb > load_batch: shift_times
 10.8m[+36.8s]  4.922Gb > cut_pulses (max=512): removed 46.71%
 10.9m[ +6.4s]  5.357Gb > merged batch with sensors (17396749, 10)
 10.9m[ +0.2s]  4.471Gb > loaded target angles
 11.3m[+26.2s]  4.709Gb > get_event_features done
 11.4m[ +3.3s]  4.124Gb > get_pulse_features done
 11.4m[ +2.7s]  4.383Gb > count pulses:  264 [15 ... 512]


  0%|          | 0/498 [00:00<?, ?it/s]

 11.9m[+25.9s]  3.797Gb > ******  batch_id:   7
 11.9m[ +4.2s]  5.349Gb > load_batch: loaded  (32684896, 5)
 12.0m[ +5.4s]  5.353Gb > load_batch: shift_times
 12.5m[+30.6s]  4.920Gb > cut_pulses (max=512): removed 46.57%
 12.6m[ +6.2s]  5.359Gb > merged batch with sensors (17463065, 10)
 12.6m[ +0.3s]  4.471Gb > loaded target angles
 13.1m[+27.4s]  4.708Gb > get_event_features done
 13.1m[ +3.6s]  4.123Gb > get_pulse_features done
 13.2m[ +3.6s]  4.383Gb > count pulses:  263 [12 ... 512]


  0%|          | 0/499 [00:00<?, ?it/s]

 13.6m[+25.5s]  3.794Gb > ******  batch_id:   8
 13.7m[ +5.1s]  5.524Gb > load_batch: loaded  (32151281, 5)
 13.9m[ +8.1s]  5.441Gb > load_batch: shift_times
 14.4m[+34.6s]  5.550Gb > cut_pulses (max=512): removed 45.92%
 14.5m[ +5.6s]  4.442Gb > merged batch with sensors (17387869, 10)
 14.5m[ +0.2s]  4.464Gb > loaded target angles
 15.0m[+25.7s]  4.700Gb > get_event_features done
 15.0m[ +4.0s]  4.116Gb > get_pulse_features done
 15.1m[ +2.6s]  4.398Gb > count pulses:  264 [15 ... 512]


  0%|          | 0/498 [00:00<?, ?it/s]

 15.5m[+27.2s]  3.668Gb > ******  batch_id:   9
 15.6m[ +4.0s]  5.320Gb > load_batch: loaded  (32163798, 5)
 15.7m[ +5.4s]  5.496Gb > load_batch: shift_times
 16.2m[+30.5s]  4.805Gb > cut_pulses (max=512): removed 45.96%
 16.3m[ +6.9s]  4.476Gb > merged batch with sensors (17380195, 10)
 16.3m[ +0.4s]  4.498Gb > loaded target angles
 16.7m[+25.2s]  4.769Gb > get_event_features done
 16.8m[ +5.2s]  5.028Gb > get_pulse_features done
 16.9m[ +2.9s]  4.280Gb > count pulses:  264 [11 ... 512]


  0%|          | 0/497 [00:00<?, ?it/s]

 17.3m[+28.1s]  4.019Gb > ******  batch_id:  10
 17.4m[ +3.8s]  6.096Gb > load_batch: loaded  (33243258, 5)
 17.5m[ +4.7s]  5.912Gb > load_batch: shift_times
 18.0m[+33.0s]  5.151Gb > cut_pulses (max=512): removed 47.68%
 18.1m[ +6.4s]  5.703Gb > merged batch with sensors (17392994, 10)
 18.1m[ +0.3s]  4.819Gb > loaded target angles
 18.6m[+26.8s]  5.092Gb > get_event_features done
 18.6m[ +4.0s]  4.507Gb > get_pulse_features done
 18.7m[ +3.9s]  4.635Gb > count pulses:  263 [14 ... 512]


  0%|          | 0/498 [00:00<?, ?it/s]

 19.2m[+26.3s]  4.322Gb > collected data for dataset
 19.6m[+29.3s]  4.306Gb > created pack  2
CPU times: user 16min 13s, sys: 3min 12s, total: 19min 25s
Wall time: 19min 38s
