This notebook builds the data-processing pipeline that feeds the GRU network. It starts from a slimmed and skimmed CSV file, with p2_calib being the value to predict, and int_deliv_inv_ub and calib_time being the best values to use. It's important to note that, at this point, some of the lumi measurements are taken rather far from the calibration times; this can be seen from the difference between the lumi-section time and the calib_time values. The best data probably consists of just the points where these are close.

All data is in its original units.

Also, I'm running this in the Python 3.7.1 kernel.

In [1]:
#import the stuff
import pandas as pd #dataframes etc
import matplotlib.pyplot as plt #plotting
import numpy as np
from sklearn import preprocessing
from common.utils import TimeSeriesTensor, create_evaluation_df, mape, scale_shrinker

In [15]:
# Data-taking year to process; it selects the input CSV and the date ranges used below.
year = '2016'
# Two-digit form of the year, used in output file names. Derived from `year` so the
# two values cannot drift apart when the year is changed.
yr = year[-2:]

In [16]:
# Read the slimmed/skimmed CSV for the configured year.
df = pd.read_csv('../scripts/slimmed_df_LME38_PNA4_'+year+'.csv')
# There are many times when the xtal is not fired upon. For those rows laser_datetime is NaN
# and, importantly, the lumi columns (int_deliv_inv_ub, int_record_inv_ub) give an erroneous
# value, so those rows are dropped.
df_laser_dropped = df.dropna(subset=['laser_datetime'])
# Report the (rows, columns) shape before and after the drop.
for frame in (df, df_laser_dropped):
    print(frame.shape)

(1628400, 37)
(1319200, 37)


In [17]:
# Preview the first rows that remain after dropping entries without a laser_datetime.
df_laser_dropped.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,iov_idx,fill,temperature,good,t1,seq_datetime,start_ts,stop_ts,...,#run:fill,ls,beamstatus,E(GeV),delivered(/ub),recorded(/ub),avgpu,source,int_deliv_inv_ub,int_record_inv_ub
166,54000,166,169.0,0.0,23.0,1.0,1458057000.0,2016-03-15 15:43:59,1458057000.0,1458058000.0,...,271031:4851,60:0,FLAT TOP,6500.0,0.00822,0.0,0.0,PLTZERO,0.00822,0.0
167,54000,167,170.0,0.0,23.0,1.0,1458409000.0,2016-03-19 17:42:44,1458409000.0,1458418000.0,...,271031:4851,60:0,FLAT TOP,6500.0,0.00822,0.0,0.0,PLTZERO,0.00822,0.0
168,54000,168,171.0,0.0,23.0,1.0,1458412000.0,2016-03-19 18:27:08,1458409000.0,1458418000.0,...,271031:4851,60:0,FLAT TOP,6500.0,0.00822,0.0,0.0,PLTZERO,0.00822,0.0
169,54000,169,172.0,0.0,23.0,1.0,1458415000.0,2016-03-19 19:11:31,1458409000.0,1458418000.0,...,271031:4851,60:0,FLAT TOP,6500.0,0.00822,0.0,0.0,PLTZERO,0.00822,0.0
171,54000,171,174.0,4726.0,23.0,1.0,1458592000.0,2016-03-21 20:25:45,1458592000.0,1458593000.0,...,271031:4851,60:0,FLAT TOP,6500.0,0.00822,0.0,0.0,PLTZERO,0.00822,0.0


In [18]:
# The data has multiple possible indices. The most important one for splitting the data is
# the crystal index, which is called 'Unnamed: 0' in the CSV; give it (and 'Unnamed: 1')
# a readable name.
df_laser_dropped = df_laser_dropped.rename(columns={'Unnamed: 0':'xtal_idx', 'Unnamed: 1':'dst_idx'})
# Group the cleaned frame by its own crystal-index column. Grouping the raw `df` with a key
# taken from `df_laser_dropped` only worked through index alignment and silently relied on
# the unmatched (NaN-key) rows being discarded.
xtal_groups = df_laser_dropped.groupby('xtal_idx')
# Distinct crystal indices, in order of first appearance.
xtal_list = df_laser_dropped.xtal_idx.unique()


In [19]:
# Imported once per cell run instead of on every loop iteration.
# NOTE(review): consider moving this into the notebook's imports cell.
import traces as ts

# Maximum number of crystals to process (taken from the front of xtal_list).
max_xtals = 1

# Resampling settings; windows are in seconds.
sampling = 3600
avg_window = sampling*6

# Columns carried through the pipeline: the calibration value and the lumi value.
feature_columns = ['calibration', 'int_deliv_inv_ub']

# Select the data range (the part of the year when the beam was operating).
data_begin = year+'-05-12 08:00:00'
data_end = year+'-12-02 16:00:00'

# End points of the train / valid / test time ranges.
df_train_end = year+'-08-31 00:00:00'
df_valid_end = year+'-10-15 00:00:00'
df_test_end = data_end


def _moving_average_frame(df_split):
    """Convert one split into an evenly spaced time series via a rolling average.

    Each column in ``feature_columns`` is wrapped in a ``traces.TimeSeries`` and
    resampled with ``moving_average`` every ``sampling`` seconds using a window of
    ``avg_window`` seconds.

    Parameters
    ----------
    df_split : pandas.DataFrame
        Frame indexed by laser_datetime that contains the ``feature_columns``.

    Returns
    -------
    pandas.DataFrame
        One column per feature, holding the moving-average result.
    """
    averaged = pd.DataFrame()
    for column in feature_columns:
        averaged[column] = ts.TimeSeries(df_split[column])\
                                .moving_average(sampling, window_size=avg_window, pandas=True)
    return averaged


for xtal in xtal_list[:max_xtals]:
    # Use a dedicated name so the notebook-level raw `df` is not overwritten; overwriting it
    # made re-running the earlier cells operate on a single crystal's rows.
    xtal_df = xtal_groups.get_group(xtal)
    print('Extracting crystal: {}'.format(xtal))

    # Keep only the columns we want to study and index them by the laser time.
    # pd.to_datetime replaces the unit-less astype('datetime64') cast, and .assign avoids
    # writing into a slice of the group frame.
    df_slim = (
        xtal_df.loc[:, ['laser_datetime'] + feature_columns]
        .assign(laser_datetime=lambda frame: pd.to_datetime(frame['laser_datetime']))
        .set_index('laser_datetime')
    )
    # At this point we need to filter out garbage values of the calibration.
    df_filtered = df_slim[df_slim.calibration > 0.5]

    df_select = df_filtered[(df_filtered.index > data_begin) & (df_filtered.index < data_end)]
    # NOTE(review): this path has no per-crystal suffix, so with max_xtals > 1 every crystal
    # overwrites the same file. Left unchanged in case something else reads it.
    df_select.to_csv('../data/select.csv')

    # Split into training, validation and testing ranges. The intervals are half-open
    # [start, end) so a row stamped exactly on a boundary lands in the later split instead
    # of being dropped from all three.
    df_train = df_select[df_select.index < df_train_end]
    df_valid = df_select[(df_select.index >= df_train_end) & (df_select.index < df_valid_end)]
    df_test = df_select[(df_select.index >= df_valid_end) & (df_select.index < df_test_end)]

    # Convert each split into an evenly spaced, rolling-averaged time series.
    train = _moving_average_frame(df_train)
    valid = _moving_average_frame(df_valid)
    test = _moving_average_frame(df_test)

    suffix = '_'+yr+'_{}.csv'.format(xtal)
    train.to_csv('../data/train_avg'+suffix)
    valid.to_csv('../data/valid_avg'+suffix)
    test.to_csv('../data/test_avg'+suffix)

    # Now we need to de-trend the data so that it is flat over time.
    # For this we just apply differencing, since it is pretty simple.
    train = train.diff().dropna() #we run dropna, since the first row is NaN in a diff
    valid = valid.diff().dropna()
    test = test.diff().dropna()

    train.to_csv('../data/train_diff'+suffix)
    valid.to_csv('../data/valid_diff'+suffix)
    test.to_csv('../data/test_diff'+suffix)

Extracting crystal: 54000
