### Defining batch and status

For easier manipulation we need to define the batch and the status of the equipment that was at the moment when the sensor value was recorded. For that we need to use values from _BATCH_ and _STATUS_ sensors to be propagated to all other records.

In [1]:
import pandas as pd
import numpy as np
from os.path import join
from os import listdir

from datetime import timedelta

In [2]:
files = np.sort([c for c in listdir(join('..','data')) if 'target' not in c and 'csv' in c and 'leftovers' not in c])
files

array(['2019-01-01.csv', '2019-01-02.csv', '2019-01-03.csv',
       '2019-01-04.csv', '2019-01-05.csv', '2019-01-06.csv',
       '2019-01-07.csv', '2019-01-08.csv', '2019-01-09.csv',
       '2019-01-10.csv', '2019-01-11.csv', '2019-01-12.csv',
       '2019-01-13.csv', 'features.csv'], dtype='<U14')

In [3]:
def process_file(data_path, leftovers_path=None):
    ### import data
    
    data = pd.read_csv(data_path, parse_dates=['DateTime'])
    if leftovers_path:
        leftovers = pd.read_csv(leftovers_path, parse_dates=['DateTime'])
        data = leftovers.append(data)
    
    ### split tag to equipment and sensor

    data['Equipment'] = data.apply(lambda x: x.TagName.split(".")[0], axis=1)
    data['Sensor'] = data.apply(lambda x: x.TagName.split(".")[1], axis=1)

    ### define batch of each record

    batches = data[data.Sensor=='BATCH'].sort_values(['Equipment','DateTime']).reset_index()
    batches.drop(['Sensor', 'Value', 'TagName'], axis=1, inplace=True)
    batches['index'] = np.arange(len(batches))
    batches.rename(columns={'DateTime': 'PrevDateTime', 'StringValue': 'Batch'}, inplace=True)

    batches_tmp = batches.copy()
    batches_tmp.drop('Batch', axis=1, inplace=True)
    batches_tmp['index'] -= 1
    batches_tmp.rename(columns={'PrevDateTime': 'NextDateTime'}, inplace=True)

    batches = batches.merge(batches_tmp, on=['index', 'Equipment']).drop('index', axis=1)
    data_with_batch = data.merge(batches) ### ???
    data_with_batch = data_with_batch[(data_with_batch.DateTime >= data_with_batch.PrevDateTime)&
                                      (data_with_batch.DateTime < data_with_batch.NextDateTime)]
    data_with_batch.drop(['PrevDateTime', 'NextDateTime'], axis=1, inplace=True)
    del batches, batches_tmp

    ### define status of each record

    statuses = data[data.Sensor=='STATUS'].sort_values(['Equipment','DateTime']).reset_index()
    statuses.drop(['Sensor', 'Value', 'TagName'], axis=1, inplace=True)
    statuses['index'] = np.arange(len(statuses))
    statuses.rename(columns={'DateTime': 'PrevDateTime', 'StringValue': 'Status'}, inplace=True)

    statuses_tmp = statuses.copy()
    statuses_tmp.drop('Status', axis=1, inplace=True)
    statuses_tmp['index'] -= 1
    statuses_tmp.rename(columns={'PrevDateTime': 'NextDateTime'}, inplace=True)

    statuses = statuses.merge(statuses_tmp, on=['index', 'Equipment']).drop('index', axis=1)
    data_with_status = data_with_batch.merge(statuses)
    data_with_status = data_with_status[(data_with_status.DateTime >= data_with_status.PrevDateTime)&
                                        (data_with_status.DateTime < data_with_status.NextDateTime)]
    data_with_status.drop(['PrevDateTime', 'NextDateTime'], axis=1, inplace=True)
    del statuses, statuses_tmp

    ### define next time

    sensors = data.sort_values(['Equipment','Sensor','DateTime']).reset_index()
    sensors.drop(['Value', 'StringValue', 'TagName'], axis=1, inplace=True)
    sensors['index'] = np.arange(len(sensors))

    sensors_tmp = sensors.copy()
    sensors_tmp['index'] -= 1
    sensors_tmp.rename(columns={'DateTime': 'NextDateTime'}, inplace=True)

    sensors = sensors.merge(sensors_tmp, on=['index', 'Equipment', 'Sensor']).drop('index', axis=1)
    sensors['Delta'] = sensors.apply(lambda x: (x.NextDateTime-x.DateTime).seconds, axis=1)

    clean_data = data_with_status.merge(sensors)
    clean_data.drop(['NextDateTime'], axis=1, inplace=True)
    del data_with_batch, data_with_status, sensors

    ### produce output

    leftovers = data.merge(clean_data[['TagName','DateTime','Batch']], on=['TagName','DateTime'], how='left')
    leftovers = leftovers[leftovers.Batch.isna()][['TagName', 'DateTime', 'Value', 'StringValue']]
    clean_data = clean_data[['Equipment','Sensor','DateTime','Batch','Status','Value','StringValue','Delta']]

    assert leftovers.shape[0] + clean_data.shape[0] == data.shape[0]
    leftovers.to_csv(join('..', 'data', files[i][:-4] + '_leftovers.csv'), index=False)
    clean_data.to_csv(join('..', 'data', 'processed', files[i]), index=False)
    del clean_data, leftovers

We will process file by file (day by day). Some batches can start on one day and end only in the next day, that's why after processing one day we will have some leftovers to be used when processing the next day:

In [4]:
for i in range(len(files)):
    data_path = join('..','data',files[i])
    if i > 0:
        leftovers_path = join('..','data',files[i-1][:-4]+'_leftovers.csv') 
    else:
        leftovers_path = None
    process_file(data_path, leftovers_path)

ValueError: 'DateTime' is not in list