In [12]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler

In [13]:
# Locations of the pickled train/test telemetry frames.
# NOTE(review): pickle files execute arbitrary code on load; only read files
# from a trusted location.
train_file, test_file = (
    Path('/srv/newpenny/dataset/TASI/sentinel/sentinel_4s/train_data.pkl'),
    Path('/srv/newpenny/dataset/TASI/sentinel/sentinel_4s/test_data.pkl'),
)

data_train, data_test = (
    pd.read_pickle(pickle_path.as_posix())
    for pickle_path in (train_file, test_file)
)

In [14]:
# Display the raw test frame (truncated head/tail view) as a sanity check
# on its columns and time range.
data_test

Unnamed: 0,RW1_motcurr,RW1_therm,RW1_cmd_volt,RW1_speed,RW2_motcurr,RW2_therm,RW2_cmd_volt,RW2_speed,RW3_motcurr,RW3_therm,RW3_cmd_volt,RW3_speed,RW4_motcurr,RW4_therm,RW4_cmd_volt,RW4_speed
2019-01-01 00:00:00+00:00,0.143698,19.822944,0.901933,-57.089815,0.154264,26.665519,0.875168,47.012056,0.248599,28.441824,1.497343,-50.782758,0.286130,25.203875,1.723036,77.646470
2019-01-01 00:00:04+00:00,0.133209,19.837770,0.890670,-57.161522,0.164349,26.667529,0.987338,46.958004,0.234747,28.416687,1.398484,-50.785111,0.291597,25.185758,1.715434,77.624381
2019-01-01 00:00:08+00:00,0.119799,19.824966,0.719958,-57.203562,0.168705,26.607411,1.024555,46.947101,0.232905,28.481586,1.378864,-50.756915,0.316466,25.166790,1.867196,77.628379
2019-01-01 00:00:12+00:00,0.132529,19.816382,0.711290,-57.152502,0.170645,26.582770,1.023443,46.955305,0.231101,28.435541,1.383205,-50.705407,0.297068,25.183504,1.877296,77.708077
2019-01-01 00:00:16+00:00,0.152830,19.834569,0.903395,-57.138197,0.159694,26.650811,0.992920,46.971512,0.243128,28.426825,1.426045,-50.658641,0.267810,25.170745,1.678478,77.759308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 23:59:40+00:00,0.106900,19.553857,0.658909,-45.410752,0.152181,27.210290,1.013586,50.988018,0.427172,30.869796,2.493936,-46.724841,0.246901,24.959493,1.397427,91.979048
2019-12-31 23:59:44+00:00,0.141220,19.526584,0.854479,-45.515055,0.180061,27.176727,1.091865,50.955099,0.392090,30.805485,2.343598,-46.811992,0.217272,24.951728,1.287708,91.801477
2019-12-31 23:59:48+00:00,0.127480,19.540385,0.853567,-45.711044,0.171198,27.214938,1.078935,50.964217,0.404629,30.792341,2.350024,-46.825043,0.227121,24.973121,1.293958,91.586049
2019-12-31 23:59:52+00:00,0.105669,19.567721,0.672513,-45.927422,0.152912,27.229394,0.989241,50.976838,0.429403,30.865748,2.513801,-46.842002,0.232635,24.979797,1.401935,91.360252


In [15]:
# Label frames that go with the train/test telemetry loaded above.
# NOTE(review): presumably one boolean column per label type — confirm.
labels_train, labels_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '/srv/newpenny/dataset/TASI/sentinel/sentinel_4s/train_labels.pkl',
        '/srv/newpenny/dataset/TASI/sentinel/sentinel_4s/test_labels.pkl',
    )
)

In [16]:
# Make sure both test indexes are datetime-typed so the .year/.month
# accessors used for filtering are available.
data_test.index, labels_test.index = (
    pd.to_datetime(data_test.index),
    pd.to_datetime(labels_test.index),
)

In [17]:
# Drop every 2019 sample from April onwards: a row is kept when it is not in
# 2019 or when its month is before April.
data_test = data_test[
    (data_test.index.year != 2019) | (data_test.index.month < 4)
]

labels_test = labels_test[
    (labels_test.index.year != 2019) | (labels_test.index.month < 4)
]

In [18]:
def apply_margin_to_labels(labels, margin=None, inplace=False):
    '''
    Widen every run of True labels by a time margin on both sides.

    Each maximal run of consecutive True values is extended so that every
    sample between ``margin`` before the run's first True timestamp and
    ``margin`` after the first False timestamp following the run (or after
    the last timestamp of the series, when the run reaches its end) is set
    to True. Both bounds are inclusive.

    Parameters
    ----------
    labels: pandas.Series,
        boolean labels with an index of pandas.Timestamp. The index is
        expected to be sorted, because margins are applied through
        label-based slicing.
    margin: pandas.Timedelta or any value accepted by it, default None
        margin to apply before and after each period corresponding to True
        labels. When None, ``labels`` itself is returned (no copy is made).
    inplace: bool, default False
        Whether to modify the Series rather than creating a new one.

    Returns
    -------
    pandas.Series
        labels in which margins are applied to True labels periods.
    '''

    if margin is None:
        return labels

    y = labels if inplace else labels.copy()

    # nothing to widen in an empty series, and no edge value to inspect
    if y.empty:
        return y

    margin = pd.Timedelta(margin)

    # +1 marks a False -> True transition and -1 a True -> False transition;
    # the first sample has no predecessor, so its diff is NaN
    flags = y.astype(int)
    diff = flags.diff()
    starts = diff[diff == 1].index
    ends = diff[diff == -1].index

    # a run touching either edge of the series produces no transition there,
    # so the edge timestamp is added explicitly. The edge values are checked
    # directly rather than by comparing starts[0]/ends[0], which raised an
    # IndexError when there was no True label at all or when the only run
    # touched the first or last sample.
    if flags.iloc[0] == 1:
        starts = pd.Index([y.index[0], *starts])
    if flags.iloc[-1] == 1:
        ends = pd.Index([*ends, y.index[-1]])

    # apply margin to timestamps
    starts = starts - margin
    ends = ends + margin

    # set labels to True; starts and ends are paired one-to-one, and .loc
    # makes the (inclusive) label-based slicing explicit
    for start, end in zip(starts, ends):
        y.loc[start:end] = True

    return y

In [19]:
# Safety margin applied before and after every labelled period: 10 minutes.
margin = pd.Timedelta(minutes=10)

In [20]:
# One flag per training timestamp: True when any label column is True.
y = labels_train.any(axis='columns')
print(y)
# Widen every flagged period by the margin on both sides.
y_margin = apply_margin_to_labels(labels=y, margin=margin)

2016-11-01 00:00:04+00:00     True
2016-11-01 00:00:08+00:00    False
2016-11-01 00:00:12+00:00    False
2016-11-01 00:00:16+00:00    False
2016-11-01 00:00:20+00:00    False
                             ...  
2018-12-31 23:59:40+00:00    False
2018-12-31 23:59:44+00:00    False
2018-12-31 23:59:48+00:00    False
2018-12-31 23:59:52+00:00    False
2018-12-31 23:59:56+00:00    False
Length: 17085599, dtype: bool


In [21]:
# Work on a copy of the training data and blank out (NaN) every row that
# falls inside a flagged period or its margin.
data_train_clean = data_train.copy()
data_train_clean.loc[y_margin, :] = np.nan

In [22]:
# One flag per test timestamp: True when any label column is True.
# Note that this rebinds `y` and `y_margin`, previously holding the
# training masks.
y = labels_test.any(axis='columns')
print(y)
# Widen every flagged period by the margin on both sides.
y_margin = apply_margin_to_labels(labels=y, margin=margin)

2019-01-01 00:00:00+00:00    False
2019-01-01 00:00:04+00:00    False
2019-01-01 00:00:08+00:00    False
2019-01-01 00:00:12+00:00    False
2019-01-01 00:00:16+00:00    False
                             ...  
2019-03-31 23:59:40+00:00    False
2019-03-31 23:59:44+00:00    False
2019-03-31 23:59:48+00:00    False
2019-03-31 23:59:52+00:00    False
2019-03-31 23:59:56+00:00    False
Length: 1944000, dtype: bool


In [23]:
# Work on a copy of the test data and blank out (NaN) every row that falls
# inside a flagged period or its margin.
data_test_clean = data_test.copy()
data_test_clean.loc[y_margin, :] = np.nan

In [24]:
# Fit the standardization on the cleaned training data only; the same fitted
# scaler is reused for the test set further down.
# NOTE(review): relies on StandardScaler ignoring NaN rows when fitting, as
# documented by scikit-learn — confirm for the installed version.
scaler = StandardScaler()
scaler.fit(data_train_clean)

In [25]:
# Standardize both cleaned frames with the scaler fitted on the training
# data, re-attaching the original index and column names because
# scaler.transform does not return them here.
data_train_std, data_test_std = (
    pd.DataFrame(
        data=scaler.transform(clean_frame),
        index=clean_frame.index,
        columns=clean_frame.columns,
    )
    for clean_frame in (data_train_clean, data_test_clean)
)

In [26]:
# Per-column count of NaN entries left in the standardized test data
# (the rows blanked out by the margin mask).
data_test_std.isna().sum(axis=0)

RW1_motcurr     14626
RW1_therm       14626
RW1_cmd_volt    14626
RW1_speed       14626
RW2_motcurr     14626
RW2_therm       14626
RW2_cmd_volt    14626
RW2_speed       14626
RW3_motcurr     14626
RW3_therm       14626
RW3_cmd_volt    14626
RW3_speed       14626
RW4_motcurr     14626
RW4_therm       14626
RW4_cmd_volt    14626
RW4_speed       14626
dtype: int64

In [27]:
# Persist the standardized, cleaned test data.
# NOTE(review): the target directory must already exist.
data_test_std.to_pickle(
    path="/srv/newpenny/dataset/TASI/sentinel/sentinel_4s_clean_std/test_data_clean.pkl"
)

In [28]:
# Persist the test labels next to the cleaned test data.
# NOTE(review): these are the date-filtered original labels, not the
# margin-expanded mask used to blank out rows — confirm this is intended.
labels_test.to_pickle(
    path="/srv/newpenny/dataset/TASI/sentinel/sentinel_4s_clean_std/test_labels_clean.pkl"
)