In [173]:
import xarray as xr
import pandas as pd

# Load the raw plant measurements into a pandas object.
# The context manager closes the underlying file handle as soon as the data
# has been materialised by `to_pandas()`, instead of leaving it open for the
# lifetime of the kernel.
with xr.open_dataset('MiningProcess_Flotation_Plant_Database.h5', engine = 'h5netcdf') as dataset:
    df = dataset.to_pandas()

In [174]:
## Resampling
# Collapse the raw measurements into one row per hour (mean of each column).
df = df.resample('1h').mean()

## Split into input and output
# Target: the silica concentrate column.
Y = df['% Silica Concentrate']
# Inputs: every column except the last two (selected by position).
# NOTE(review): presumably the last two columns are the concentrate
# measurements -- confirm against the dataset's column order.
X = df.iloc[:, :-2]

In [176]:
# Adding lagged variables
import numpy as np  # no longer used in this cell; kept in case later cells rely on `np`

# Every series gets lagged copies for lags 1..9 (in rows, i.e. resampled steps).
lags = range(1, 10)

X_copy = X.copy()
lagged_variables = {}

# `shift(lag)` moves each value `lag` rows down and leaves NaN in the first
# `lag` rows, so no artificial 0.0 values are invented for the warm-up period.
# Those warm-up rows are removed at the end of this cell.

# For the output variable
for lag in lags:
    lagged_variables[f'% Silica Concentrate Lag {lag}'] = Y.shift(lag)

# For other input variables
for variable in X_copy.columns:
    for lag in lags:
        lagged_variables[f'{variable} Lag {lag}'] = X_copy[variable].shift(lag)

X_lagged_variables = pd.DataFrame(lagged_variables, index=X_copy.index)

X = X.join(X_lagged_variables)
# Remove the first measurements that don't have lagged values.
# The largest lag is 9, so dropping 10 rows removes every incomplete row
# (plus one complete one, kept as in the original analysis).
X = X.iloc[10:]

In [177]:
# Only include wanted variables
# When `remove_variables` is on, X is narrowed to the first N names listed in
# the 'Column' column of variables.csv (in the order they appear in the file).
remove_variables = True
N = 20
if remove_variables:
    variables = pd.read_csv('variables.csv')
    X = X.loc[:, variables['Column'].head(N)]


In [178]:
# Split to calibration-validation set and test set
# Both periods are label-based date ranges on the datetime index.
cal_period = slice('20170615', '20170723')
test_period = slice('20170426', '20170506')

X_cal, Y_cal = X.loc[cal_period], Y.loc[cal_period]
X_test, Y_test = X.loc[test_period], Y.loc[test_period]


In [180]:
# Store data
# Write the four splits into one HDF5 store, each under its own key.
with pd.HDFStore('MiningProcess_Flotation_Plant_Database_train_split_variables.h5') as store:
    for key, split in (
        ('X_cal', X_cal),
        ('Y_cal', Y_cal),
        ('X_test', X_test),
        ('Y_test', Y_test),
    ):
        store.put(key, split)

