In [1]:
import pandas as pd
import numpy as np

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [4]:
data = pd.read_csv("data_processed_plant_item_encoded_13_03_22.csv")

In [5]:
def clean_data(series):
    """Fills missing values. 
        Interpolate missing values with a linear approximation.
    """
    series_filled = series.interpolate(method='linear')
    return series_filled
        
    
def scale(X):
    mm = MinMaxScaler()
    X_ = np.atleast_2d(X)
    return pd.DataFrame(mm.fit(X_[:23,:]).transform(X_), X.index)

In [6]:
data = clean_data(data)

In [7]:
data['Date'] = pd.to_datetime(data['Date'])
data.drop(['Unnamed: 0'], axis=1, inplace=True)
for col in ['PlantID', 'ParentItemID']:
    data[col] = data[col].astype(str).astype("category")
data['Volume'] = data['Volume'].astype(float)
data.drop(['month'], axis=1, inplace=True)
data.drop(['index.1'], axis=1, inplace=True)
data["month"] = data.Date.dt.month.astype(str).astype("category")  # categories have be strings

In [8]:
# Split the Validation and Train dataset by time_index
validations = data[data['time_idx'].isin(list(range(6, 35)))]
train = data[data['time_idx'].isin(list(range(0, 29)))] # generates 0 to 28

# These are the columns to minmax scale. Currently, they are ['Volume', 'log_volume', 'log_ret', 'avg_volume_by_material',
      #  'max_volume_by_material', 'min_volume_by_material',
      #  'std_volume_by_material', 'avg_volume_by_plant', 'max_volume_by_plant',
      #  'min_volume_by_plant', 'std_volume_by_plant', 'encoding_1',
      #  'encoding_2', 'encoding_3']

cols = validations.columns[[3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19]]

validations[cols + '_scaled'] = validations.groupby('timeseries')[cols].apply(scale)
validations.drop(cols, axis=1, inplace=True)
validations.drop(['Date'], axis=1, inplace=True)

train[cols + '_scaled'] = train.groupby('timeseries')[cols].apply(scale)
train.drop(cols, axis=1, inplace=True)
train.drop(['Date'], axis=1, inplace=True)

# One Hot Encode everyting that is categorical or Object # train.dtypes
train = pd.get_dummies(train)
validations = pd.get_dummies(validations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

In [9]:
print("Multivarate Datasets")
print(f"Train Data Shape: {train.shape}")
print(f"Val Data Shape: {validations.shape}")
print(f"Nulls In Train {np.any(np.isnan(train))}")
print(f"Nulls In Validation {np.any(np.isnan(validations))}")

Multivarate Datasets
Train Data Shape: (303079, 1123)
Val Data Shape: (303079, 1123)
Nulls In Train False
Nulls In Validation False


In [10]:
# Convert to 3d matrices 
validations_matr = np.array(list(validations.groupby('timeseries').apply(pd.DataFrame.to_numpy)))[:,:,3:]
train_matr = np.array(list(train.groupby('timeseries').apply(pd.DataFrame.to_numpy)))[:,:,3:]

In [11]:
np.save('validations_matr.npy', validations_matr)
np.save('train_matr.npy', train_matr)

In [77]:
validations_matr = np.load('validations_matr.npy')
train_matr = np.load('train_matr.npy')

In [78]:
validations_matr.shape

(10451, 29, 1120)

In [79]:
validations_matr[0, : , 0]

array([0.42937853, 1.        , 0.74011299, 0.59322034, 0.        ,
       0.00564972, 0.        , 0.        , 0.        , 0.        ,
       0.84745763, 0.30508475, 0.45762712, 0.63841808, 0.11864407,
       0.30225989, 0.04519774, 0.01694915, 0.01129944, 0.        ,
       0.        , 0.56497175, 0.92655367, 0.57344633, 0.        ,
       0.        , 0.        , 0.        , 0.        ])

In [87]:
ds = tf.data.Dataset.from_tensor_slices(validations_matr)
# ds = ds.batch(32).prefetch(1)
# print(f'Num. of Batchs: {len(ds)} in {ds}')

In [48]:
ds = ds.window(29, shift=6, drop_remainder=True)

In [88]:
# ds = ds.flat_map(lambda x : x.batch(29))
ds = ds.shuffle(100) 

In [89]:
ds

<ShuffleDataset shapes: (29, 1120), types: tf.float64>

In [90]:
ds = ds.map(lambda x : (x[:23, :], x[-6:, 0]))

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [91]:
ds

<MapDataset shapes: ((23, 1120), (6,)), types: (tf.float64, tf.float64)>

In [83]:
print(f'Num. of Batchs: {len(ds)} in {ds}')

Num. of Batchs: 10451 in <MapDataset shapes: ((23, 1120), (6,)), types: (tf.float64, tf.float64)>


In [92]:
ds = ds.batch(32)
print(f'Num. of Batchs: {len(ds)} in {ds}')

Num. of Batchs: 327 in <BatchDataset shapes: ((None, 23, 1120), (None, 6)), types: (tf.float64, tf.float64)>


In [85]:
ds = ds.shuffle(100)    

In [93]:
print(f'Num. of Batchs: {len(ds)} in {ds}')

Num. of Batchs: 327 in <BatchDataset shapes: ((None, 23, 1120), (None, 6)), types: (tf.float64, tf.float64)>


In [42]:
ds = ds.batch(32).prefetch(1)

In [44]:
print(f'Num. of Batchs: {len(ds)} in {ds}')

TypeError: dataset length is unknown.

In [25]:
dataset = tf.data.Dataset.range(8)

AttributeError: module 'tensorflow._api.v2.data' has no attribute 'validations_matr'

In [22]:
dataset

<RangeDataset shapes: (), types: tf.int64>

In [94]:
tf.data.experimental.save(ds, '')