In [None]:
# Decompress the gzip-compressed input data before it is loaded.
# `<file path>` is a placeholder: replace it with the path of the .gz file.
!gzip -d <file path>

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the CSV files and the
# saved model under drive/MyDrive/grad can be read and written.
# NOTE(review): the relative `drive/MyDrive/...` paths used below presumably
# rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount("/content/drive")

### Data preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

In [None]:
# Load the access trace. The CSV is read with header=None (no header row), so
# the three columns are named explicitly afterwards.
# NOTE(review): the frame is called trainData but the file read is
# testData.csv — confirm this is the intended input file.
trainData = pd.read_csv('drive/MyDrive/grad/testData.csv', header=None)
trainData.columns = ["timestamp","sector_id", "# of blocks"]
trainData.head()

In [None]:
# Load the clustered table (presumably one row per sector — TODO confirm) and
# replace the header read from the file with the column names below.
# AATI = average access time interval.
trainDataClustered = pd.read_csv("drive/MyDrive/grad/testDataClustered.csv")
trainDataClustered.columns = ["sector_id", "frequency", "AATI", "time_interval_std", "# of blocks", "cluster"]
trainDataClustered.head()

In [None]:
# Number of rows in the clustered table; reused further down as the
# IntegerLookup vocabulary cap and the Embedding input dimension.
trainLabelSize = len(trainDataClustered)
trainLabelSize

In [None]:
trainClusterSize = trainDataClustered.shape[0]

# One-hot (cold, cool, warm) flags for each cluster id:
#   clusters 1 and 3          -> warm
#   cluster 2                 -> cold
#   anything else (cluster 0) -> cool
flagsByCluster = {1: (0, 0, 1), 3: (0, 0, 1), 2: (1, 0, 0)}
fallbackFlags = (0, 1, 0)

# sector_id -> 0/1 flag, one dictionary per temperature class
hashmap_cold = {}
hashmap_cool = {}
hashmap_warm = {}
for sectorKey, clusterId in zip(trainDataClustered["sector_id"], trainDataClustered["cluster"]):
  coldFlag, coolFlag, warmFlag = flagsByCluster.get(clusterId, fallbackFlags)
  hashmap_cold[sectorKey] = coldFlag
  hashmap_cool[sectorKey] = coolFlag
  hashmap_warm[sectorKey] = warmFlag

# Attach the flags to every trace row. A sector that is missing from the
# clustered table gets NaN, which the assertions in the following cell detect.
sectorColumn = trainData["sector_id"]
trainData["cold"] = [hashmap_cold.get(sectorId, np.nan) for sectorId in sectorColumn]
trainData["cool"] = [hashmap_cool.get(sectorId, np.nan) for sectorId in sectorColumn]
trainData["warm"] = [hashmap_warm.get(sectorId, np.nan) for sectorId in sectorColumn]

In [None]:
# Every trace row must have received a label, i.e. none of the three flag
# columns may contain NaN.
for labelColumn in ("cold", "cool", "warm"):
  assert not trainData[labelColumn].isna().any()

New feature: time elapsed since the previous access to the same sector (`time_delta`)

In [None]:
trainDataSize = len(trainData)
print(trainDataSize)

# time_delta = timestamp of this access minus the timestamp of the previous
# access to the same sector; 0 for the first time a sector is seen.
lastSectorEncounter = {}   # sector_id -> timestamp of its most recent access
timeDelta = []

for sectorId, timestamp in zip(trainData["sector_id"], trainData["timestamp"]):
  previousTimestamp = lastSectorEncounter.get(sectorId)
  timeDelta.append(0 if previousTimestamp is None else timestamp - previousTimestamp)
  lastSectorEncounter[sectorId] = timestamp

trainData["time_delta"] = timeDelta


In [None]:
# Standardise time_delta with statistics fitted on the training portion only.
# Computing the mean/std over the whole trace would let validation and test
# rows influence the feature scaling (data leakage).
trainFraction = 0.6  # must stay equal to the train share used for `split` below
trainRows = int(trainData.shape[0] * trainFraction)

# Start from the raw deltas (the `timeDelta` list built in the previous cell)
# rather than from trainData["time_delta"], so that re-running this cell does
# not normalise already-normalised values.
rawTimeDelta = pd.Series(timeDelta, index=trainData.index, dtype="float64")

trainDeltaMean = rawTimeDelta.iloc[:trainRows].mean()
trainDeltaStd = rawTimeDelta.iloc[:trainRows].std()

trainData["time_delta"] = (rawTimeDelta - trainDeltaMean) / trainDeltaStd

In [None]:
# Report the mean and standard deviation that were used to standardise
# time_delta (the printed labels say "trainData", the values are time_delta stats).
print("trainDataMean: ",trainDeltaMean)
print("trainDataStd: ",trainDeltaStd)

# Total number of trace rows, shown as the cell output.
trainData.shape[0]

In [None]:
# Row counts for a sequential (unshuffled) split of the trace:
#   train data - 60%, test data - 20%, validation data - remaining rows
totalRows = len(trainData)
split = int(totalRows * 0.6)
testSplit = int(totalRows * 0.2)

In [None]:
# Model inputs (sector_id, time_delta) followed by the three one-hot label
# columns, cut into consecutive train / test / validation row ranges.
windowColumns = ["sector_id","time_delta", "warm", "cool", "cold"]
windowSource = trainData[windowColumns]

windowTrain = windowSource.iloc[:split].reset_index(drop=True)
windowTest = windowSource.iloc[split:split + testSplit].reset_index(drop=True)
windowVal = windowSource.iloc[split + testSplit:].reset_index(drop=True)

In [None]:
windowTrainSize = windowTrain.shape[0]

# Share of training rows belonging to each class (a fraction in [0, 1], despite
# the "Percentage" in the names). The label columns hold 0/1 flags, so their
# mean is exactly count(1) / number of rows. Unlike value_counts()[1], this
# does not raise KeyError when a class is absent from the training split.
warmPercentage = windowTrain["warm"].mean()
coolPercentage = windowTrain["cool"].mean()
coldPercentage = windowTrain["cold"].mean()

print(coldPercentage)
print(coolPercentage)
print(warmPercentage)

In [None]:
# Per-class loss weights handed to model.fit. Keys are class indices in the
# order of the window's label columns: 0 = cold, 1 = cool, 2 = warm.
# NOTE(review): the values look hand-tuned against the class shares printed
# above — confirm how 60 / 15 / 1 were chosen.
class_weight = {0: 60, 1: 15, 2: 1}

### Save split data


In [None]:
# Same consecutive row ranges as the window frames, but keeping every column
# of trainData so the splits can be written to disk.
trainRange = slice(None, split)
testRange = slice(split, split + testSplit)
validateRange = slice(split + testSplit, None)

pdTrain = trainData.iloc[trainRange].reset_index(drop=True)
pdTest = trainData.iloc[testRange].reset_index(drop=True)
pdValidate = trainData.iloc[validateRange].reset_index(drop=True)

In [None]:
# Write each split to its own CSV on Google Drive (row index omitted).
splitExports = {
    "drive/MyDrive/grad/Data/trainData.csv": pdTrain,
    "drive/MyDrive/grad/Data/testData.csv": pdTest,
    "drive/MyDrive/grad/Data/valData.csv": pdValidate,
}
for csvPath, splitFrame in splitExports.items():
  splitFrame.to_csv(csvPath, index=False)

### BATCH_SIZE WINDOW_SIZE

In [None]:
# Number of windows per batch in the datasets built by WindowGenerator.make_dataset.
BATCH_SIZE = 256
# Number of consecutive trace rows in one input window; used as the
# WindowGenerator input width and as the model's number of time steps.
WINDOW_SIZE = 128

In [None]:
class WindowGenerator():
  """Turns the train/validation/test frames into batched (inputs, labels) windows.

  A window is `input_width + shift` consecutive rows. The first `input_width`
  rows form the inputs and the last `label_width` rows form the labels.

  Args:
    input_width: number of rows in the input part of each window.
    label_width: number of trailing rows of each window used as labels.
    shift: number of rows added after the inputs to get the total window size.
    train_df, val_df, test_df: frames to window. When left as None, the
      notebook globals windowTrain / windowVal / windowTest are looked up when
      the generator is constructed (not when this class cell is executed), so
      re-running the split cells is picked up by newly created generators.
    label_columns: names of the columns returned as labels (all columns if None).
    input_columns: names of the columns returned as inputs (all columns if None).
    batch_size: windows per batch. When None, the notebook global BATCH_SIZE is
      read each time a dataset is built.
  """

  def __init__(self, input_width, label_width, shift,
               train_df=None, val_df=None, test_df=None,
               label_columns=None, input_columns=None, batch_size=None):
    # Store the raw data, resolving omitted frames against the notebook globals
    # at call time instead of freezing them into the signature.
    self.train_df = windowTrain if train_df is None else train_df
    self.val_df = windowVal if val_df is None else val_df
    self.test_df = windowTest if test_df is None else test_df
    self.batch_size = batch_size

    # Work out the label and input column indices.
    self.label_columns = label_columns
    self.input_columns = input_columns

    if label_columns is not None:
      self.label_columns_indices = {name: i for i, name in
                                    enumerate(label_columns)}
    if input_columns is not None:
      self.input_columns_indices = {name: i for i, name in
                                    enumerate(input_columns)}

    # Position of every column in the training frame; split_window uses it to
    # pick columns out of the windowed tensor by name.
    self.column_indices = {name: i for i, name in
                           enumerate(self.train_df.columns)}

    # Work out the window parameters.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift

    self.total_window_size = input_width + shift

    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    # Labels are the last `label_width` positions of the window.
    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

  def __repr__(self):
    """Multi-line summary of the window geometry and the label columns."""
    return '\n'.join([
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',
        f'Label indices: {self.label_indices}',
        f'Label column name(s): {self.label_columns}'])

  def split_window(self, features):
    """Splits a batch of windows into an (inputs, labels) pair.

    `features` is indexed as [batch, position in window, column]. Inputs and
    labels are cut along the position axis and then, if column names were
    given, restricted to those columns in the order the names were listed.
    """
    inputs = features[:, self.input_slice, :]
    labels = features[:, self.labels_slice, :]
    if self.label_columns is not None:
      labels = tf.stack(
          [labels[:, :, self.column_indices[name]] for name in self.label_columns],
          axis=-1)

    if self.input_columns is not None:
      inputs = tf.stack(
          [inputs[:, :, self.column_indices[name]] for name in self.input_columns],
          axis=-1)

    # Slicing doesn't preserve static shape information, so set the shapes
    # manually. This way the `tf.data.Datasets` are easier to inspect.
    inputs.set_shape([None, self.input_width, None])
    labels.set_shape([None, self.label_width, None])

    return inputs, labels

  def make_dataset(self, data):
    """Builds an unshuffled, stride-1 dataset of (inputs, labels) batches from `data`."""
    # NOTE(review): every column, including sector_id, is cast to float32, which
    # represents integers exactly only up to 2**24 — confirm that sector ids stay
    # below that, otherwise distinct ids can collapse to the same value.
    data = np.array(data, dtype=np.float32)
    batch_size = BATCH_SIZE if self.batch_size is None else self.batch_size
    ds = tf.keras.utils.timeseries_dataset_from_array(
        data=data,
        targets=None,
        sequence_length=self.total_window_size,
        sequence_stride=1,
        shuffle=False,
        batch_size=batch_size,
    )

    ds = ds.map(self.split_window)

    return ds

  @property
  def train(self):
    """Windowed dataset over the training frame (rebuilt on every access)."""
    return self.make_dataset(self.train_df)

  @property
  def val(self):
    """Windowed dataset over the validation frame (rebuilt on every access)."""
    return self.make_dataset(self.val_df)

  @property
  def test(self):
    """Windowed dataset over the test frame (rebuilt on every access)."""
    return self.make_dataset(self.test_df)

  @property
  def example(self):
    """Get and cache an example batch of `inputs, labels` for plotting. """
    result = getattr(self, '_example', None)
    if result is None:
      # No example batch was found, so get one from the `.train` dataset
      result = next(iter(self.train))
      # And cache it for next time
      self._example = result
    return result


In [None]:
# Windows of WINDOW_SIZE rows with shift 0 and a single label row, so each
# window is labelled with the class flags of its own last row.
window = WindowGenerator(
    input_width=WINDOW_SIZE,
    label_width=1,
    shift=0,
    input_columns=["sector_id", "time_delta"],
    label_columns=["cold", "cool", "warm"],
)

### Model Implementation

Normalization on features using tf.Normalization

In [None]:
def datasetConverter(dataset):
  """Reshapes a windowed dataset into the structure the two-input model consumes.

  Each (features, labels) element becomes ((first feature column, second
  feature column), labels), where both feature slices keep a trailing axis of
  size 1 and the labels are reshaped to [batch, 3].
  """
  def _to_model_io(features, labels):
    firstFeature = features[:,:,0:1]
    secondFeature = features[:,:,1:2]
    batchDim = tf.shape(labels)[0]
    flatLabels = tf.reshape(labels, [batchDim, 3])
    return (firstFeature, secondFeature), flatLabels

  return dataset.map(_to_model_io)

In [None]:
# Two-input classifier: the sector-id sequence is mapped to integer indices,
# embedded and passed through two stacked LSTMs; the result is concatenated
# with the second input sequence and classified into 3 classes.
# input_shape=(number of time steps, number of features)
n_steps = WINDOW_SIZE
n_features = 2  # not referenced anywhere else in the visible notebook

# embedInput receives the sector_id sequence. blocksInput, despite its name,
# receives the time_delta sequence: datasetConverter yields the window's input
# columns in the order (sector_id, time_delta).
# NOTE(review): the datasets deliver tensors with a trailing axis of size 1
# ([batch, n_steps, 1]) while the inputs are declared as (n_steps,) — confirm
# that Keras squeezes the extra axis for this TensorFlow version.
embedInput = tf.keras.layers.Input(shape=(n_steps,))
blocksInput = tf.keras.layers.Input(shape=(n_steps,))

# Vocabulary of sector ids is learned from the training split only; max_tokens
# caps the vocabulary size at the number of rows of the clustered table.
integerLookupLayer = tf.keras.layers.IntegerLookup(max_tokens=trainLabelSize)
integerLookupLayer.adapt(windowTrain["sector_id"])
integerMap = integerLookupLayer(embedInput)
emb = tf.keras.layers.Embedding(input_dim=trainLabelSize, output_dim=64, input_length=n_steps)(integerMap)

# The same L2 regularizer object is applied to kernel, recurrent and bias
# weights of both LSTM layers.
reg = tf.keras.regularizers.l2(0.01)

# First LSTM returns the full sequence so the second LSTM can consume it.
lstm = tf.keras.layers.LSTM(64, kernel_regularizer=reg, 
                            recurrent_regularizer=reg, 
                            bias_regularizer=reg, return_sequences=True)(emb)
                            
# Second LSTM has no return_sequences argument, so it uses the layer default.
lstm = tf.keras.layers.LSTM(64, kernel_regularizer=reg, 
                            recurrent_regularizer=reg, 
                            bias_regularizer=reg)(lstm)

dropout = tf.keras.layers.Dropout(0.2)(lstm)

# NOTE(review): the second LSTM output is presumably already [batch, 64], which
# would make this Flatten a no-op — confirm.
flatten = tf.keras.layers.Flatten()(dropout)
conc = tf.keras.layers.Concatenate()([flatten, blocksInput])

# Softmax over the 3 classes, in label-column order: cold, cool, warm.
out = tf.keras.layers.Dense(3, activation='softmax')(conc)
model = tf.keras.Model(inputs=(embedInput,blocksInput), outputs=out)

model.summary()

In [None]:
MAX_EPOCHS = 20
def compile_and_fit(model, window, patience=2, epochs=MAX_EPOCHS, monitor='val_loss'):
  """Compiles `model` and trains it on the windows produced by `window`.

  Args:
    model: Keras model taking the (sector_id, time_delta) input pair produced
      by datasetConverter and emitting 3 class probabilities.
    window: WindowGenerator providing the `.train` and `.val` datasets.
    patience: number of epochs without improvement of `monitor` before
      training stops early.
    epochs: maximum number of training epochs.
    monitor: quantity watched by early stopping. Defaults to the validation
      loss, since validation data is supplied to `fit`; pass 'loss' to stop on
      the (class-weighted) training loss instead.

  Returns:
    The History object returned by `model.fit`.
  """
  # Stop on the validation loss: stopping on the training loss cannot detect
  # overfitting because the training loss keeps decreasing while the model
  # memorises the training windows.
  early_stopping = tf.keras.callbacks.EarlyStopping(monitor=monitor, patience=patience, mode='min')

  model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])

  trainDataset = datasetConverter(window.train)
  validationDataset = datasetConverter(window.val)

  # class_weight is the notebook-level dictionary of per-class loss weights.
  history = model.fit(trainDataset, epochs=epochs,
                      callbacks=[early_stopping], validation_data=validationDataset, class_weight=class_weight)

  return history
  

In [None]:
# Train the model with the default patience and epoch limit; `history` is the
# Keras History object returned by model.fit.
history = compile_and_fit(model, window)

### Save Model

In [None]:
# Save the trained model to Google Drive, requesting the TensorFlow SavedModel
# format through save_format='tf'.
model.save("drive/MyDrive/grad/stacked_lstm_weight6015", save_format='tf')

In [None]:
# Show the serving_default signature of the exported model (its input and
# output tensors); used for simulating.
!saved_model_cli show --dir drive/MyDrive/grad/stacked_lstm_weight6015 --tag_set serve --signature_def serving_default 

### Load model and evaluate


In [None]:
# Reload the model that was saved to Google Drive above.
LoadedModel = tf.keras.models.load_model('drive/MyDrive/grad/stacked_lstm_weight6015/')

In [None]:
# Evaluate the reloaded model on the test windows.
# Note: `results` is assigned again in the Evaluation section from the in-memory model.
results = LoadedModel.evaluate(datasetConverter(window.test))

In [None]:
# Class probabilities of the reloaded model for every test window.
# Note: `predictions` is assigned again in the Evaluation section from the in-memory model.
predictions = LoadedModel.predict(datasetConverter(window.test))

### Evaluation

In [None]:
# Evaluate the in-memory model on the test windows; with the compile settings
# used in compile_and_fit this reports the loss and the accuracy metric.
results = model.evaluate(datasetConverter(window.test))
print(results)

In [None]:
# One row of 3 softmax probabilities per test window; consumed by the manual
# accuracy computation below.
predictions = model.predict(datasetConverter(window.test))

In [None]:
# Predicted class index per window; the order follows the label columns:
# 0 = cold, 1 = cool, 2 = warm.
pSize = predictions.shape[0]
y_hat = np.argmax(predictions, axis = 1)

# Window i spans test rows i .. i + WINDOW_SIZE - 1 and is labelled with its
# last row, so prediction i belongs to test row i + WINDOW_SIZE - 1. The first
# WINDOW_SIZE - 1 rows never receive a prediction and must not be counted in
# the per-class totals.
labelOffset = WINDOW_SIZE - 1
alignedLabels = windowTest[["cold", "cool", "warm"]].iloc[labelOffset : labelOffset + pSize].to_numpy()
assert alignedLabels.shape[0] == pSize, "number of predictions does not match the number of labelled test rows"

# Labels are one-hot, so argmax recovers the true class index.
y_true = alignedLabels.argmax(axis=1)
isHit = y_hat == y_true

correct = int(isHit.sum())
classHits = np.bincount(y_true[isHit], minlength=3)   # correct predictions per true class
classTotals = np.bincount(y_true, minlength=3)        # predicted-on rows per true class
cold, cool, warm = (int(hitCount) for hitCount in classHits)

# Overall accuracy followed by per-class recall; NaN when there is nothing to
# divide by (no predictions, or a class that does not occur in the test rows).
print("Accuracy: ", correct / pSize if pSize else float("nan"))
for className, hitCount, totalCount in zip(("Cold", "Cool", "Warm"), classHits, classTotals):
  print(f"{className}: ", hitCount / totalCount if totalCount else float("nan"))

Plotting

In [None]:
# Distribution of the predicted classes. y_hat holds class indices (0 = cold,
# 1 = cool, 2 = warm), not probabilities, so one unit-width bin is centred on
# each class; with density=True the bar heights are then the fraction of
# predictions per class.
classNames = ["cold", "cool", "warm"]
classEdges = np.arange(len(classNames) + 1) - 0.5
n, bins, patches = plt.hist(y_hat, bins=classEdges, density=True, facecolor='g', alpha=0.75)
plt.xticks(range(len(classNames)), classNames)
plt.xlabel('Predicted class')
plt.ylabel('Fraction of predictions')
plt.title('Histogram of model prediction')
plt.xlim(-0.5, len(classNames) - 0.5)
plt.grid(True)
plt.show()