In [1]:
import numpy as np
import pandas as pd

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Application-specific Functions

Train - Val Split

In [5]:
# Accepts a data frame of run-to-failure instances from multiple similar machines
from numpy.random import default_rng

rng = default_rng()
def train_val_split(df,val_percent):
  """Randomly split the units of a run-to-failure data frame into train and validation sets.

  The split is made per unit (all rows sharing a ``unit_id`` go to the same side),
  using the module-level ``rng`` generator.

  Args:
      df: pandas DataFrame with a ``unit_id`` column.
      val_percent: fraction of the distinct units to place in the validation set;
          the count is floored, so small fractions may select zero units.

  Returns:
      train_df: rows of the training units, grouped unit by unit.
      val_df: rows of the validation units, grouped in the order of ``val_instances``.
      train_instances: training unit ids as a column vector of shape (n_train, 1).
      val_instances: validation unit ids as a 1-D array.
  """
  all_instances = np.asarray(pd.unique(df['unit_id']))
  num_val = int(np.floor(val_percent*len(all_instances)))

  # Sample from the ids that actually occur in the frame. Sampling from
  # range(max(id)) is only correct when the ids are exactly 1..N with no gaps.
  val_instances = rng.choice(all_instances, size=num_val, replace=False)
  train_ids = all_instances[np.isin(all_instances, val_instances, invert=True)]

  def _rows_for(unit_ids):
    # pd.concat raises on an empty list, so return an empty frame with the same columns instead.
    if len(unit_ids) == 0:
      return df.iloc[0:0]
    return pd.concat([df[df.unit_id == instance] for instance in unit_ids])

  val_df = _rows_for(val_instances)
  train_df = _rows_for(train_ids)

  # Kept as a column vector because that is the shape this function has always returned.
  train_instances = train_ids.reshape(-1, 1)
  return train_df, val_df, train_instances, val_instances


Clustering

Predict the cluster each sample in the dataset belongs to with a pre-trained k-means clustering model.






Dataframe

In [6]:
# Assign samples to clusters with an already-fitted k-means model

def kMeansClustering(kmeans_model, data, op_condts_labels):
  """Return the cluster label predicted for every sample.

  Args:
      kmeans_model: object exposing ``predict``; presumably a fitted k-means model.
      data: container indexable by ``op_condts_labels`` (e.g. a DataFrame).
      op_condts_labels: key(s) selecting the columns passed to the model.

  Returns:
      Whatever ``kmeans_model.predict`` returns for the selected columns.
  """
  operating_conditions = data[op_condts_labels]
  return kmeans_model.predict(operating_conditions)

Normalize by cluster


1.   Find and store the mean and std of each cluster of the training dataset in a numpy array.
1.   Normalize train, val and test datasets by clusters mean and std (Standardization)

In [8]:
# Per-cluster mean / standard deviation
no_clusters=6
def parameters_form(no_clusters, data, cluster_labels):
  """Compute the column-wise mean and std of every cluster.

  Args:
      no_clusters: number of clusters; labels 0 .. no_clusters-1 are processed.
      data: samples, indexable with a boolean mask over rows.
      cluster_labels: cluster label of each row of ``data``.

  Returns:
      (parameters_mean_list, parameters_std_list): two lists of length
      ``no_clusters`` holding the per-column mean and std of each cluster.
      The two list lengths are also printed.
  """
  members_per_cluster = [data[cluster_labels == label] for label in range(no_clusters)]
  parameters_mean_list = [np.mean(members, axis = 0) for members in members_per_cluster]
  parameters_std_list = [np.std(members, axis = 0) for members in members_per_cluster]
  print(len(parameters_mean_list), len(parameters_std_list))
  return parameters_mean_list, parameters_std_list

# Normalize function
def normalize(X,mean,std):
  """Standardize ``X`` as ``(X - mean) / std``.

  No guard is applied for zero entries in ``std``; those positions follow
  the usual floating-point division rules (inf / NaN).
  """
  return (X - mean) / std

# Normalized data
def normalize_regime(temp_data, cluster_labels, parameters_mean_list, parameters_std_list):
  """Standardize every row with the statistics of the cluster it belongs to.

  Args:
      temp_data: 2-D array of samples, one row per sample.
      cluster_labels: cluster label of each row; labels index the two lists below.
      parameters_mean_list: per-cluster column means.
      parameters_std_list: per-cluster column standard deviations.

  Returns:
      Array with the shape of ``temp_data`` holding the standardized values.
      Floating-point inputs keep their dtype; any other dtype yields float64.
  """
  temp_data = np.asarray(temp_data)
  cluster_labels = np.asarray(cluster_labels).ravel()

  # Standardized values are fractional, so an integer buffer would truncate them.
  out_dtype = temp_data.dtype if np.issubdtype(temp_data.dtype, np.floating) else np.float64
  normalized_unwrap_train_data = np.zeros(temp_data.shape, dtype=out_dtype)

  for clu in np.unique(cluster_labels):
    # A boolean row mask keeps the selection 2-D even for one row or one column,
    # unlike the previous argwhere(...).squeeze() round trip.
    in_cluster = cluster_labels == clu
    normalized_unwrap_train_data[in_cluster] = normalize(
        temp_data[in_cluster], parameters_mean_list[clu], parameters_std_list[clu])
  return normalized_unwrap_train_data

Feature extraction

In [16]:
# Accepts a numpy array and extracts desired columns.

def featureExtraction(data, desired_sensors, sensor_offset=4):
  """Keep column 0 plus the columns of the requested sensors.

  Args:
      data: 2-D numpy array of shape (n_rows, n_cols).
      desired_sensors: sensor number(s) to keep (scalar, list or array of ints).
          Sensor ``s`` is read from column ``s + sensor_offset``.
      sensor_offset: number added to a sensor number to get its column index.
          Defaults to 4, the value previously hard-coded here.

  Returns:
      2-D array of shape (n_rows, 1 + n_sensors): column 0 followed by the
      selected sensor columns, in the order given.
  """
  sensor_cols = np.ravel(np.asarray(desired_sensors, dtype=int)) + sensor_offset
  desired_cols = np.concatenate(([0], sensor_cols))
  # Index with the 1-D array itself: wrapping it in a list adds an axis and
  # returns an (n_rows, 1, n_cols) array.
  return data[:, desired_cols]

Expanding Window: Varying Sequence Lengths (Train and Val)

In [10]:
"""Build a list of numpy arrays with varying lengths.
Args:
    data: normalized dataset with relevant features
        - X: numpy array of shape (T, feat_dim)
Returns:
    X: (len(list), varying length (T), feat_dim) list of arrays
"""
def expandWindow(data, min_len=5):
  """Build expanding-window sequences for each group of rows in ``data``.

  Rows are grouped by the value in column 0. For each group, every prefix
  of length ``min_len``, ``min_len + 1``, ... up to the full group length is
  emitted, with column 0 dropped.

  Args:
      data: 2-D numpy array whose first column is the grouping key.
      min_len: length of the shortest prefix emitted per group.

  Returns:
      List of arrays of shape (prefix_length, n_cols - 1), groups in sorted
      key order and prefixes in increasing length.
  """
  windows = []
  group_keys = data[:,0]
  for key in np.unique(group_keys):
    group_rows = data[group_keys == key]
    # Prefix end positions run from min_len to the group length, inclusive.
    for stop in range(min_len, group_rows.shape[0] + 1):
      windows.append(group_rows[:stop, 1:])
  return windows

Sliding Window: Fixed Sequence Length T (Train and Val)

In [None]:
def slidingWindow(data, T):
  """Build fixed-length sliding-window sequences for each group of rows.

  Rows are grouped by the value in column 0. Within a group, a window of
  ``T`` consecutive rows is moved one row at a time; column 0 is dropped
  from every window. Groups shorter than ``T`` contribute nothing.

  Args:
      data: 2-D numpy array whose first column is the grouping key.
      T: window length in rows.

  Returns:
      List of arrays of shape (T, n_cols - 1), groups in sorted key order.
  """
  windows = []
  group_keys = data[:,0]
  for key in np.unique(group_keys):
    group_rows = data[group_keys == key]
    n_windows = group_rows.shape[0] - T + 1
    windows.extend(group_rows[begin:begin+T, 1:] for begin in range(n_windows))
  return windows


Expanding and Sliding window processes for test dataset

In [None]:
def testExpandWindow(data):
  x_test_list = []
  for i in np.unique(data[:,0]):
    dat = data[data[:,0]==i]
    dat = dat[:,1:]
    x_test_list.append(dat)
  return x_test_list

def testSlidingWindow(data, T):
  x_test_list = []
  for i in np.unique(data[:,0]):
    dat = data[data[:,0]==i]
    dat = dat[-T:,1:]
    x_test_list.append(dat)
  return x_test_list

Expanding Window - Y prep

In [11]:
# Targets aligned with windows of minimum length min_len (or fixed length T)
def yPrep(data, min_len):
  """Collect, per group, the last-column values from row ``min_len - 1`` onward.

  Rows are grouped by the value in column 0.

  Args:
      data: 2-D numpy array; column 0 is the grouping key and the last
          column holds the values to collect.
      min_len: window length; the first ``min_len - 1`` rows of each group
          are skipped.

  Returns:
      List with one 1-D array per distinct key, in sorted key order.
  """
  group_keys = data[:,0]
  first_row = min_len - 1
  return [data[group_keys == key][first_row:,-1] for key in np.unique(group_keys)]

In [12]:
# Total number of targets across all groups
def checkY(y_list):
  """Return the summed length of every element of ``y_list``."""
  return sum(len(targets) for targets in y_list)

# General

Unsupervised pre-training Dataset Class and functions

In [13]:
# Dataset over a list of variable-length sequences and their targets
class supDataset(Dataset):
  """Pairs each sequence in ``data_list`` with the target at the same index.

  Args:
      data_list: indexable collection of array-likes, one per sample.
      targets: indexable collection of targets, aligned with ``data_list``.
  """

  def __init__(self, data_list, targets):
    self.data_list = data_list
    self.targets = targets

  def __len__(self):
    """Number of sequences in the dataset."""
    return len(self.data_list)

  def __getitem__(self, idx):
    """Return ``(X, y)``: the sequence as a float tensor and the target as stored."""
    sequence, target = self.data_list[idx], self.targets[idx]
    return torch.tensor(sequence, dtype=torch.float), target


In [14]:
def padding_mask(lengths, max_len=None):
    """
    Used to mask padded positions: creates a (batch_size, max_len) boolean mask from a tensor of sequence lengths,
    where True means the values at time step (t) are real data and are used to compute attention weights.

    Args:
        lengths: integer tensor holding one sequence length per batch element.
        max_len: number of time steps (columns) in the mask. When None, the largest
            value in ``lengths`` is used.
    Returns:
        Boolean tensor of shape (batch_size, max_len) on the device of ``lengths``;
        entry [b, t] is True iff t < lengths[b].
    """
    batch_size = lengths.numel()
    if max_len is None:
        # torch tensors provide .max(); the former .max_val() call raised AttributeError.
        max_len = int(lengths.max())
    return (torch.arange(0, max_len, device=lengths.device)
            .type_as(lengths)
            .repeat(batch_size, 1)
            .lt(lengths.unsqueeze(1)))

In [None]:
def collate_superv(data, max_len=None):
    """Build mini-batch tensors from a list of (X, y) tuples.
    Args:
        data: len(batch_size) list of tuples (X, y).
            - X: torch tensor of shape (seq_length, feat_dim); variable seq_length.
            - y: one target value per sample; the batch of targets must be convertible
              by ``torch.tensor(..., dtype=torch.float)``.
        max_len: global fixed sequence length. Used for architectures requiring fixed length input,
            where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are padded with 0s.
            When None, the longest sequence in the batch sets the padded length.
    Returns:
        X: (batch_size, padded_length, feat_dim) float tensor of zero-padded features, with requires_grad enabled
        y: (batch_size, 1) float tensor of targets
        padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position, 0 ignore (padding)
    """

    batch_size = len(data)
    features, targets = zip(*data)

    # Stack and pad features and masks (convert 2D to 3D tensors, i.e. add batch dimension)
    lengths = [X.shape[0] for X in features]  # original sequence length for each time series
    if max_len is None:
        max_len = max(lengths)

    X = torch.zeros(batch_size, max_len, features[0].shape[-1])  # (batch_size, padded_length, feat_dim)

    for i, (sequence, length) in enumerate(zip(features, lengths)):
        end = min(length, max_len)  # clip sequences longer than max_len
        X[i, :end, :] = sequence[:end, :]

    # int64 lengths: int16 overflows for sequences of 32768 steps or more and would corrupt the mask.
    padding_masks = padding_mask(torch.tensor(lengths, dtype=torch.int64), max_len=max_len)  # (batch_size, padded_length) boolean tensor, "1" means keep
    X = X.clone().detach().requires_grad_(True).type('torch.FloatTensor')
    targets = torch.tensor(targets, dtype=torch.float).reshape(-1,1)
    return X, targets, padding_masks