First, load the imports needed for the project.

In [None]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

import utils_bsc

Now, we should create a dataset with all the data stored in the .csv file

Description of the data:

time: Timestamp in format YYYY-MM-DD HH:MM:SS,
PLN1:
PLN2:
PLN3:
ULL1:
ULL2:
ULL3:
COS_PHI1:
COS_PHI2:
COS_PHI3:
FREQ:
RC_DC:
RC_AC:
RC_50Hz:
RC_150Hz:
RC_<100Hz:
RC_100Hz-1kHz:
RC_>1kHz:
RC_>10kHz:

In [None]:
# Read the raw measurements from the CSV file into a DataFrame,
# then preview its first rows as the cell output.
csv_path = 'data_factory.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Once we have the dataset, we should prepare it: find the missing or NaN values and replace them with suitable values (in this case, we use the previous value).

In [None]:
# Mark blank cells (empty or whitespace-only strings) as missing values.
# A regex is used so that cells with zero or several spaces are caught too,
# not only cells that are exactly one space.
dataset = dataset.replace(r'^\s*$', np.nan, regex=True)
# Collect every row that has at least one NaN value
nan_values = dataset[dataset.isna().any(axis=1)]
# The row count tells how many samples need repairing
print(f'Number of rows with NaN values before cleaning: {nan_values.shape[0]}') 

# Fill every NaN with the value of the previous row. `DataFrame.ffill()`
# replaces the deprecated `fillna(method='ffill')`. The trailing `bfill()`
# only matters for NaNs in the very first rows, which have no previous value
# to copy and would otherwise survive the forward fill.
dataset_clean = dataset.ffill().bfill()

# Verify that no NaN values are left after cleaning
nan_values = dataset_clean[dataset_clean.isna().any(axis=1)]
print(f'Number of rows with NaN values after cleaning: {nan_values.shape[0]}') 

# Total number of samples and columns
print(f'Total number of samples: {dataset_clean.shape[0]}')
print(f'Number of features: {dataset_clean.shape[1]}')


Now we look at the distribution of the different features of the data over different time intervals

In [None]:
# PLN_1: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 1, 'PLN_1')

In [None]:
# PLN_2: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 2, 'PLN_2')

In [None]:
# PLN_3: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 3, 'PLN_3')

In [None]:
# ULL1: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 4, 'ULL1')

In [None]:
# ULL2: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 5, 'ULL2')

In [None]:
# ULL3: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 6, 'ULL3')

In [None]:
# COS_PHI1: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 7, 'COS_PHI1')

In [None]:
# COS_PHI2: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 8, 'COS_PHI2')

In [None]:
# COS_PHI3: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 9, 'COS_PHI3')

In [None]:
# FREQ: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 10, 'FREQ')

In [None]:
# RC_DC: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 11, 'RC_DC')

In [None]:
# RC_AC: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 12, 'RC_AC')

In [None]:
# RC_50Hz: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 13, 'RC_50Hz')

In [None]:
# RC_150Hz: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 14, 'RC_150Hz')

In [None]:
# RC_less_100Hz in a weekly interval
#
# NOTE(review): this cell previously used the label 'RC_100Hz_1kHz', which is
# also the label of the next column (16). The feature description lists
# `RC_<100Hz` right before `RC_100Hz-1kHz`, so column 15 is presumably the
# "<100Hz" channel - confirm against the CSV header.

utils_bsc.week_plot(dataset_clean, 15, 'RC_less_100Hz')

# RC_less_100Hz in a daily interval (only the values of weekdays between 4:00 and 19:30)

utils_bsc.daily_plot(dataset_clean, 15, 'RC_less_100Hz')

In [None]:
# RC_100Hz_1kHz: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 16, 'RC_100Hz_1kHz')

In [None]:
# RC_more_1kHz: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 17, 'RC_more_1kHz')

In [None]:
# RC_more_10kHz: weekly interval first, then the daily interval
# (only the values of weekdays between 4:00 and 19:30)
for draw in (utils_bsc.week_plot, utils_bsc.daily_plot):
    draw(dataset_clean, 18, 'RC_more_10kHz')

Once the dataset is prepared, make batches of data, put them together in an array and split them into train and test sets.
After looking through the dataset and the features, I decided to take only the values with a timestamp of a weekday between 4:00 and 19:30. In many of the features, in the interval outside those timestamps there is only noise, which can be a sign that the machine is off in that time interval.

In [None]:
# Build overlapping sequences (sliding windows with a step of one row) from
# the cleaned dataset, then split them into train and test sets.

SEQUENCE_LENGTH = 60  # number of consecutive rows in every sequence

# Drop the first column (presumably the timestamp - confirm against the CSV
# header) and convert to NumPy once, so that each window is a cheap array
# slice instead of a new DataFrame.
feature_values = dataset_clean.iloc[:, 1:].to_numpy()

if len(feature_values) <= SEQUENCE_LENGTH:
    raise ValueError(
        f'Need more than {SEQUENCE_LENGTH} rows to build sequences, '
        f'got {len(feature_values)}'
    )

# The bound is taken from the cleaned data that is actually sliced. It keeps
# the original window count (len - SEQUENCE_LENGTH), so the very last
# possible window is not generated.
data = np.array([
    feature_values[start:start + SEQUENCE_LENGTH]
    for start in range(len(feature_values) - SEQUENCE_LENGTH)
])
print(f'{data.shape[0]} sequences of longitud {data.shape[1]} with {data.shape[2]} features')

# Splitting into train and test sets
# NOTE(review): consecutive windows overlap by SEQUENCE_LENGTH - 1 rows, so a
# shuffled split puts almost identical sequences in both sets. Consider a
# chronological split (shuffle=False) if the test set must be independent.

training_data, testing_data = train_test_split(data, test_size=0.2, random_state=25)
print(f'length of training set: {training_data.shape[0]}')
print(f'length of test set: {testing_data.shape[0]}')

Now, we define a class with the transformer model that we are going to use:

Using the already written pytorch library for Transformers:

torch.nn.TransformerEncoderLayer

d_model –> the number of expected features in the input (required).

nhead –> the number of heads in the multiheadattention models (required).

dim_feedforward –> the dimension of the feedforward network model (default=2048).

dropout –> the dropout value (default=0.1).

activation –> the activation function of the intermediate layer, can be a string (“relu” or “gelu”) or a unary callable. Default: relu

layer_norm_eps –> the eps value in layer normalization components (default=1e-5).

batch_first –> If True, then the input and output tensors are provided as (batch, seq, feature). Default: False.

norm_first –> if True, layer norm is done prior to attention and feedforward operations, respectively. Otherwise it’s done after. Default: False (after).

In [None]:
class Transformer(nn.Module):
    """Encoder-only transformer followed by a linear output layer.

    The input is expected as ``(batch, sequence, feature_size)`` because the
    encoder layers are built with ``batch_first=True``.

    Args:
        feature_size: Number of features per time step (``d_model``).
        output_size: Number of values produced per time step.
        num_encoder_layers: Number of stacked encoder layers.
        dropout: Dropout probability used inside the encoder layers.
        nhead: Number of attention heads. Defaults to ``feature_size``
            (one head per feature). Must divide ``feature_size``.

    Raises:
        ValueError: If ``feature_size`` is not divisible by ``nhead``.
    """

    def __init__(self, feature_size, output_size, num_encoder_layers, dropout, nhead=None):
        super().__init__()

        # One head per feature stays the default so existing callers that
        # pass only four arguments get the same architecture as before.
        if nhead is None:
            nhead = feature_size
        if nhead <= 0 or feature_size % nhead != 0:
            raise ValueError(
                f'feature_size ({feature_size}) must be divisible by nhead ({nhead})'
            )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=feature_size,
            nhead=nhead,
            dropout=dropout,
            # Sequences are stored as (batch, seq, feature); without this flag
            # the encoder would attend across the batch dimension instead.
            batch_first=True,
        )

        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        self.decoder = nn.Linear(feature_size, output_size)

    def forward(self, src, device=None):
        """Encode ``src`` and project every time step to ``output_size``.

        Args:
            src: Float tensor of shape ``(batch, sequence, feature_size)``.
            device: Unused; accepted only for backward compatibility with
                callers that pass it positionally.

        Returns:
            Tensor of shape ``(batch, sequence, output_size)``.
        """
        # No attention mask: every position may attend to every other one.
        encoded = self.encoder(src, None)
        # nn.Linear takes the input tensor only; the output size was fixed
        # when the layer was constructed.
        return self.decoder(encoded)

In [None]:
# Build the model: input and output size both equal the number of features.
transformer = Transformer(training_data.shape[2], training_data.shape[2], 4, 0)

# Use the GPU when one is available and keep model and data on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transformer = transformer.to(device)

# Smoke-test the forward pass on a small batch only: pushing the whole
# training set through the model at once would need far too much memory.
# The model needs a float tensor, not a NumPy array, so convert explicitly.
SMOKE_BATCH_SIZE = 32
sample_batch = torch.as_tensor(
    np.asarray(training_data[:SMOKE_BATCH_SIZE], dtype=np.float32),
    device=device,
)

# Call the module itself (not .forward) so that hooks are honoured, and skip
# gradient tracking because nothing is trained here.
with torch.no_grad():
    sample_output = transformer(sample_batch, device)

sample_output.shape