In [292]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [214]:
df = pd.read_csv("../../data/csv_files/df_ph1.csv")

In [216]:
df.head()

Unnamed: 0,dt,FEX_004,FSB_095DR,FSB_078,FSB_126,FSB_130D,FPZ_06A,FSB_079,FPZ_04A,FSB_097D
0,2023-07-28 05:00:00.000,3.979692,4.231642,4.162819,4.181472,4.128811,5.273175,7.715749,6.417295,3.767151
1,2023-07-28 06:00:00.000,3.980776,4.23355,4.163253,4.181599,4.129336,5.27438,7.713223,6.418118,3.767072
2,2023-07-28 07:00:00.000,3.980825,4.236443,4.16304,4.179049,4.129939,5.27106,7.712299,6.417948,3.767076
3,2023-07-28 08:00:00.000,3.981121,4.236699,4.166883,,4.130379,5.267786,7.712222,6.417735,3.767253
4,2023-07-28 09:00:00.000,3.981221,4.239193,4.168008,,4.130503,5.265711,7.712381,6.417951,3.767425


In [218]:
df.shape

(7748, 10)

In [220]:
df['dt'] = pd.to_datetime(df['dt'])

In [222]:
cols = list(df.columns)
cols.remove('dt')
cols

['FEX_004',
 'FSB_095DR',
 'FSB_078',
 'FSB_126',
 'FSB_130D',
 'FPZ_06A',
 'FSB_079',
 'FPZ_04A',
 'FSB_097D']

In [224]:
start_array = []
end_array = []
index = 2
start_na = True
print(cols[index])

for _, row in df.iterrows():
    if pd.isna(row[cols[index]]) and start_na:
        start_na = False
        start_array.append(row['dt'])
    elif not pd.isna(row[cols[index]]) and not start_na:
        start_na = True
        end_array.append(row['dt'])

FSB_078


In [226]:
print(len(start_array), len(end_array))

31 31


In [228]:
start_end = {}
for start, end in zip(start_array, end_array):
    start_end[start] = end
print(len(start_end))

31


In [230]:
length = {}
for start, end in start_end.items():
    length[(start, end)] = pd.to_datetime(end) - pd.to_datetime(start)
print(len(length))
length

31


{(Timestamp('2023-08-28 15:02:00'),
  Timestamp('2023-08-28 16:00:00')): Timedelta('0 days 00:58:00'),
 (Timestamp('2023-09-08 00:02:00'),
  Timestamp('2023-09-08 01:00:00')): Timedelta('0 days 00:58:00'),
 (Timestamp('2023-10-11 15:19:00'),
  Timestamp('2023-10-11 16:00:00')): Timedelta('0 days 00:41:00'),
 (Timestamp('2023-10-12 07:39:00'),
  Timestamp('2023-10-12 08:00:00')): Timedelta('0 days 00:21:00'),
 (Timestamp('2023-10-18 12:53:00'),
  Timestamp('2023-10-18 13:00:00')): Timedelta('0 days 00:07:00'),
 (Timestamp('2023-10-18 13:34:00'),
  Timestamp('2023-10-18 14:00:00')): Timedelta('0 days 00:26:00'),
 (Timestamp('2023-10-18 14:21:00'),
  Timestamp('2023-10-18 15:00:00')): Timedelta('0 days 00:39:00'),
 (Timestamp('2023-10-18 15:13:00'),
  Timestamp('2023-10-18 16:00:00')): Timedelta('0 days 00:47:00'),
 (Timestamp('2023-10-18 16:08:00'),
  Timestamp('2023-10-18 17:00:00')): Timedelta('0 days 00:52:00'),
 (Timestamp('2023-10-19 12:52:00'),
  Timestamp('2023-10-19 13:00:00')): 

In [232]:
na_data = {}
for key, duration in length.items():
    if duration > pd.Timedelta(hours=1):
        na_data[key] = duration
print(len(na_data))

1


In [234]:
na_data

{(Timestamp('2024-01-01 01:00:00'),
  Timestamp('2024-01-24 15:22:00')): Timedelta('23 days 14:22:00')}

In [236]:
new_df = df[df['dt'] >= "2024-01-24 14:06:00.000"]
print(new_df.shape)

(3395, 10)


In [238]:
new_df.head()

Unnamed: 0,dt,FEX_004,FSB_095DR,FSB_078,FSB_126,FSB_130D,FPZ_06A,FSB_079,FPZ_04A,FSB_097D
4353,2024-01-24 14:06:00,,,,4.390264,,,,,
4354,2024-01-24 14:42:00,,,,,,,7.768566,,
4355,2024-01-24 15:00:00,4.199255,4.335871,,4.390036,4.335593,,7.767843,,3.997356
4356,2024-01-24 15:22:00,,,4.33545,,,,,,
4357,2024-01-24 16:00:00,4.199346,4.335111,4.331754,4.390786,4.334991,,7.768126,7.497128,3.99647


In [240]:
new_df.isna().sum()

dt             0
FEX_004       24
FSB_095DR     24
FSB_078       24
FSB_126       34
FSB_130D      24
FPZ_06A       27
FSB_079       23
FPZ_04A       25
FSB_097D     189
dtype: int64

In [242]:
df_interpolate = new_df.copy()
cols

['FEX_004',
 'FSB_095DR',
 'FSB_078',
 'FSB_126',
 'FSB_130D',
 'FPZ_06A',
 'FSB_079',
 'FPZ_04A',
 'FSB_097D']

In [244]:
for col in cols:
    df_interpolate[col] = df_interpolate[col].interpolate()

In [246]:
df_interpolate.isna().sum()

dt           0
FEX_004      2
FSB_095DR    2
FSB_078      3
FSB_126      0
FSB_130D     2
FPZ_06A      6
FSB_079      1
FPZ_04A      4
FSB_097D     2
dtype: int64

In [248]:
test_df = df_interpolate.drop(columns = ["dt"])
test_df.shape

(3395, 9)

In [250]:
test_df = test_df.dropna()

In [252]:
print(test_df.shape)
test_df.isna().sum()

(3389, 9)


FEX_004      0
FSB_095DR    0
FSB_078      0
FSB_126      0
FSB_130D     0
FPZ_06A      0
FSB_079      0
FPZ_04A      0
FSB_097D     0
dtype: int64

In [254]:
scaler = StandardScaler()
normalized_df = scaler.fit_transform(test_df)

In [256]:
normalized_df.shape

(3389, 9)

In [277]:
tensor_data = torch.tensor(normalized_df, dtype=torch.float32)

dataset = TensorDataset(tensor_data, tensor_data)
dataloader = DataLoader(dataset, batch_size = 64, shuffle = True)

In [281]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        #encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 7),  
            nn.ReLU(),
            nn.Linear(7, 5),        
            nn.ReLU(),
            nn.Linear(5, 3),        
            nn.ReLU(),
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(3, 5),         
            nn.ReLU(),
            nn.Linear(5, 7),        
            nn.ReLU(),
            nn.Linear(7, input_dim), 
            nn.Sigmoid()               
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [283]:
input_dim = normalized_df.shape[1]
model = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [285]:
num_epochs = 100
losses = []
for epoch in range(num_epochs):
    for data in dataloader:
        inputs, _ = data
        
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, inputs)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    #appending the error to the losses array
    losses.append(loss.item())

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 1.2930
Epoch [2/100], Loss: 1.2097
Epoch [3/100], Loss: 1.2419
Epoch [4/100], Loss: 1.0916
Epoch [5/100], Loss: 0.8198
Epoch [6/100], Loss: 0.8240
Epoch [7/100], Loss: 0.8026
Epoch [8/100], Loss: 0.7442
Epoch [9/100], Loss: 0.7338
Epoch [10/100], Loss: 0.7305
Epoch [11/100], Loss: 0.6967
Epoch [12/100], Loss: 0.7431
Epoch [13/100], Loss: 0.7818
Epoch [14/100], Loss: 0.8588
Epoch [15/100], Loss: 0.7150
Epoch [16/100], Loss: 0.7396
Epoch [17/100], Loss: 0.6122
Epoch [18/100], Loss: 0.6983
Epoch [19/100], Loss: 0.7232
Epoch [20/100], Loss: 0.7333
Epoch [21/100], Loss: 0.6690
Epoch [22/100], Loss: 0.6389
Epoch [23/100], Loss: 0.7296
Epoch [24/100], Loss: 0.7065
Epoch [25/100], Loss: 0.7236
Epoch [26/100], Loss: 0.7249
Epoch [27/100], Loss: 0.6560
Epoch [28/100], Loss: 0.7418
Epoch [29/100], Loss: 0.6226
Epoch [30/100], Loss: 0.7183
Epoch [31/100], Loss: 0.6815
Epoch [32/100], Loss: 0.6954
Epoch [33/100], Loss: 0.7509
Epoch [34/100], Loss: 0.6324
Epoch [35/100], Loss: 0

In [288]:
model.eval()
with torch.no_grad():
    reconstructed_data = model(tensor_data)
    reconstruction_error = torch.mean((tensor_data - reconstructed_data) ** 2, axis=1).numpy()
print(len(reconstruction_error))

3389


In [294]:
threshold = np.mean(reconstruction_error) + 2 * np.std(reconstruction_error)
threshold

1.5951984524726868

In [300]:
anomalies = reconstruction_error > threshold
print(f'Number of anomalies detected: {np.sum(anomalies)}')

Number of anomalies detected: 249
