In [1]:
!pip install denoising_diffusion_pytorch



In [5]:
import pandas as pd
import torch
import torchvision
from torchvision.transforms import ToTensor
from pandas import Series
import numpy as np
from sklearn import preprocessing
from denoising_diffusion_pytorch import Unet1D, GaussianDiffusion1D, Trainer1D, Dataset1D


In [3]:
#The dataset, collected on the Marconi 100 system, has been published and
#described in the Scientific dataset publication: https://www.nature.com/articles/s41597-023-02174-3
#It is available on Zenodo: https://zenodo.org/records/7541722


#This is rack 1 of the dataset
!wget https://zenodo.org/record/7541722/files/1.tar?download=1 -o rack.tar

^C


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
!tar -xf /content/1.tar?download=1

tar: Unexpected EOF in archive
tar: rmtlseek not stopped at a record boundary
tar: Error is not recoverable: exiting now


In [8]:
#This is node 36, of rack 1
file_name = '36.parquet'

In [9]:
DATA = pd.read_parquet(file_name)
DATA.head(5)

Unnamed: 0,timestamp,ambient_avg,ambient_std,ambient_min,ambient_max,dimm0_temp_avg,dimm0_temp_std,dimm0_temp_min,dimm0_temp_max,dimm10_temp_avg,...,ps1_output_curre_max,ps1_output_volta_avg,ps1_output_volta_std,ps1_output_volta_min,ps1_output_volta_max,total_power_avg,total_power_std,total_power_min,total_power_max,value
0,2020-03-09 12:00:00+00:00,21.636362,0.077138,21.6,21.799999,29.0,0.0,29.0,29.0,30.0,...,22.0,12.399999,0.0,12.4,12.4,400.0,0.0,400.0,400.0,
1,2020-03-09 12:15:00+00:00,21.86,0.128063,21.6,22.0,29.0,0.0,29.0,29.0,30.0,...,21.0,12.399998,0.0,12.4,12.4,400.0,0.0,400.0,400.0,
2,2020-03-09 12:30:00+00:00,21.885714,0.098975,21.799999,22.0,29.0,0.0,29.0,29.0,30.0,...,21.0,12.399999,0.0,12.4,12.4,400.0,0.0,400.0,400.0,
3,2020-03-09 12:45:00+00:00,21.918917,0.098194,21.799999,22.0,29.0,0.0,29.0,29.0,30.0,...,20.0,12.399999,0.0,12.4,12.4,400.0,0.0,400.0,400.0,
4,2020-03-09 13:00:00+00:00,21.875553,0.096967,21.799999,22.0,29.0,0.0,29.0,29.0,30.0,...,21.0,12.400001,0.0,12.4,12.4,400.0,0.0,400.0,400.0,


In [10]:
#The dataset has periods of unavailability of Nagios traces, as described in the dataset paper:  https://www.nature.com/articles/s41597-023-02174-3/figures/3
#We suggest dropping the periods where the labels are largely unavailable and using the dataset either after 1.4.2021 or 1.10.2021.
DATA = DATA[DATA['timestamp'] > '2021-04-01']

### parisa change
DATA['value'] = DATA['value'].replace(2,1)
DATA['value'] = DATA['value'].replace(3,1)
## parisa change

In [11]:
DATA.reset_index(drop=True, inplace = True)
DATA = DATA.fillna(0)
DATA['timestamp'] = pd.to_datetime(DATA['timestamp'])
l = DATA.timestamp.diff() == pd.Timedelta(minutes=15) #time consistency -> measurements should be 15 minutes apart
chunks = []
current_chunk = []
for index, value in enumerate(l):
    current_chunk.append(DATA.iloc[index])
    if not value:
        chunks.append(pd.DataFrame(current_chunk))
        current_chunk = []


if current_chunk:
    chunks.append(pd.DataFrame(current_chunk))

In [12]:
DATA['value'] = DATA['value'].replace(2,1)
DATA['value'] = DATA['value'].replace(3,1)

### parisa change
relevant = [c for c in chunks if len(c) >= 20] #assuming time window of 20
revelent_len = [len(c) for c in chunks if len(c) >= 20]
DATA2 = pd.concat(relevant)
DATA = DATA2
DATA['value'] = DATA['value'].replace(2,1)
DATA['value'] = DATA['value'].replace(3,1)

### parisa change end
##3 parisa change
# relevant = [c for c in chunks if len(c) >= 20] #assuming time window of 20
# DATA2 = pd.concat(relevant)
# DATA = DATA2
# DATA['value'] = DATA['value'].replace(2,1)
# DATA['value'] = DATA['value'].replace(3,1)
# DATA = DATA.drop(columns=['timestamp'])
# DATA = DATA.astype(float)
##3 parisa change end

In [10]:
DATA[DATA["value"]>0]

Unnamed: 0,timestamp,ambient_avg,ambient_std,ambient_min,ambient_max,dimm0_temp_avg,dimm0_temp_std,dimm0_temp_min,dimm0_temp_max,dimm10_temp_avg,...,ps1_output_curre_max,ps1_output_volta_avg,ps1_output_volta_std,ps1_output_volta_min,ps1_output_volta_max,total_power_avg,total_power_std,total_power_min,total_power_max,value
102,2021-04-03 02:00:00+00:00,24.822222,0.075686,24.600000,25.000000,33.000000,0.000000,33.0,33.0,33.244444,...,64.0,12.200001,0.000000,12.2,12.2,688.000000,290.513721,520.0,1560.0,1
103,2021-04-03 02:15:00+00:00,24.955556,0.083148,24.799999,25.000000,33.000000,0.000000,33.0,33.0,33.777778,...,73.0,12.200001,0.000000,12.2,12.2,687.111111,276.972700,520.0,1560.0,1
104,2021-04-03 02:30:00+00:00,24.951112,0.085952,24.799999,25.000000,33.000000,0.000000,33.0,33.0,33.800000,...,77.0,12.200001,0.000000,12.2,12.2,683.111111,286.122136,500.0,1520.0,1
107,2021-04-03 03:15:00+00:00,24.800003,0.000000,24.799999,24.799999,33.000000,0.000000,33.0,33.0,33.466667,...,65.0,12.200001,0.000000,12.2,12.2,599.111111,180.071865,520.0,1440.0,1
108,2021-04-03 03:30:00+00:00,24.737779,0.092589,24.600000,24.799999,33.000000,0.000000,33.0,33.0,33.088889,...,59.0,12.200001,0.000000,12.2,12.2,691.111111,270.096462,520.0,1420.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44702,2022-07-11 16:00:00+00:00,22.835555,0.076465,22.799999,23.000000,32.355556,1.267641,31.0,34.0,33.755556,...,73.0,12.200001,0.000000,12.2,12.2,704.000000,186.313952,540.0,1520.0,1
45628,2022-07-21 07:30:00+00:00,22.808889,0.041216,22.799999,23.000000,31.000000,0.000000,31.0,31.0,31.000000,...,61.0,12.206668,0.024944,12.2,12.3,687.111111,217.466444,520.0,1640.0,1
46793,2022-08-02 10:45:00+00:00,22.226670,0.067987,22.200001,22.400000,29.000000,0.000000,29.0,29.0,30.000000,...,31.0,12.364446,0.076465,12.2,12.4,454.666667,99.098156,400.0,700.0,1
46794,2022-08-02 11:00:00+00:00,22.217780,0.056916,22.200001,22.400000,29.000000,0.000000,29.0,29.0,30.000000,...,21.0,12.400001,0.000000,12.4,12.4,405.777778,9.064924,400.0,420.0,1


In [13]:
DATA3= DATA
DATA=DATA.drop(columns=['timestamp'])
original_dtypes = DATA.dtypes
DATA = DATA.astype(float)

In [14]:
#### add label of next row
DATA['next_value'] = DATA['value'].shift(-1).fillna(0)

In [15]:
### use for delete value column
# DATA=DATA.drop(columns=['value'])
original_dtypes = DATA.dtypes

In [16]:
scaler = preprocessing.MinMaxScaler()  ### use further
names = DATA.columns  ##
d = scaler.fit_transform(DATA)
DATA = pd.DataFrame(d, columns=names)
# ### parisa change                                --- here to change more or add time stamp in train

#timestamp_scaler = preprocessing.MinMaxScaler()
#DATA['timestamp'] = timestamp_scaler.fit_transform(DATA3['timestamp'].astype('int64').values.reshape(-1, 1))
#### parisa change end

In [17]:
# train_data = DATA[:int(DATA.shape[0]*0.8)]
# test_data = DATA[int(DATA.shape[0]*0.8):]

################# parisa change
train_data = DATA[:]

############### end parisa change

In [16]:
train_data['value'].value_counts()

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
0.0,50998
1.0,1311


In [18]:
train_data_tensor = torch.tensor(train_data.to_numpy(), dtype=torch.float).to(device)

In [19]:
from torch.utils.data import DataLoader

In [20]:
train_window = 20 ## 20

def create_data_seq(data,tw):
    seq = []
    for i in range(len(data)-tw):
        x_seq = data[i:i+tw],
        y_seq = data[i+tw:i+tw+1]  ###در ترین میاد 0 تا 20 رو میده و میگه 21 رو حدس بزن
        seq.append((x_seq,y_seq))
    return seq


In [21]:
train_seq = create_data_seq(train_data_tensor,train_window)

In [None]:
X[0].size()


In [22]:
train_loader = DataLoader(train_seq, batch_size=1, shuffle=True)

X,y  = next(iter(train_loader))
# print(X)


In [23]:
from denoising_diffusion_pytorch import Unet, GaussianDiffusion, Trainer

In [62]:
import torch

# Assuming `train_seq` is your list of tuples with the desired data
# Extract the first element from each tuple
first_elements = [t[0][0] for t in train_seq]

# Convert the list to a tensor
tensor = torch.stack(first_elements)
total_elements = tensor.numel()  # Get total number of elements in the tensor
print(total_elements)  # Debug: see how many elements are in the tensor

# Compute the appropriate shape that can hold these elements
# In case the number of elements is divisible by 354:
new_shape = (total_elements // 354, 354)
reshaped_tensor = tensor.view(new_shape).to(device)
# Reshape the tensor to the desired shape (52289, 354)
# reshaped_tensor = tensor.view(52289, 354).to(device)

# Alternatively, if your tensor has an extra dimension you can remove:
reshaped_tensor = reshaped_tensor.squeeze(-1)  # This removes the last dimension if it's size 1


370206120


In [63]:
reshaped_tensor

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.5513, 0.0099, 0.5802,  ..., 0.4792, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.5282, 0.0606, 0.5472,  ..., 0.7396, 0.0000, 0.0000],
        [0.5176, 0.0531, 0.5425,  ..., 0.7292, 0.0000, 0.0000],
        [0.5134, 0.0882, 0.5330,  ..., 0.3125, 0.0000, 0.0000]],
       device='cuda:0')

In [31]:

model = Unet1D(
    dim = 5,
    dim_mults = (1, 2), ### moshkel run azine az cpu estefade mikone
    channels = 354
).to(device)

diffusion = GaussianDiffusion1D(
    model,
    seq_length =1 ,
    timesteps = 1,
    objective = 'pred_v'
).to(device)


# training_seq = torch.rand(52289, 354, 1) # features are normalized from 0 to 1

# loss = diffusion(training_seq)
# loss.backward()

# Or using trainer
# dataset = Dataset1D(reshaped_tensor).to(device)  # this is just an example, but you can formulate your own Dataset and pass it into the `Trainer1D` below

trainer = Trainer1D(
    diffusion,
    dataset = reshaped_tensor,
    train_batch_size = 1,
    train_lr = 8e-5,
    train_num_steps = 10,         # total training steps
    gradient_accumulate_every = 2,    # gradient accumulation steps
    ema_decay = 0.995,                # exponential moving average decay
    amp = True,                       # turn on mixed precision
)

trainer.train()

# after a lot of training

sampled_seq = diffusion.sample(batch_size = 1)
sampled_seq.shape # (4, 354, 1)

# for d in train_loader:
#         #print(type(d))
#       x = d[0][0].to(device)
#       # print(x.size())
#         # print(x)
#        #y = d[1].to(device)
#       trainer = Trainer1D(
#                 diffusion,
#                 dataset = x,
#                 train_batch_size = 2,
#                 train_lr = 8e-5,
#                 train_num_steps = 10,         # total training steps
#                 gradient_accumulate_every = 2,    # gradient accumulation steps
#                 ema_decay = 0.995,                # exponential moving average decay
#                 amp = True, )                      # turn on mixed precision

#       trainer.train()
#       sampled_seq = diffusion.sample(batch_size = 1)
#       sampled_seq.shape # (1, 32, 128)


# trainer.train()

# # after a lot of training

# sampled_seq = diffusion.sample(batch_size = 4)
# sampled_seq.shape # (4, 32, 128)


  0%|          | 0/10 [00:00<?, ?it/s]

ZeroDivisionError: float division by zero

In [70]:
model = Unet1D(
    dim = 354,
    dim_mults = (1, 2), ### moshkel run azine az cpu estefade mikone
    channels = 1
).to(device)




In [71]:
diffusion = GaussianDiffusion1D(
    model,
    seq_length =1 ,
    timesteps = 2,
    objective = 'pred_v'
).to(device)


# training_seq = torch.rand(52289, 354, 1) # features are normalized from 0 to 1

# loss = diffusion(training_seq)
# loss.backward()

# Or using trainer
# dataset = Dataset1D(reshaped_tensor).to(device)  # this is just an example, but you can formulate your own Dataset and pass it into the `Trainer1D` below



In [72]:
trainer = Trainer1D(
    diffusion,
    dataset = reshaped_tensor,
    train_batch_size = 4,
    train_lr = 0.0001, #8e-5,
    train_num_steps = 10,         # total training steps
    gradient_accumulate_every = 2,    # gradient accumulation steps
    ema_decay = 0.995,                # exponential moving average decay
    amp = True,                       # turn on mixed precision
)

# trainer.train()

In [73]:


# after a lot of training
for batch_data in reshaped_tensor:
    batch_data = batch_data.to(device)
    # print(batch_data)
    batch_data = batch_data.unsqueeze(-1).unsqueeze(-1) # Move batch data to GPU
    loss = diffusion(batch_data)
    loss.backward()
    trainer.step()  # Perform a training step

sampled_seq = diffusion.sample(batch_size = 1)
sampled_seq.shape # (4, 354, 1)

RuntimeError: Calculated padded input size per channel: (3). Kernel size: (4). Kernel size can't be greater than actual input size

In [29]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

True
1
Tesla T4


In [None]:
training_seq.size()


torch.Size([64, 32, 128])

In [None]:
print(train_seq[0][0][0].size())
print(len(train_seq))
print(len(train_seq[0]))


torch.Size([20, 353])
52289
2


In [None]:
train_data_tensor.size()

torch.Size([52309, 354])

In [None]:
from torch.utils.data import DataLoader, Dataset

In [None]:
total_elements = 18465077
num_features = 353

# Determine seq_length based on total elements
valid_shapes = []

# Loop through potential seq_length values
for seq_length in range(1, 100):  # Adjust the range as needed
    num_samples = total_elements // (num_features * seq_length)
    if num_samples * num_features * seq_length == total_elements:
        valid_shapes.append((num_samples, seq_length))


In [None]:
print("Valid shapes (num_samples, seq_length):", valid_shapes)


Valid shapes (num_samples, seq_length): [(52309, 1), (3077, 17)]


In [None]:
### save it on google drive
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/MasterProject/'+'create_data_my_model_add_label_2_lr_0.1'+'.parquet'
df_combined.to_parquet(file_path, index=False)

Mounted at /content/drive
