In [11]:
import torch
from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
import csv

In [12]:
class AutoEncoder(torch.nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(18, 9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, 6),
            torch.nn.ReLU(),
            torch.nn.Linear(6, 3),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(3, 6),
            torch.nn.ReLU(),
            torch.nn.Linear(6, 9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, 18),
            torch.nn.ReLU(),
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [None]:
with open('background_data.txt', 'r') as f:
    data = f.read()
parsed_data = []
lines = data.split('\n')
print(len(lines))
i = 0
while i < len(lines):
    if i%10000 ==0:
        print("working: {}".format(i))
    line = lines[i].strip()
    i += 1  # Increment i here to ensure it always moves to the next line

    if line:
        if line.isdigit():  # Check if the line contains only digits (number of jets)
            num_jets = int(line)
            jets_data = []

            for _ in range(num_jets):
                if i < len(lines):
                    jet_values = [float(val) for val in lines[i].split()]
                    jets_data.extend(jet_values)
                    i += 1  # Increment i for each jet line

            # Padding if fewer than 6 jets
            while len(jets_data) < 6 * 3:
                jets_data.append(0.0)
                print("padding {}".format(line))

            # Truncate if more than 6 jets (optional, based on your requirement)
            jets_data = jets_data[:18]
            parsed_data.append(jets_data)

# Display the parsed data
print(len(parsed_data))


# Path for the new CSV file
output_file_path = 'parsed_data.csv'

# Writing the parsed data to a CSV file
with open(output_file_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(parsed_data)


In [5]:
file_path = 'parsed_data.csv'  # Replace with your CSV file path

line_count = 0
with open(file_path, 'r') as file:
    for line in file:
        line_count += 1

print(f"Number of lines in the CSV file: {line_count}")


Number of lines in the CSV file: 350181


In [3]:
print(torch.cuda.is_available())

True


In [13]:
csv_file_path = 'parsed_data.csv'
parsed_data = []

with open(csv_file_path, 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    
    for row in csv_reader:
        data_point = [float(value) for value in row]
        parsed_data.append(data_point)

print(len(parsed_data))

350181


In [14]:
from sklearn.model_selection import train_test_split

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU is available')
else:
    device = torch.device('cpu')
    print('GPU not available, CPU used')

tensor_data = torch.tensor(parsed_data, dtype=torch.float)
train_data, val_data = train_test_split(tensor_data, test_size=0.2, random_state=42)
train_dataset = TensorDataset(train_data)
val_dataset = TensorDataset(val_data)
batch_size = 1024  # adjust later
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
model = AutoEncoder()
model.to(device)
criterion = torch.nn.MSELoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) 


GPU is available


In [16]:
num_epochs = 20  # Number of training epochs
for epoch in range(num_epochs):
    # Training Phase
    model.train()  # Set the model to training mode
    train_loss = 0.0
    for data in train_dataloader:
        inputs = data[0].to(device)  # Assuming data is a tuple of (input, target)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * inputs.size(0)

    # Calculate average training loss
    train_loss /= len(train_dataloader.dataset)

    # Validation Phase
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():  # Turn off gradients for validation
        for data in val_dataloader:
            inputs = data[0].to(device)  # Assuming data is a tuple of (input, target)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, inputs)

            val_loss += loss.item() * inputs.size(0)

    # Calculate average validation loss
    val_loss /= len(val_dataloader.dataset)

    # Print epoch summary
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')


Epoch 1/20, Training Loss: 2450.9058, Validation Loss: 2166.7591
Epoch 2/20, Training Loss: 2149.8009, Validation Loss: 2157.2379
Epoch 3/20, Training Loss: 2132.9645, Validation Loss: 2119.9982
Epoch 4/20, Training Loss: 1840.0275, Validation Loss: 1055.1729
Epoch 5/20, Training Loss: 988.4210, Validation Loss: 957.6894
Epoch 6/20, Training Loss: 902.1785, Validation Loss: 894.1413
Epoch 7/20, Training Loss: 889.9994, Validation Loss: 893.3907
Epoch 8/20, Training Loss: 889.3732, Validation Loss: 892.9784
Epoch 9/20, Training Loss: 889.1023, Validation Loss: 892.3953
Epoch 10/20, Training Loss: 888.9682, Validation Loss: 892.4054
Epoch 11/20, Training Loss: 888.9301, Validation Loss: 892.4731
Epoch 12/20, Training Loss: 888.8929, Validation Loss: 892.1134
Epoch 13/20, Training Loss: 888.7906, Validation Loss: 892.2294
Epoch 14/20, Training Loss: 888.8641, Validation Loss: 892.4237
Epoch 15/20, Training Loss: 888.7865, Validation Loss: 892.2262
Epoch 16/20, Training Loss: 888.7713, Val

In [17]:
model_save_path = "autoencoder_model.pth"
torch.save(model.state_dict(), model_save_path)

In [18]:
model = AutoEncoder()
model.load_state_dict(torch.load(model_save_path))
model.eval()

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=18, out_features=9, bias=True)
    (1): ReLU()
    (2): Linear(in_features=9, out_features=6, bias=True)
    (3): ReLU()
    (4): Linear(in_features=6, out_features=3, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=3, out_features=6, bias=True)
    (1): ReLU()
    (2): Linear(in_features=6, out_features=9, bias=True)
    (3): ReLU()
    (4): Linear(in_features=9, out_features=18, bias=True)
    (5): ReLU()
  )
)

## Signal Dataset Evaluation

In [19]:
with open('signal_data.txt', 'r') as f:
    data = f.read()
parsed_signal_data = []
lines = data.split('\n')
print(len(lines))
i = 0
while i < len(lines):
    if i%10000 ==0:
        print("working: {}".format(i))
    line = lines[i].strip()
    i += 1  # Increment i here to ensure it always moves to the next line

    if line:
        if line.isdigit():  # Check if the line contains only digits (number of jets)
            num_jets = int(line)
            jets_data = []

            for _ in range(num_jets):
                if i < len(lines):
                    jet_values = [float(val) for val in lines[i].split()]
                    jets_data.extend(jet_values)
                    i += 1  # Increment i for each jet line

            # Padding if fewer than 6 jets
            while len(jets_data) < 6 * 3:
                jets_data.append(0.0)
                print("padding {}".format(line))

            # Truncate if more than 6 jets (optional, based on your requirement)
            jets_data = jets_data[:18]
            parsed_signal_data.append(jets_data)

# Display the parsed data
print(len(parsed_signal_data))


# Path for the new CSV file
output_file_path = 'parsed_signal_data.csv'

# Writing the parsed data to a CSV file
with open(output_file_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(parsed_signal_data)

404071
working: 0
padding 4
padding 4
padding 4
padding 4
padding 4
padding 4
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 4
padding 4
padding 4
padding 4
padding 4
padding 4
padding 5
padding 5
padding 5
padding 3
padding 3
padding 3
padding 3
padding 3
padding 3
padding 3
padding 3
padding 3
padding 4
padding 4
padding 4
padding 4
padding 4
padding 4
padding 5
padding 5
padding 5
padding 5
padding 5
padding 5
padding 3
padding 3
padding 3
padding 3
padding 3
padding 3
padding 3
padding 3
padding 3
padding 4
padding 4
padding 4
padding 4
padding 4
padding 4
padding 5
padding 5
padding 5
padding 5
padding 5
padding 5
padding 4
padding 4
padding 4
padding 4
padding 4
padding 4
padding 5
padding 5
padding 5
padding 5
padding 5
padding 5
padding 4
padding 4
padding 4
padding 4
padding 4
padding 4
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
padding 2
pa

In [None]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU is available')
else:
    device = torch.device('cpu')
    print('GPU not available, CPU used')

tensor_data = torch.tensor(parsed_signal_data, dtype=torch.float)
signal_dataset = TensorDataset(tensor_data)
signal_dataloader = DataLoader(signal_dataset, batch_size=1024, shuffle=False)

model = AutoEncoder()  # Replace with your actual model class name
model.load_state_dict(torch.load('autoencoder_model.pth', map_location=device))
model.to(device)
model.eval()



CPU 10900k, 256 batch-size (DISCARDED)
Epoch 1/20, Training Loss: 34560.8617, Validation Loss: 31670.4607
Epoch 2/20, Training Loss: 31635.0025, Validation Loss: 31524.3709
Epoch 3/20, Training Loss: 31559.0802, Validation Loss: 31507.3812
Epoch 4/20, Training Loss: 31554.0866, Validation Loss: 31505.4354
Epoch 5/20, Training Loss: 31551.4262, Validation Loss: 31501.6844
Epoch 6/20, Training Loss: 31547.6684, Validation Loss: 31497.3715
Epoch 7/20, Training Loss: 31542.6937, Validation Loss: 31491.3277
Epoch 8/20, Training Loss: 31535.9983, Validation Loss: 31484.2964
Epoch 9/20, Training Loss: 31520.5337, Validation Loss: 31449.6059
Epoch 10/20, Training Loss: 31408.5225, Validation Loss: 31190.0022
Epoch 11/20, Training Loss: 30967.4737, Validation Loss: 30704.1642
Epoch 12/20, Training Loss: 30714.5026, Validation Loss: 30642.9612
Epoch 13/20, Training Loss: 30680.3009, Validation Loss: 30601.1572
Epoch 14/20, Training Loss: 30647.4355, Validation Loss: 30582.1140
Epoch 15/20, Training Loss: 30636.0626, Validation Loss: 30576.0879
Epoch 16/20, Training Loss: 30632.0933, Validation Loss: 30574.2543
Epoch 17/20, Training Loss: 30629.5754, Validation Loss: 30570.8025
Epoch 18/20, Training Loss: 30418.0364, Validation Loss: 30306.1500
Epoch 19/20, Training Loss: 30357.0964, Validation Loss: 30298.2949
Epoch 20/20, Training Loss: 30351.6697, Validation Loss: 30294.0157

GPU 3090, 1024 batch-size (USED)
Epoch 1/20, Training Loss: 2450.9058, Validation Loss: 2166.7591
Epoch 2/20, Training Loss: 2149.8009, Validation Loss: 2157.2379
Epoch 3/20, Training Loss: 2132.9645, Validation Loss: 2119.9982
Epoch 4/20, Training Loss: 1840.0275, Validation Loss: 1055.1729
Epoch 5/20, Training Loss: 988.4210, Validation Loss: 957.6894
Epoch 6/20, Training Loss: 902.1785, Validation Loss: 894.1413
Epoch 7/20, Training Loss: 889.9994, Validation Loss: 893.3907
Epoch 8/20, Training Loss: 889.3732, Validation Loss: 892.9784
Epoch 9/20, Training Loss: 889.1023, Validation Loss: 892.3953
Epoch 10/20, Training Loss: 888.9682, Validation Loss: 892.4054
Epoch 11/20, Training Loss: 888.9301, Validation Loss: 892.4731
Epoch 12/20, Training Loss: 888.8929, Validation Loss: 892.1134
Epoch 13/20, Training Loss: 888.7906, Validation Loss: 892.2294
Epoch 14/20, Training Loss: 888.8641, Validation Loss: 892.4237
Epoch 15/20, Training Loss: 888.7865, Validation Loss: 892.2262
Epoch 16/20, Training Loss: 888.7713, Validation Loss: 892.1217
Epoch 17/20, Training Loss: 888.6293, Validation Loss: 891.9074
Epoch 18/20, Training Loss: 888.6581, Validation Loss: 892.2521
Epoch 19/20, Training Loss: 888.6596, Validation Loss: 892.8734
Epoch 20/20, Training Loss: 888.5858, Validation Loss: 891.8710
