In [191]:
import torch
import os
import numpy as np
from torch.utils.data.dataset import random_split
from torch_geometric.loader import DataLoader

In [190]:
torch.manual_seed(2024)

def load_all_graphs(folder_path):
    # Get a list of all .pt files in the folder
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.pth')]

    # Load all the graphs from the files
    all_graphs = []
    for file_path in file_paths:
        graph_dict = torch.load(file_path)
        all_graphs.append(graph_dict)

    return all_graphs

all_graphs = load_all_graphs('./data/torch_processed/')

# Assuming all_graphs is a list of Data objects
for graph in all_graphs:
    HS_E_red = torch.tensor(graph['HS_E_red'].reshape(-1, 1))
    LS_E_red = torch.tensor(graph['LS_E_red'].reshape(-1, 1))
    graph.y = torch.cat((HS_E_red, LS_E_red), dim=1)

In [192]:
data_loader = DataLoader(all_graphs, shuffle=True)
# Calculate the number of samples for each dataset
num_graphs = len(all_graphs)
num_train = int(num_graphs * 0.7)
num_val = int(num_graphs * 0.15)
num_test = num_graphs - num_train - num_val  # Ensure the remaining data goes to the test set

# Split the dataset
train_data, val_data, test_data = random_split(all_graphs, [num_train, num_val, num_test])

# Create DataLoaders for each dataset
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16, shuffle=False)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

In [186]:
print(f"Number of samples in the training set: {len(train_loader.dataset)}")
print(f"Number of samples in the validation set: {len(val_loader.dataset)}")
print(f"Number of samples in the test set: {len(test_loader.dataset)}")
print(f"Total number of samples: {len(train_loader.dataset) + len(val_loader.dataset) + len(test_loader.dataset)}")

Number of samples in the training set: 445
Number of samples in the validation set: 95
Number of samples in the test set: 97
Total number of samples: 637


In [199]:
# Function to calculate mean and std of 'y'
def calculate_mean_std(loader):
    y_values = []
    for batch in loader:
        y_values.append(batch.y)
    y_values = torch.cat(y_values, dim=0)
    print(y_values.shape)
    mean = y_values.mean(dim=0)
    std = y_values.std(dim=0)
    return mean, std

In [200]:
mean, std = calculate_mean_std(train_loader)
print(mean)
print(std)

torch.Size([445, 2])
tensor([0.9837, 0.5879], dtype=torch.float64)
tensor([0.4854, 0.4394], dtype=torch.float64)


In [189]:
for batch in train_loader:
    print(batch.y.shape)
    break

torch.Size([16, 2])


AttributeError: 'Subset' object has no attribute 'data'