In [1]:
import torch as th
import pytest
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
import os
import glob

In [3]:
class SeismicDataset(Dataset):
    """Dataset that pairs every earthquake recording with a randomly drawn noise recording.

    Each item is a tuple ``(noisy_eq, eq_tensor)`` where ``eq_tensor`` stacks the
    Z, N and E earthquake components (in that order) along axis 0 and
    ``noisy_eq`` is that tensor plus an equally windowed noise recording.
    """

    # Number of samples kept per channel in every returned window.
    WINDOW_SAMPLES = 6000
    # Exclusive upper bounds of the random window start used when ``randomized`` is set.
    MAX_EQ_OFFSET = 6000
    MAX_NOISE_OFFSET = 12000
    # Component suffixes of the .npz keys, in the order they are stacked.
    CHANNELS = ("Z", "N", "E")

    def __init__(self, signal_folder_path: str, noise_folder_path: str, randomized = False):

        """
        Args:
            signal_folder_path (str): Path to earthquake signal folder containing .npz files.
            noise_folder_path (str): Path to noise folder containing .npz files.
            randomized (bool): If True, the start of the earthquake and noise windows
                is drawn at random for every item; otherwise both windows start at 0.
        """

        self.signal_folder_path = signal_folder_path
        self.noise_folder_path = noise_folder_path
        # glob returns files in arbitrary, filesystem-dependent order; sorting keeps
        # the index -> file mapping stable across runs and machines.
        self.eq_signal_files = sorted(glob.glob(f'{signal_folder_path}/**/*.npz', recursive=True))
        self.noise_files = sorted(glob.glob(f'{noise_folder_path}/**/*.npz', recursive=True))
        self.randomized = randomized

    def __len__(self) -> int:
        """Return the number of earthquake files found (noise files do not count)."""
        return len(self.eq_signal_files)

    def __getitem__(self, idx) -> tuple[th.Tensor, th.Tensor]:
        """Load the earthquake at ``idx`` and superimpose a randomly chosen noise file.

        Args:
            idx: Index into the list of earthquake files.

        Returns:
            tuple[th.Tensor, th.Tensor]: ``(noisy_eq, eq_tensor)``; ``eq_tensor`` is the
            clean three-channel earthquake window and ``noisy_eq`` is the same window
            with the noise window added.

        Raises:
            IndexError: If ``idx`` is outside the list of earthquake files.
            ValueError: If no noise files were found in ``noise_folder_path``.
        """
        eq_path = self.eq_signal_files[idx]

        if not self.noise_files:
            # np.random.randint(0, 0) would raise an opaque "low >= high" ValueError.
            raise ValueError(f"No .npz noise files found under {self.noise_folder_path!r}")
        noise_path = self.noise_files[np.random.randint(0, len(self.noise_files))]

        eq_start = 0
        noise_start = 0
        if self.randomized:
            # NOTE(review): assumes every trace holds at least offset + WINDOW_SAMPLES
            # samples; shorter traces would yield shorter windows -- TODO confirm.
            eq_start = np.random.randint(low=0, high=self.MAX_EQ_OFFSET)
            noise_start = np.random.randint(low=0, high=self.MAX_NOISE_OFFSET)

        eq_tensor = self._load_window(eq_path, "earthquake", eq_start)
        noise_tensor = self._load_window(noise_path, "noise", noise_start)

        noisy_eq = eq_tensor + noise_tensor

        return noisy_eq, eq_tensor

    def _load_window(self, path: str, prefix: str, start: int) -> th.Tensor:
        """Read the ``{prefix}_waveform_{Z,N,E}`` arrays of one .npz file as a stacked window."""
        stop = start + self.WINDOW_SAMPLES
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
        # execute arbitrary code; only point this dataset at trusted files.
        # The context manager closes the underlying archive once the arrays are read.
        with np.load(path, allow_pickle=True) as archive:
            channels = [
                archive[f"{prefix}_waveform_{component}"][start:stop]
                for component in self.CHANNELS
            ]
        return th.from_numpy(np.stack(channels, axis=0))


In [5]:
# ipytest is used to run the pytest-style test functions defined in this cell.
import ipytest
# Apply ipytest's automatic notebook configuration before any test is run.
ipytest.autoconfig()

def test_dataset_length():
    """Check that the dataset length equals the number of signal files in the training folder."""
    signal_folder = "C:/Users/cleme/ETH/Master/DataLab/dsl-as24-challenge-3/data/signal/train"
    noise_folder = "C:/Users/cleme/ETH/Master/DataLab/dsl-as24-challenge-3/data/noise/train"
    # The folders are machine-specific; skip instead of failing with a misleading
    # "0 == 20230" assertion on machines where the data is not present.
    if not (os.path.isdir(signal_folder) and os.path.isdir(noise_folder)):
        pytest.skip("local training data folders are not available")
    dataset = SeismicDataset(signal_folder, noise_folder)
    # NOTE(review): 20230 is presumably the number of .npz files in the signal
    # training folder -- confirm if the data set changes.
    assert len(dataset) == 20230, f"unexpected number of signal files: {len(dataset)}"

def test_output_shape():
    """Check that one batch from the DataLoader has shape (batch, channels, samples) = (1, 3, 6000)."""
    signal_folder = "C:/Users/cleme/ETH/Master/DataLab/dsl-as24-challenge-3/data/signal/train"
    noise_folder = "C:/Users/cleme/ETH/Master/DataLab/dsl-as24-challenge-3/data/noise/train"
    # The folders are machine-specific; skip instead of erroring where they are absent.
    if not (os.path.isdir(signal_folder) and os.path.isdir(noise_folder)):
        pytest.skip("local training data folders are not available")
    dataset = SeismicDataset(signal_folder, noise_folder)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    # The dataset yields (noisy earthquake, clean earthquake); the second element
    # is the clean signal, not the noise.
    noisy_batch, clean_batch = next(iter(dataloader))
    assert noisy_batch.shape == (1,3,6000)
    assert clean_batch.shape == (1,3,6000)


# Run the tests defined above; '-vv' requests verbose per-test output.
ipytest.run('-vv')

platform win32 -- Python 3.12.3, pytest-8.3.3, pluggy-1.5.0 -- c:\Users\cleme\miniconda3\envs\dsl\python.exe
cachedir: .pytest_cache
rootdir: c:\Users\cleme\ETH\Master\DataLab\dsl-as24-challenge-3
plugins: anyio-4.2.0, typeguard-4.3.0
[1mcollecting ... [0mcollected 2 items

t_e216c054b0144360a2e83cb3a6ddf972.py::test_dataset_length [32mPASSED[0m[32m                            [ 50%][0m
t_e216c054b0144360a2e83cb3a6ddf972.py::test_output_shape [32mPASSED[0m[32m                              [100%][0m



<ExitCode.OK: 0>