In [1]:
from pathlib import Path

from natsort import natsorted
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Local Hugging Face cache snapshot of the ashynf/OpenFWI dataset.
# Built from the current user's home directory instead of a hardcoded "/home/<user>"
# so the notebook also runs under a different account.
dir_path = (
    Path.home()
    / ".cache/huggingface/hub"
    / "datasets--ashynf--OpenFWI"
    / "snapshots"
    / "9ed98af51d841bc7b7a43911021f87df6a730db5"
)
# Sorted so the listing is stable across runs (glob order is filesystem-dependent).
print(sorted(p.stem for p in dir_path.glob("*")))

['Style_A', 'Style_B']


In [3]:
# Dataset families mapped to their on-disk layout inside `dir_path`:
#   []                -> inputs ("seis" in the file stem) and targets ("vel" in the
#                        file stem) sit side by side in the family folder
#   [x_dir, y_dir]    -> inputs live in "<family>/<x_dir>", targets in "<family>/<y_dir>"
families = {
    "CurveFault_A": [],
    "CurveFault_B": [],
    "CurveVel_A": ["data", "model"],
    "CurveVel_B": ["data", "model"],
    "FlatFault_A": [],
    "FlatFault_B": [],
    "FlatVel_A": ["data", "model"],
    "FlatVel_B": ["data", "model"],
    "Style_A": ["data", "model"],
    "Style_B": ["data", "model"],
}

# One frame per family that actually has files; columns: x (input path),
# y (target path), families (family name).
family_frames = []
for k, v in families.items():
    family_dir = dir_path.joinpath(k)
    if len(v) == 0:
        entries = list(family_dir.glob("*"))
        x_paths = natsorted([p for p in entries if "seis" in p.stem])
        y_paths = natsorted([p for p in entries if "vel" in p.stem])
    elif len(v) == 2:
        x_dir, y_dir = v
        x_paths = natsorted(list(family_dir.joinpath(x_dir).glob("*")))
        y_paths = natsorted(list(family_dir.joinpath(y_dir).glob("*")))
    else:
        raise ValueError(f"Unsupported layout for family {k!r}: {v!r}")

    # Inputs and targets are paired purely by natural sort order, so the counts must agree.
    assert len(x_paths) == len(y_paths), (
        f"{k}: {len(x_paths)} input files vs {len(y_paths)} target files"
    )

    # Families that are absent from the snapshot contribute nothing; skipping them
    # keeps empty frames out of pd.concat.
    if not x_paths:
        continue

    family_frames.append(pd.DataFrame({"x": x_paths, "y": y_paths, "families": k}))

if family_frames:
    df_paths = pd.concat(family_frames, ignore_index=True)
else:
    # Nothing found at all: keep the expected schema so downstream cells still work.
    df_paths = pd.DataFrame(columns=["x", "y", "families"])
display(df_paths)

Unnamed: 0,x,y,families
0,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_A
1,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_A
2,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_A
3,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_A
4,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_A
...,...,...,...
63,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_B
64,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_B
65,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_B
66,/home/ss/.cache/huggingface/hub/datasets--ashy...,/home/ss/.cache/huggingface/hub/datasets--ashy...,Style_B


In [4]:
# Shell out to `df -h` to record disk usage (human-readable sizes) before the export runs.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
none             30G     0   30G   0% /usr/lib/modules/5.15.167.4-microsoft-standard-WSL2
none             30G  4.0K   30G   1% /mnt/wsl
drivers         930G  725G  206G  78% /usr/lib/wsl/drivers
/dev/sdb        672G  602G   41G  94% /
none             30G   88K   30G   1% /mnt/wslg
none             30G     0   30G   0% /usr/lib/wsl/lib
rootfs           30G  2.4M   30G   1% /init
none             30G     0   30G   0% /run
none             30G     0   30G   0% /run/lock
none             30G     0   30G   0% /run/shm
none             30G     0   30G   0% /run/user
none             30G   76K   30G   1% /mnt/wslg/versions.txt
none             30G   76K   30G   1% /mnt/wslg/doc
C:\             930G  725G  206G  78% /mnt/c
D:\             932G  180M  932G   1% /mnt/d


In [5]:
# Root folder for the exported files, relative to the notebook's working directory.
# exist_ok=True makes the call a no-op when the folder is already there.
output_dir = Path("../data")
output_dir.mkdir(exist_ok=True)

In [6]:
# Optional smoke test (disabled): uncomment to keep only the first file pair of each
# family so the export below can be tried on a small subset.
# df_paths = df_paths.groupby("families").head(1).reset_index(drop=True)
# display(df_paths)

In [7]:
# Split every (input, target) file pair listed in `df_paths` into one compressed .npz
# per sample, written to <output_dir>/<family>/<x stem>_<y stem>_<sample index>.npz
# with the arrays stored under the keys "x" and "y".
fault_families = ("CurveFault_A", "CurveFault_B", "FlatFault_A", "FlatFault_B")

for _, row in tqdm(df_paths.iterrows(), desc="Outer Loop", total=len(df_paths)):
    x_path = row["x"]
    y_path = row["y"]
    # Named `family` (not `families`) so the families dict defined in an earlier cell
    # is not overwritten by a string.
    family = row["families"]

    # Drop the leading file-name prefix to obtain the shard id of each file.
    # Assumes stems look like "seis<id>"/"vel<id>" for the fault families and
    # "data<id>"/"model<id>" otherwise (prefix lengths 4/3 and 4/5) — TODO confirm.
    suffix_seis = x_path.stem[4:]
    suffix_vel = y_path.stem[3:] if family in fault_families else y_path.stem[5:]

    print(family, suffix_seis, suffix_vel)
    # Validate the pairing before loading anything: the arrays are large and a
    # mismatched pair is skipped anyway.
    if suffix_seis != suffix_vel:
        print(f"Skipping mismatched pair: {x_path.name} / {y_path.name}")
        continue

    x = np.load(x_path)
    y = np.load(y_path)
    assert len(x) == len(y), f"{x_path.name} has {len(x)} samples, {y_path.name} has {len(y)}"

    save_dir = output_dir.joinpath(family)
    save_dir.mkdir(exist_ok=True, parents=True)
    for i in tqdm(range(len(x)), desc="Inner Loop", leave=False):
        np.savez_compressed(save_dir.joinpath(f"{x_path.stem}_{y_path.stem}_{i}.npz"), x=x[i], y=y[i])

Outer Loop:   0%|          | 0/68 [00:00<?, ?it/s]

Style_A 2 2


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 3 3


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 15 15


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 21 21


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 22 22


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 24 24


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 30 30


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 33 33


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 38 38


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 53 53


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 58 58


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 60 60


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 64 64


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 72 72


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 75 75


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 76 76


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 83 83


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 87 87


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 88 88


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 89 89


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 93 93


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 100 100


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 103 103


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 104 104


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 107 107


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 108 108


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 114 114


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 116 116


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 117 117


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 122 122


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 124 124


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 127 127


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 129 129


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_A 134 134


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 2 2


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 3 3


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 15 15


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 21 21


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 22 22


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 24 24


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 30 30


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 33 33


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 38 38


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 53 53


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 58 58


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 60 60


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 64 64


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 72 72


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 75 75


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 76 76


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 83 83


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 87 87


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 88 88


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 89 89


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 93 93


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 100 100


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 103 103


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 104 104


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 107 107


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 108 108


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 114 114


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 116 116


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 117 117


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 122 122


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 124 124


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 127 127


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 129 129


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

Style_B 134 134


Inner Loop:   0%|          | 0/500 [00:00<?, ?it/s]

In [8]:
# Shell out to `df -h` again to see how much disk space the export consumed.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
none             30G     0   30G   0% /usr/lib/modules/5.15.167.4-microsoft-standard-WSL2
none             30G  4.0K   30G   1% /mnt/wsl
drivers         930G  793G  137G  86% /usr/lib/wsl/drivers
/dev/sdb        672G  642G  5.6M 100% /
none             30G   88K   30G   1% /mnt/wslg
none             30G     0   30G   0% /usr/lib/wsl/lib
rootfs           30G  2.4M   30G   1% /init
none             30G     0   30G   0% /run
none             30G     0   30G   0% /run/lock
none             30G  4.0K   30G   1% /run/shm
none             30G     0   30G   0% /run/user
none             30G   76K   30G   1% /mnt/wslg/versions.txt
none             30G   76K   30G   1% /mnt/wslg/doc
C:\             930G  793G  137G  86% /mnt/c
D:\             932G  180M  932G   1% /mnt/d


In [12]:
# Family -> layout table, repeated here so this cell does not depend on whatever
# `families` currently holds in the kernel.
families = {
    "CurveFault_A": [],
    "CurveFault_B": [],
    "CurveVel_A": ["data", "model"],
    "CurveVel_B": ["data", "model"],
    "FlatFault_A": [],
    "FlatFault_B": [],
    "FlatVel_A": ["data", "model"],
    "FlatVel_B": ["data", "model"],
    "Style_A": ["data", "model"],
    "Style_B": ["data", "model"],
}

# Print, per family, how many entries its folder under `output_dir` contains.
for family_name in families:
    family_out_dir = output_dir.joinpath(family_name)
    n_entries = sum(1 for _ in family_out_dir.glob("*"))
    print(family_name, n_entries)

CurveFault_A 54000
CurveFault_B 54000
CurveVel_A 30000
CurveVel_B 30000
FlatFault_A 54000
FlatFault_B 54000
FlatVel_A 30000
FlatVel_B 30000
Style_A 67000
Style_B 67000


In [11]:
# Shell out to `df -h` once more to report the current disk usage (human-readable sizes).
!df -h

Filesystem      Size  Used Avail Use% Mounted on
none             30G     0   30G   0% /usr/lib/modules/5.15.167.4-microsoft-standard-WSL2
none             30G  4.0K   30G   1% /mnt/wsl
drivers         930G  794G  137G  86% /usr/lib/wsl/drivers
/dev/sdb        672G  597G   46G  93% /
none             30G   88K   30G   1% /mnt/wslg
none             30G     0   30G   0% /usr/lib/wsl/lib
rootfs           30G  2.4M   30G   1% /init
none             30G     0   30G   0% /run
none             30G     0   30G   0% /run/lock
none             30G  4.0K   30G   1% /run/shm
none             30G     0   30G   0% /run/user
none             30G   76K   30G   1% /mnt/wslg/versions.txt
none             30G   76K   30G   1% /mnt/wslg/doc
C:\             930G  794G  137G  86% /mnt/c
D:\             932G  180M  932G   1% /mnt/d
