In [1]:
from pathlib import Path
import random

from huggingface_hub import list_repo_files, hf_hub_download
import numpy as np
from natsort import natsorted
import pandas as pd
from tqdm.auto import tqdm

### Fix Seeds

In [2]:
SEED = 42

random.seed(SEED)
np.random.seed(SEED)

### Load File Paths

In [3]:
files = list_repo_files(repo_id="ashynf/OpenFWI", repo_type="dataset")
files.remove(".gitattributes")
files.remove("README.md")
print(files)
print(len(files))

['CurveFault_A/seis2_1_0.npy', 'CurveFault_A/seis2_1_1.npy', 'CurveFault_A/seis2_1_10.npy', 'CurveFault_A/seis2_1_11.npy', 'CurveFault_A/seis2_1_12.npy', 'CurveFault_A/seis2_1_13.npy', 'CurveFault_A/seis2_1_14.npy', 'CurveFault_A/seis2_1_15.npy', 'CurveFault_A/seis2_1_16.npy', 'CurveFault_A/seis2_1_17.npy', 'CurveFault_A/seis2_1_18.npy', 'CurveFault_A/seis2_1_19.npy', 'CurveFault_A/seis2_1_2.npy', 'CurveFault_A/seis2_1_20.npy', 'CurveFault_A/seis2_1_21.npy', 'CurveFault_A/seis2_1_22.npy', 'CurveFault_A/seis2_1_23.npy', 'CurveFault_A/seis2_1_24.npy', 'CurveFault_A/seis2_1_25.npy', 'CurveFault_A/seis2_1_26.npy', 'CurveFault_A/seis2_1_27.npy', 'CurveFault_A/seis2_1_28.npy', 'CurveFault_A/seis2_1_29.npy', 'CurveFault_A/seis2_1_3.npy', 'CurveFault_A/seis2_1_30.npy', 'CurveFault_A/seis2_1_31.npy', 'CurveFault_A/seis2_1_32.npy', 'CurveFault_A/seis2_1_33.npy', 'CurveFault_A/seis2_1_34.npy', 'CurveFault_A/seis2_1_35.npy', 'CurveFault_A/seis2_1_4.npy', 'CurveFault_A/seis2_1_5.npy', 'CurveFault_A

### Make a Output Directory

In [4]:
output_dir = Path("../data")
if not output_dir.is_dir():
    output_dir.mkdir(exist_ok=True)

In [5]:
exists = {
    "CurveFault_A": set(),
    "CurveFault_B": set(),
    "CurveVel_A": set(),
    "CurveVel_B": set(),
    "FlatFault_A": set(),
    "FlatFault_B": set(),
    "FlatVel_A": set(),
    "FlatVel_B": set(),
    "Style_A": set(),
    "Style_B": set(),
}

for k, v in exists.items():
    for p in output_dir.joinpath(k).glob("*.npz"):
        exists[k].add("_".join(p.stem.split("_")[:-1]))
display(exists)

for k, v in exists.items():
    print(k, len(v))

{'CurveFault_A': {'seis2_1_0_vel2_1_0',
  'seis2_1_10_vel2_1_10',
  'seis2_1_11_vel2_1_11',
  'seis2_1_12_vel2_1_12',
  'seis2_1_13_vel2_1_13',
  'seis2_1_14_vel2_1_14',
  'seis2_1_15_vel2_1_15',
  'seis2_1_16_vel2_1_16',
  'seis2_1_17_vel2_1_17',
  'seis2_1_18_vel2_1_18',
  'seis2_1_19_vel2_1_19',
  'seis2_1_1_vel2_1_1',
  'seis2_1_20_vel2_1_20',
  'seis2_1_21_vel2_1_21',
  'seis2_1_22_vel2_1_22',
  'seis2_1_23_vel2_1_23',
  'seis2_1_24_vel2_1_24',
  'seis2_1_25_vel2_1_25',
  'seis2_1_26_vel2_1_26',
  'seis2_1_27_vel2_1_27',
  'seis2_1_28_vel2_1_28',
  'seis2_1_29_vel2_1_29',
  'seis2_1_2_vel2_1_2',
  'seis2_1_30_vel2_1_30',
  'seis2_1_31_vel2_1_31',
  'seis2_1_32_vel2_1_32',
  'seis2_1_33_vel2_1_33',
  'seis2_1_34_vel2_1_34',
  'seis2_1_35_vel2_1_35',
  'seis2_1_3_vel2_1_3',
  'seis2_1_4_vel2_1_4',
  'seis2_1_5_vel2_1_5',
  'seis2_1_6_vel2_1_6',
  'seis2_1_7_vel2_1_7',
  'seis2_1_8_vel2_1_8',
  'seis2_1_9_vel2_1_9',
  'seis3_1_0_vel3_1_0',
  'seis3_1_10_vel3_1_10',
  'seis3_1_11_vel3

CurveFault_A 108
CurveFault_B 108
CurveVel_A 60
CurveVel_B 60
FlatFault_A 108
FlatFault_B 108
FlatVel_A 60
FlatVel_B 60
Style_A 100
Style_B 100


### Load and Save Data

In [6]:
def check_already_exist(x: pd.Series):
    path = x.iloc[0].split("/")[-1][:-4] + "_" + x.iloc[1].split("/")[-1][:-4]
    if path in exists[k]:
        return True
    else:
        return False

In [7]:
families = {
    # "CurveFault_A": [],
    # "CurveFault_B": [],
    # "CurveVel_A": ["data", "model"],
    # "CurveVel_B": ["data", "model"],
    # "FlatFault_A": [],
    # "FlatFault_B": [],
    # "FlatVel_A": ["data", "model"],
    # "FlatVel_B": ["data", "model"],
    "Style_A": ["data", "model"],
    "Style_B": ["data", "model"],
}

for k, v in tqdm(families.items(), desc="Outer loop", total=len(families)):
    if len(v) == 0:
        columns = ["seis", "vel"]
    elif len(v) == 2:
        columns = v
    df = pd.DataFrame(columns=columns)
    for col in columns:
        df[col] = natsorted([f for f in files if k+"/"+col in f])
    df = df.sample(frac=1.0, replace=False, random_state=SEED)
    df["exist"] = df.apply(check_already_exist, axis=1)
    df = df.query("exist == False")
    display(k, len(df), df.head(), "="*100)
    for _, row in tqdm(df.iterrows(), desc="Inner loop", leave=False, total=len(df)):
        path = row.iloc[0].split("/")[-1][:-4] + "_" + row.iloc[1].split("/")[-1][:-4]
        if not path in exists[k]:
            for col in columns:
                print(row[col])
                hf_hub_download(repo_id="ashynf/OpenFWI", repo_type="dataset", filename=row[col])

Outer loop:   0%|          | 0/2 [00:00<?, ?it/s]

'Style_A'

34

Unnamed: 0,data,model,exist
88,Style_A/data/data89.npy,Style_A/model/model89.npy,False
115,Style_A/data/data116.npy,Style_A/model/model116.npy,False
57,Style_A/data/data58.npy,Style_A/model/model58.npy,False
75,Style_A/data/data76.npy,Style_A/model/model76.npy,False
32,Style_A/data/data33.npy,Style_A/model/model33.npy,False




Inner loop:   0%|          | 0/34 [00:00<?, ?it/s]

Style_A/data/data89.npy


data89.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model89.npy


model89.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data116.npy


data116.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model116.npy


model116.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data58.npy


data58.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model58.npy


model58.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data76.npy


data76.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model76.npy


model76.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data33.npy


data33.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model33.npy


model33.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data129.npy


data129.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model129.npy


model129.npy:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Style_A/data/data60.npy


data60.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model60.npy


model60.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data64.npy


data64.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model64.npy


model64.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data114.npy


data114.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model114.npy


model114.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data38.npy


data38.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model38.npy


model38.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data30.npy


data30.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model30.npy


model30.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data108.npy


data108.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model108.npy


model108.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data127.npy


data127.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model127.npy


model127.npy:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Style_A/data/data2.npy


data2.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model2.npy


model2.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data53.npy


data53.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model53.npy


model53.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data22.npy


data22.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model22.npy


model22.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data3.npy


data3.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model3.npy


model3.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data24.npy


data24.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model24.npy


model24.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data104.npy


data104.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model104.npy


model104.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data100.npy


data100.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model100.npy


model100.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data117.npy


data117.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model117.npy


model117.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data88.npy


data88.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model88.npy


model88.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data124.npy


data124.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model124.npy


model124.npy:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Style_A/data/data75.npy


data75.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model75.npy


model75.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data87.npy


data87.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model87.npy


model87.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data83.npy


data83.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model83.npy


model83.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data122.npy


data122.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model122.npy


model122.npy:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Style_A/data/data134.npy


data134.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model134.npy


model134.npy:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Style_A/data/data21.npy


data21.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model21.npy


model21.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data72.npy


data72.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model72.npy


model72.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data107.npy


data107.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model107.npy


model107.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data15.npy


data15.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model15.npy


model15.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data93.npy


data93.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model93.npy


model93.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_A/data/data103.npy


data103.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_A/model/model103.npy


model103.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

'Style_B'

34

Unnamed: 0,data,model,exist
88,Style_B/data/data89.npy,Style_B/model/model89.npy,False
115,Style_B/data/data116.npy,Style_B/model/model116.npy,False
57,Style_B/data/data58.npy,Style_B/model/model58.npy,False
75,Style_B/data/data76.npy,Style_B/model/model76.npy,False
32,Style_B/data/data33.npy,Style_B/model/model33.npy,False




Inner loop:   0%|          | 0/34 [00:00<?, ?it/s]

Style_B/data/data89.npy


data89.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model89.npy


model89.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data116.npy


data116.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model116.npy


model116.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data58.npy


data58.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model58.npy


model58.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data76.npy


data76.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model76.npy


model76.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data33.npy


data33.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model33.npy


model33.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data129.npy


data129.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model129.npy


model129.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data60.npy


data60.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model60.npy


model60.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data64.npy


data64.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model64.npy


model64.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data114.npy


data114.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model114.npy


model114.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data38.npy


data38.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model38.npy


model38.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data30.npy


data30.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model30.npy


model30.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data108.npy


data108.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model108.npy


model108.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data127.npy


data127.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model127.npy


model127.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data2.npy


data2.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model2.npy


model2.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data53.npy


data53.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model53.npy


model53.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data22.npy


data22.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model22.npy


model22.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data3.npy


data3.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model3.npy


model3.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data24.npy


data24.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model24.npy


model24.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data104.npy


data104.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model104.npy


model104.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data100.npy


data100.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model100.npy


model100.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data117.npy


data117.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model117.npy


model117.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data88.npy


data88.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model88.npy


model88.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data124.npy


data124.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model124.npy


model124.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data75.npy


data75.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model75.npy


model75.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data87.npy


data87.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model87.npy


model87.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data83.npy


data83.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model83.npy


model83.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data122.npy


data122.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model122.npy


model122.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data134.npy


data134.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model134.npy


model134.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data21.npy


data21.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model21.npy


model21.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data72.npy


data72.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model72.npy


model72.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data107.npy


data107.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model107.npy


model107.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data15.npy


data15.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model15.npy


model15.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data93.npy


data93.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model93.npy


model93.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Style_B/data/data103.npy


data103.npy:   0%|          | 0.00/700M [00:00<?, ?B/s]

Style_B/model/model103.npy


model103.npy:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

In [8]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
none             30G     0   30G   0% /usr/lib/modules/5.15.167.4-microsoft-standard-WSL2
none             30G  4.0K   30G   1% /mnt/wsl
drivers         930G  725G  206G  78% /usr/lib/wsl/drivers
/dev/sdb        672G  602G   41G  94% /
none             30G   88K   30G   1% /mnt/wslg
none             30G     0   30G   0% /usr/lib/wsl/lib
rootfs           30G  2.4M   30G   1% /init
none             30G     0   30G   0% /run
none             30G     0   30G   0% /run/lock
none             30G  4.0K   30G   1% /run/shm
none             30G     0   30G   0% /run/user
none             30G   76K   30G   1% /mnt/wslg/versions.txt
none             30G   76K   30G   1% /mnt/wslg/doc
C:\             930G  725G  206G  78% /mnt/c
D:\             932G  180M  932G   1% /mnt/d
