In [4]:
import pandas as pd

meta = pd.read_parquet("metadata.parquet")
print(meta.columns)       # 利用可能な項目を確認

# 例: sample_id=0 の場合
duration = meta.loc[meta.sample_id == 0, "audio_info/duration"].iloc[0]
print(duration)  # 秒単位での長さ


Index(['acoustics/c50_db', 'acoustics/drr_db', 'acoustics/edt_ms',
       'acoustics/frequency_bins', 'acoustics/t20_ms', 'acoustics/t30_ms',
       'audio_info/checksum/ambisonics',
       'audio_info/checksum/noise_ambisonics', 'audio_info/duration',
       'audio_info/frames', 'audio_info/size/ambisonics',
       'audio_info/size/noise_ambisonics', 'lite_version', 'noise/azimuth',
       'noise/deep_noise_suppression_metadata/filename',
       'noise/deep_noise_suppression_metadata/is_audioset',
       'noise/deep_noise_suppression_metadata/label',
       'noise/deep_noise_suppression_metadata/youtube_id', 'noise/distance',
       'noise/elevation', 'noise/snr', 'noise/source_id', 'room/floor_area',
       'room/room_id', 'room/surface_area', 'room/volume', 'sample_id',
       'speech/azimuth', 'speech/directivity_id', 'speech/distance',
       'speech/elevation', 'speech/librispeech_metadata/book_id',
       'speech/librispeech_metadata/chapter_id',
       'speech/librispeech_metad

In [6]:
print(meta.head())

                                    acoustics/c50_db  \
0  [-17.32041497023026, -16.359266083519866, -12....   
1  [-16.021165970483054, -13.737698475512165, -12...   
2  [-18.175496191222948, -17.306835853815034, -19...   
3  [-17.508362813693353, -16.856860683549982, -14...   
4  [-16.59338701427417, -15.792701167896723, -12....   

                                    acoustics/drr_db  \
0  [-106.88784169381441, -96.21020819230088, -85....   
1  [-94.556849284041, -82.3024077514615, -77.9493...   
2  [-100.61744618815055, -90.97003289968285, -82....   
3  [-110.41427074231669, -100.89350190121993, -92...   
4  [-100.84837171985211, -90.97729274409367, -82....   

                                    acoustics/edt_ms  \
0  [2341.8692574118872, 1894.8049907171328, 1874....   
1  [2354.5006957921055, 1676.5186551918446, 1880....   
2  [2341.2589960411165, 1970.2121309429288, 2809....   
3  [2434.225849705927, 1803.1517151255453, 1873.0...   
4  [2368.8539658235904, 1822.9932928347541, 14

In [3]:
# 10秒以上のサンプルを抽出
filtered = meta[meta["audio_info/duration"] >= 10]

# サンプル数を表示
print("10秒以上のサンプル数:", len(filtered))

# 必要に応じて一覧も表示
print(filtered[["sample_id", "audio_info/duration"]])


10秒以上のサンプル数: 143195
        sample_id  audio_info/duration
9               9            10.565687
10             10            10.494063
11             11               12.075
12             12            10.440625
14             14             11.61275
...           ...                  ...
221449     221449            18.259812
221452     221452            11.477375
221454     221454             28.85375
221455     221455            20.457625
221456     221456            19.699625

[143195 rows x 2 columns]


In [None]:
meta = pd.read_parquet("metadata.parquet")

# 10秒以上のものだけ
meta_10s = meta[meta["audio_info/duration"] >= 10]

# 保存先フォルダ
save_dir = "./first10sec"
os.makedirs(save_dir, exist_ok=True)

# 音源が存在するフォルダ
audio_dir = "./preSpatialLibriSpeech"

# 新しいメタデータ格納用リスト
new_metadata = []

for idx, row in meta_10s.iterrows():
    fname = f"{idx:06d}.flac"  # 6桁ゼロ埋めファイル名
    path = os.path.join(audio_dir, fname)
    if not os.path.exists(path):
        print(f"{path}: ファイルが存在しません。スキップ")
        continue

    # 音源読み込み
    audio, sr = sf.read(path)
    if audio.ndim != 2 or audio.shape[1] != 4:
        print(f"{fname}: 4ch(FOA)ではないのでスキップ")
        continue
    if sr != 16000:
        print(f"{fname}: サンプリングレートが16kHzでないのでスキップ")
        continue

    # 10秒分だけ抽出
    audio_10s = audio[:160000, :]

    # 新ファイルパス（ファイル名は元のまま＝index番号.flac）
    out_path = os.path.join(save_dir, fname)
    sf.write(out_path, audio_10s, sr, format='FLAC')

    # 新しいメタデータ用辞書
    row_new = row.copy()
    row_new["audio_info/duration"] = 10.0  # 長さを10.0に
    new_metadata.append(row_new)

# DataFrame化（index付きで）
new_meta_df = pd.DataFrame(new_metadata, index=meta_10s.index)
# 元のindexを維持したままCSV保存
new_meta_df.to_csv("first10sec_metadata.csv")


In [1]:
ls first10sec/*.flac | head


first10sec/000009.flac
first10sec/000010.flac
first10sec/000011.flac
first10sec/000012.flac
first10sec/000014.flac
first10sec/000015.flac
first10sec/000016.flac
first10sec/000017.flac
first10sec/000018.flac
first10sec/000019.flac


In [5]:
import pandas as pd
df = pd.read_csv("/home/takamichi-lab-pc09/SpatialLibriSpeech/first10sec_metadata_zfilled.csv", index_col=0)
print(df.index[:5])


Index([9, 10, 11, 12, 14], dtype='int64')


In [3]:
# metadata側indexをゼロ詰め
df = pd.read_csv("first10sec_metadata.csv", index_col=0)
df.index = df.index.map(lambda x: str(x).zfill(6))
df.to_csv("first10sec_metadata_zfilled.csv")

In [6]:
import pandas as pd
df = pd.read_csv("first10sec_metadata.csv", index_col=0)
print(list(df.index[:5]))


[9, 10, 11, 12, 14]


In [7]:
import pandas as pd
df = pd.read_csv("first10sec_metadata.csv", index_col=0)
df.index = df.index.map(lambda x: str(x).zfill(6))  # 6桁ゼロ詰め
df.to_csv("first10sec_metadata_fixed.csv")


In [8]:
import os
print([os.path.splitext(f)[0] for f in os.listdir("first10sec")][:5])


['000014', '000024', '000017', '000019', '000018']


In [9]:
import pandas as pd
df = pd.read_csv("first10sec_metadata_fixed.csv", index_col=0)
print(list(df.index[:5]))


[9, 10, 11, 12, 14]


In [10]:
import pandas as pd
df = pd.read_csv("first10sec_metadata.csv", index_col=0)

# indexをstrに変換してゼロ詰め
df.index = df.index.map(lambda x: str(x).zfill(6))

# 型がstrになっているかを確認
print(type(df.index[0]), df.index[:5])

# 文字列型のindexでCSV保存
df.to_csv("first10sec_metadata_fixed.csv", index=True)


<class 'str'> Index(['000009', '000010', '000011', '000012', '000014'], dtype='object')


In [12]:
df = pd.read_csv("first10sec_metadata_fixed.csv", index_col=0, dtype={'index': str})
print(list(df.index[:5]))


[9, 10, 11, 12, 14]


In [14]:
from foa_dataset import FOALabeledDataset

ds = FOALabeledDataset("first10sec", "first10sec_metadata_fixed.csv")
I_act, I_rea, y = ds[0]
print(I_act.shape, I_rea.shape, y.shape)   # (3,201,1601) (3,201,1601) (44,)


torch.Size([3, 201, 1601]) torch.Size([3, 201, 1601]) torch.Size([44])


In [15]:
SCALAR_COLS = [
  "speech/azimuth",
  "speech/elevation",
  "speech/distance",
  "room/volume",
]
LIST_COLS = [
  "acoustics/drr_db",
  "acoustics/t30_ms",
]


In [16]:
import pandas as pd

# 元CSV
orig_csv = "first10sec_metadata.csv"
# 抜き出すカラム
use_cols = SCALAR_COLS + LIST_COLS

# 読み込み時に usecols を指定する方法
df = pd.read_csv(orig_csv, index_col=0, usecols=["Unnamed: 0"] + use_cols)

# 列名を揃えたらそのまま保存
df.to_csv("first10sec_metadata_trimmed.csv")


In [17]:
import numpy as np

mean = np.load("label_mean.npy")
std  = np.load("label_std.npy")

print("min(std):", std.min(), "max(std):", std.max())
print("any NaN in mean?", np.isnan(mean).any())
print("any NaN in  std ?", np.isnan(std).any())


min(std): nan max(std): nan
any NaN in mean? False
any NaN in  std ? True


In [2]:
import joblib, numpy as np
drt_sc  = joblib.load("fit/drt_scaler.joblib")
scl_sc  = joblib.load("fit/scalar_scaler.joblib")

print("DRR+T30  mean ± std (1st 5 dims):",
      drt_sc.mean_[:5], drt_sc.scale_[:5])
print("scalar   min_ :", scl_sc.data_min_, "\nscalar   max_ :", scl_sc.data_max_)


DRR+T30  mean ± std (1st 5 dims): [-103.97620987  -94.26289211  -85.28248339  -76.88478839  -68.7917131 ] [8.1505089  8.53076373 8.5444584  8.00855176 7.43740301]
scalar   min_ : [39.506966  -3.1415434  0.4990865 -0.8447054] 
scalar   max_ : [9.517879e+02 3.141578e+00 4.023696e+00 8.529371e-01]


In [4]:
import joblib, numpy as np, pandas as pd
from pathlib import Path

drt_sc  = joblib.load("fit/drt_scaler.joblib")
scl_sc  = joblib.load("fit/scalar_scaler.joblib")

def parse_arr(s):  # fit_scalers.py と同じ関数
    import re
    nums = re.findall(r"[-+]?\d+\.?\d*(?:e[-+]?\d+)?", s)
    return np.asarray(nums, dtype=np.float32)

df = pd.read_csv("trimmed＿first10sec_metadata.csv")
row  = df.sample(1).iloc[0]           # ランダム 1 行

drr  = parse_arr(row["acoustics/drr_db"])
t30  = parse_arr(row["acoustics/t30_ms"])
sc   = np.array([row["room/volume"],
                 row["speech/azimuth"],
                 row["speech/distance"],
                 row["speech/elevation"]], dtype=np.float32)

# 変換
x_drt = np.hstack([drr, t30]).reshape(1, -1)
x_scl = sc.reshape(1, -1)

z_drt = drt_sc.transform(x_drt)[0]
z_scl = scl_sc.transform(x_scl)[0]

print("DRR/T30  ->  mean {:.4f}  std {:.4f}".format(z_drt.mean(), z_drt.std()))
print("scalars  ->  min {:.3f}  max {:.3f}".format(z_scl.min(), z_scl.max()))


DRR/T30  ->  mean -0.0734  std 1.0689
scalars  ->  min 0.125  max 0.636


In [5]:
import joblib, pandas as pd, numpy as np, re, torch

# ---------- 事前に保存したスケーラをロード ----------
drt_sc  = joblib.load("fit/drt_scaler.joblib")       # DRR+T30 用 StandardScaler
vd_sc   = joblib.load("fit/scalar_scaler.joblib")    # volume+distance 用 MinMax

# ---------- 1 行ランダムに取って検証 ----------
df = pd.read_csv("/home/takamichi-lab-pc09/SpatialLibriSpeech/trimmed＿first10sec_metadata.csv")

def parse_arr(txt):
    nums = re.findall(r"[-+]?\d+\.?\d*(?:e[-+]?\d+)?", txt.replace("\n"," "))
    return np.asarray(nums, dtype=np.float32)

row = df.sample(1).iloc[0]

drr = parse_arr(row["acoustics/drr_db"])
t30 = parse_arr(row["acoustics/t30_ms"])

vol  = float(row["room/volume"])
dist = float(row["speech/distance"])
az   = float(row["speech/azimuth"])
el   = float(row["speech/elevation"])

# ---------- 正規化 ----------
drt_norm   = drt_sc.transform(np.hstack([drr, t30])[None])[0]
voldist_n  = vd_sc.transform(np.array([[vol, dist]], dtype=np.float32))[0]  # (vol_norm, dist_norm)

# ---------- 結合（モデルと同順ならOK） ----------
y_norm = np.concatenate([drt_norm, voldist_n, [az, el]])

# ---------- 検証 ----------
print("DRR/T30  mean {:.3f}  std {:.3f}".format(drt_norm.mean(), drt_norm.std()))
print("volume,distance  -> ", voldist_n)              # 0〜1 のはず
print("azimuth,elevation-> ", az, el)                 # 元の度数そのまま
print("NaN or Inf?      -> ", np.isfinite(y_norm).all())


DRR/T30  mean -0.383  std 0.547
volume,distance  ->  [0.17957707 0.6135918 ]
azimuth,elevation->  -0.1602724514273426 0.1019848178375966
NaN or Inf?      ->  True


In [6]:
#!/usr/bin/env python3
"""
python sanity_check.py metadata.csv scalers_dir/
戻り値 0 : 問題なし
戻り値 1 : NaN/Inf を含む行あり（行番号を表示）
"""
import sys, re, joblib, numpy as np, pandas as pd

csv_path, sc_dir = sys.argv[1], sys.argv[2]
drt_sc  = joblib.load(f"{sc_dir}/drt_scaler.joblib")      # 40 dim
vd_sc   = joblib.load(f"{sc_dir}/scalar_scaler.joblib")   # 2 dim

pat = re.compile(r"[-+]?\d+\.?\d*(?:e[-+]?\d+)?")
def arr20(txt):                # 33→20 スライス
    v = np.asarray([float(x) for x in pat.findall(txt.replace("\n"," "))], dtype=np.float32)
    return v[9:29] if v.size==33 else None

df = pd.read_csv(csv_path, dtype=str)
bad_rows = []

for idx, row in df.iterrows():
    drr = arr20(row["acoustics/drr_db"])
    t30 = arr20(row["acoustics/t30_ms"])
    if drr is None or t30 is None:
        bad_rows.append(idx); continue

    # --- スケール変換 ---
    drt_norm = drt_sc.transform(np.hstack([drr, t30]).reshape(1,-1))[0]
    vol  = float(row["room/volume"]); dist = float(row["speech/distance"])
    vd_norm = vd_sc.transform([[vol, dist]])[0]

    # --- NaN / Inf チェック ---
    y = np.concatenate([[row["speech/azimuth"], row["speech/elevation"]],
                        vd_norm, drt_norm])
    if not np.isfinite(y).all():
        bad_rows.append(idx)

if bad_rows:
    print("❌ 問題のある行: ", bad_rows[:20], " ...")   # 多い場合は先頭20だけ表示
    sys.exit(1)
print("✅ 全行 OK — NaN/Inf なし")


IndexError: list index out of range

In [14]:
import pandas as pd
df = pd.read_csv("trimmed＿first10sec_metadata.csv")
df = pd.read_csv("metadata_clean.csv")
orig_len = len(df)
clean_len = len(df[df["acoustics/drr_db"] != "[]"])


In [13]:
import pandas as pd, numpy as np, re

idx = 13678                                               # 問題の行番号
pat = re.compile(r"[-+]?\d+\.?\d*(?:e[-+]?\d+)?")
df = pd.read_csv("/home/takamichi-lab-pc09/SpatialLibriSpeech/metadata_clean.csv", dtype=str)

row = df.iloc[idx]
print("行インデックス:", idx, "\n--- scalar ---")
for c in ["room/volume","speech/azimuth","speech/distance","speech/elevation"]:
    print(c, row[c])

print("\n--- drr_db / t30_ms （抜粋）---")
for col in ["acoustics/drr_db", "acoustics/t30_ms"]:
    nums = [float(x) for x in pat.findall(row[col].replace("\n"," "))]
    print(col, "\n ", nums[8:12], "...", "len=", len(nums))


行インデックス: 13678 
--- scalar ---
room/volume 332.6302795410156
speech/azimuth 0.0011599355045488
speech/distance 2.377574990462681
speech/elevation 0.0680211586246528

--- drr_db / t30_ms （抜粋）---
acoustics/drr_db 
  [-52.58389815, -47.59734406, -43.51650474, -35.83435949] ... len= 33
acoustics/t30_ms 
  [649.4296119, 553.76746021, 603.9556307, 487.49688502] ... len= 33


In [18]:
import pandas as pd
bad = [13678]                       # 問題行インデックス
df = pd.read_csv("/home/takamichi-lab-pc09/SpatialLibriSpeech/trimmed＿first10sec_metadata.csv")

df_clean= pd.read_csv("metadata_clean.csv")

print(len(df_clean))  # 13679 - 1 = 13678
print(len(df))  # 1



134682
134683
