##### Note file 초기화

- data pair마다 분석 시 note를 추가하여 데이터셋마다 특징을 적어 두는 게 좋을 것이라 판단.
- 정확히 서로의 어디가 어떻게 다른지 분석하여 적어둘 필요 존재.
- 얕고 넓게 알기 위해서 정말 쓸데없어 보이는 데이터도 혹시 모르니 전부 생성해 두는 것이 좋을 것이라 판단.


- spectrogram만으로는 보기 어려울 수 있으니 여러가지 feature를 더 알아보도록 하자.
    - class
    - normal
    - anomaly
    - std
    - avg
    - ssim
    - cross-correlation
    - Spectral Centroid
    - Spectral Bandwidth
    - Spectral Flatness
    - Spectral Roll-off
    - RMS (Root Mean Square) 에너지
    - Wavelet Transform
    - power spectrum

다음과 같이 열로 정리해서 넣는다.
    

In [19]:
!pip install pywavelets

Collecting pywavelets
  Downloading pywavelets-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Downloading pywavelets-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pywavelets
Successfully installed pywavelets-1.6.0


In [24]:
import os
import pandas as pd
from tqdm import tqdm
import librosa
import numpy as np
from skimage.metrics import structural_similarity as ssim
from scipy.signal import correlate
from scipy.signal import cwt, ricker
from scipy.fft import fft
import pywt

# -----------------------------
# Configuration Parameters
# -----------------------------
DATASETS_DIR = "../../../datasets/dev"  # Root directory of the datasets (one sub-directory per class)
OUTPUT_CSV_FILE = "all_class_matching_pairs_with_features.csv"  # Name of the CSV file the results are written to

# -----------------------------
# Step 1: Load Dataset with Correct Paths
# -----------------------------
def load_dataset(attributes_file, datasets_dir):
    """Read an attributes CSV and return the listed clip paths with labels.

    Args:
        attributes_file: Path to a CSV file that has a ``file_name`` column.
        datasets_dir: Directory that every ``file_name`` entry is joined onto.

    Returns:
        A ``(file_paths, labels)`` tuple of equally long lists. A label is
        ``'anomaly'`` when the lower-cased ``file_name`` entry contains the
        substring ``anomaly`` and ``'normal'`` otherwise.

    Raises:
        FileNotFoundError: If ``attributes_file`` is not an existing file.
    """
    if not os.path.isfile(attributes_file):
        raise FileNotFoundError(f"Attributes file not found: {attributes_file}")

    relative_names = pd.read_csv(attributes_file)['file_name'].tolist()

    labels = []
    for rel_name in relative_names:
        is_anomaly = 'anomaly' in rel_name.lower()
        labels.append('anomaly' if is_anomaly else 'normal')

    # Debug output: a peek at the first entries and the root they are joined to.
    print(relative_names[:5])
    print(datasets_dir)

    # Only datasets_dir and the CSV entry are joined; no class name is inserted here.
    file_paths = [os.path.join(datasets_dir, rel_name) for rel_name in relative_names]

    return file_paths, labels

# -----------------------------
# Step 2: Find Matching Normal File Based on Filename Patterns
# -----------------------------
def find_matching_normal_file(anomaly_path, normal_paths):
    """Return the first normal path whose file name matches the anomaly's.

    File names are split on ``_``. Token 5 is treated as the clip number and
    token 6 (when present) as a code. A normal file matches when its number
    equals the anomaly's number and, if the anomaly has a code, its code is
    equal as well.

    Args:
        anomaly_path: Path of the anomaly clip.
        normal_paths: Iterable of candidate normal clip paths, searched in order.

    Returns:
        The first matching entry of ``normal_paths``, or ``None`` when the
        anomaly file name has fewer than six tokens or nothing matches.
    """
    def _number_and_code(path):
        # Positions 5 and 6 follow the file-name layout assumed by this script.
        tokens = os.path.basename(path).split("_")
        number = tokens[5] if len(tokens) > 5 else None
        code = tokens[6] if len(tokens) > 6 else None
        return number, code

    anomaly_filename = os.path.basename(anomaly_path)
    wanted_number, wanted_code = _number_and_code(anomaly_path)

    if wanted_number is None:
        print(f"Unexpected file name format: {anomaly_filename}. No direct match found for anomaly file: {anomaly_path}")
        return None

    for candidate in normal_paths:
        number, code = _number_and_code(candidate)
        if number != wanted_number:
            continue
        # Without an anomaly code the number alone decides the match.
        if wanted_code is None or code == wanted_code:
            return candidate

    # No normal file matched.
    return None

# -----------------------------
# 추가: 특징 계산 함수
# -----------------------------
def calculate_features(file_path):
    """Load one audio file and compute its summary features.

    Args:
        file_path: Path of the audio file handed to ``librosa.load`` with
            ``sr=None``.

    Returns:
        A dict with scalar entries ``std``, ``avg``, ``rms``,
        ``spectral_centroid``, ``spectral_bandwidth``, ``spectral_flatness``,
        ``spectral_rolloff``, ``wavelet`` and ``powerspectrum``, plus the raw
        ``signal``, its sampling rate ``sr`` and the dB ``spectrogram``.
    """
    y, sr = librosa.load(file_path, sr=None)

    return {
        # Waveform statistics and frame-averaged spectral descriptors.
        'std': np.std(y),
        'avg': np.mean(y),
        'rms': librosa.feature.rms(y=y).mean(),
        'spectral_centroid': librosa.feature.spectral_centroid(y=y, sr=sr).mean(),
        'spectral_bandwidth': librosa.feature.spectral_bandwidth(y=y, sr=sr).mean(),
        'spectral_flatness': librosa.feature.spectral_flatness(y=y).mean(),
        'spectral_rolloff': librosa.feature.spectral_rolloff(y=y, sr=sr).mean(),
        'wavelet': calculate_wavelet_energy(y, wavelet='cmor1.5-1.0', scales=np.arange(1, 31)),
        # Mean of the squared FFT magnitude over all bins.
        'powerspectrum': (np.abs(fft(y)) ** 2).mean(),
        # Kept so callers can compute cross-correlation and SSIM later.
        'signal': y,
        'sr': sr,
        'spectrogram': calculate_spectrogram(y, sr),
    }


def calculate_wavelet_energy(signal, wavelet='cmor1.5-1.0', scales=np.arange(1, 31)):
    """Return the summed squared magnitude of the signal's CWT coefficients.

    Args:
        signal: Input samples passed to ``pywt.cwt``.
        wavelet: Wavelet name passed to ``pywt.cwt``.
        scales: Scales passed to ``pywt.cwt``.

    Returns:
        Sum over all coefficients of ``|coefficient| ** 2``.
    """
    # pywt.cwt returns a pair; only its first element (the coefficients) is needed.
    cwt_coefficients = pywt.cwt(signal, scales, wavelet)[0]
    magnitudes = np.abs(cwt_coefficients)
    return np.sum(np.square(magnitudes))


# 추가: 스펙트로그램 계산 함수
def calculate_spectrogram(y, sr, n_fft=512, hop_length=256):
    """Return the STFT magnitude of ``y`` converted to decibels.

    Args:
        y: Audio samples.
        sr: Sampling rate; accepted for call-site symmetry but not used here.
        n_fft: FFT size passed to ``librosa.stft``.
        hop_length: Hop length passed to ``librosa.stft``.

    Returns:
        The output of ``librosa.amplitude_to_db`` called with ``ref=np.max``.
    """
    stft_matrix = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    magnitude = np.abs(stft_matrix)
    return librosa.amplitude_to_db(magnitude, ref=np.max)


# -----------------------------
# Step 3: Create Pairs and Save to CSV
# -----------------------------
def create_and_save_pairs_with_features(datasets_dir, output_file):
    """Pair each anomaly clip with a normal clip and write one feature CSV.

    Every sub-directory of ``datasets_dir`` is treated as a class. Its
    ``attributes_00.csv`` lists the clips; features are computed for each
    listed clip that exists on disk, every anomaly clip is matched to a normal
    clip with ``find_matching_normal_file``, and one row per matched pair is
    collected. The scalar feature columns of a row come from the anomaly clip
    only; ``ssim`` and ``cross_correlation`` are written as empty strings.

    Args:
        datasets_dir: Root directory containing one sub-directory per class.
        output_file: Path of the CSV file to write.
    """
    # calculate_features() also returns the whole waveform and spectrogram.
    # Nothing below reads them, and caching them for every clip costs
    # gigabytes, so they are dropped before a result is cached.
    bulky_keys = ('signal', 'spectrogram')

    all_pairs = []

    # Walk over every class directory of the dataset.
    for class_name in os.listdir(datasets_dir):
        class_dir = os.path.join(datasets_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        print(f"Processing class: {class_name}")

        attributes_file = os.path.join(datasets_dir, class_name, "attributes_00.csv")
        try:
            file_paths, labels = load_dataset(attributes_file, datasets_dir)
        except FileNotFoundError:
            print(f"Attributes file not found for class: {class_name}")
            continue

        anomaly_paths = [path for path, label in zip(file_paths, labels) if label == 'anomaly']
        normal_paths = [path for path, label in zip(file_paths, labels) if label == 'normal']

        # The cache is rebuilt per class: lookups below only use paths from
        # this class, so keeping earlier classes' entries would waste memory.
        all_features = {}
        for path in tqdm(file_paths, desc=f"Calculating features for {class_name}"):
            if not os.path.isfile(path):
                print(f"File not found: {path}. Skipping.")
                continue
            try:
                features = calculate_features(path)
            except Exception as e:
                # Best effort: one unreadable clip must not abort the whole run.
                print(f"Error processing {path}: {e}. Skipping.")
                continue
            all_features[path] = {
                key: value for key, value in features.items() if key not in bulky_keys
            }

        # Find the matching normal clip for every anomaly clip.
        for anomaly_path in tqdm(anomaly_paths, desc=f"Matching pairs in {class_name}"):
            matching_normal_path = find_matching_normal_file(anomaly_path, normal_paths)

            if not matching_normal_path:
                print(f"No matching normal file found for anomaly: {anomaly_path}")
                continue

            anomaly_features = all_features.get(anomaly_path)
            normal_features = all_features.get(matching_normal_path)

            if anomaly_features is None or normal_features is None:
                print(f"Features missing for pair: {anomaly_path}, {matching_normal_path}. Skipping.")
                continue

            all_pairs.append({
                "class": class_name,
                "normal": matching_normal_path,
                "anomaly": anomaly_path,
                "handwrite": "",  # left blank for hand-written notes
                "std": anomaly_features['std'],
                "avg": anomaly_features['avg'],
                "ssim": "",  # computed later
                "powerspectrum": anomaly_features['powerspectrum'],
                "cross_correlation": "",  # computed later
                "Spectral Features": f"{anomaly_features['spectral_centroid']}, {anomaly_features['spectral_bandwidth']}, {anomaly_features['spectral_flatness']}, {anomaly_features['spectral_rolloff']}",
                "Spectral Centroid": anomaly_features['spectral_centroid'],
                "Spectral Bandwidth": anomaly_features['spectral_bandwidth'],
                "Spectral Flatness": anomaly_features['spectral_flatness'],
                "Spectral Roll-off": anomaly_features['spectral_rolloff'],
                "RMS (Root Mean Square)": anomaly_features['rms'],
                "Wavelet Transform": anomaly_features['wavelet'],
                "method": "Filename Matching",
            })

    # Write all collected rows in one go.
    all_pairs_df = pd.DataFrame(all_pairs)
    all_pairs_df.to_csv(output_file, index=False)
    print(f"Initial CSV file saved to {output_file}")


# -----------------------------
# 추가: ssim 및 cross_correlation 계산 함수
# -----------------------------
def calculate_ssim_and_cross_corr(all_pairs, output_file):
    """Compute SSIM and cross-correlation per pair and store them in the CSV.

    Args:
        all_pairs: Iterable of dicts providing ``anomaly_spectrogram``,
            ``normal_spectrogram``, ``anomaly_signal`` and ``normal_signal``.
            NOTE(review): the pair dicts built by
            ``create_and_save_pairs_with_features`` in this cell do not carry
            these keys, and this function is not called from the main guard.
        output_file: CSV file that is read, gets its ``ssim`` and
            ``cross_correlation`` columns replaced, and is written back.
    """
    ssim_values = []
    cross_corr_values = []

    for pair in tqdm(all_pairs, desc="Calculating ssim and cross_correlation"):
        spec_anomaly = pair['anomaly_spectrogram']
        spec_normal = pair['normal_spectrogram']

        # A ValueError (e.g. spectrograms of different size) is recorded as NaN.
        try:
            score = ssim(
                spec_anomaly,
                spec_normal,
                full=True,
                data_range=spec_anomaly.max() - spec_anomaly.min(),
            )[0]
        except ValueError:
            score = np.nan

        wave_anomaly = pair['anomaly_signal']
        wave_normal = pair['normal_signal']

        # Truncate both signals to the shorter length before correlating.
        common_length = min(len(wave_anomaly), len(wave_normal))
        correlation = np.correlate(
            wave_anomaly[:common_length], wave_normal[:common_length], mode='valid'
        )[0]

        ssim_values.append(score)
        cross_corr_values.append(correlation)

    # Overwrite the two placeholder columns and save the table back.
    table = pd.read_csv(output_file)
    table['ssim'] = ssim_values
    table['cross_correlation'] = cross_corr_values
    table.to_csv(output_file, index=False)
    print(f"Final CSV file with ssim and cross_correlation saved to {output_file}")

# -----------------------------
# Main Execution
# -----------------------------
if __name__ == "__main__":
    # Build the pair/feature CSV for every class found under DATASETS_DIR.
    # NOTE(review): calculate_ssim_and_cross_corr is not invoked here, so the
    # "ssim" and "cross_correlation" columns stay empty in the output file.
    create_and_save_pairs_with_features(DATASETS_DIR, OUTPUT_CSV_FILE)


Processing class: ToyCar
['ToyCar/test/section_00_source_test_anomaly_0001_car_B1_spd_31V_mic_1.wav', 'ToyCar/test/section_00_source_test_anomaly_0002_car_B1_spd_34V_mic_1.wav', 'ToyCar/test/section_00_source_test_anomaly_0003_car_B1_spd_34V_mic_1.wav', 'ToyCar/test/section_00_source_test_anomaly_0004_car_B1_spd_40V_mic_1.wav', 'ToyCar/test/section_00_source_test_anomaly_0005_car_B1_spd_31V_mic_1.wav']
../../../datasets/dev


Calculating features for ToyCar:   0%|          | 0/1200 [00:00<?, ?it/s]

Calculating features for ToyCar: 100%|██████████| 1200/1200 [05:10<00:00,  3.87it/s]
Matching pairs in ToyCar: 100%|██████████| 100/100 [00:00<00:00, 59116.34it/s]


Processing class: gearbox
['gearbox/test/section_00_source_test_anomaly_0001_noAttribute.wav', 'gearbox/test/section_00_source_test_anomaly_0002_noAttribute.wav', 'gearbox/test/section_00_source_test_anomaly_0003_noAttribute.wav', 'gearbox/test/section_00_source_test_anomaly_0004_noAttribute.wav', 'gearbox/test/section_00_source_test_anomaly_0005_noAttribute.wav']
../../../datasets/dev


Calculating features for gearbox: 100%|██████████| 1200/1200 [04:21<00:00,  4.59it/s]
Matching pairs in gearbox: 100%|██████████| 100/100 [00:00<00:00, 68747.81it/s]


Processing class: valve
['valve/test/section_00_source_test_anomaly_0001_v1pat_none_v2pat_B.wav', 'valve/test/section_00_source_test_anomaly_0002_v1pat_none_v2pat_B.wav', 'valve/test/section_00_source_test_anomaly_0003_v1pat_none_v2pat_B.wav', 'valve/test/section_00_source_test_anomaly_0004_v1pat_none_v2pat_B.wav', 'valve/test/section_00_source_test_anomaly_0005_v1pat_none_v2pat_B.wav']
../../../datasets/dev


Calculating features for valve: 100%|██████████| 1200/1200 [04:18<00:00,  4.64it/s]
Matching pairs in valve: 100%|██████████| 100/100 [00:00<00:00, 64142.90it/s]


Processing class: bearing
['bearing/test/section_00_source_test_anomaly_0001_pro_A_vel_4_loc_A.wav', 'bearing/test/section_00_source_test_anomaly_0002_pro_A_vel_4_loc_A.wav', 'bearing/test/section_00_source_test_anomaly_0003_pro_A_vel_12_loc_A.wav', 'bearing/test/section_00_source_test_anomaly_0004_pro_A_vel_12_loc_A.wav', 'bearing/test/section_00_source_test_anomaly_0005_pro_A_vel_12_loc_A.wav']
../../../datasets/dev


Calculating features for bearing: 100%|██████████| 1200/1200 [04:15<00:00,  4.70it/s]
Matching pairs in bearing: 100%|██████████| 100/100 [00:00<00:00, 63559.69it/s]


Processing class: slider
['slider/test/section_00_source_test_anomaly_0001_noAttribute.wav', 'slider/test/section_00_source_test_anomaly_0002_noAttribute.wav', 'slider/test/section_00_source_test_anomaly_0003_noAttribute.wav', 'slider/test/section_00_source_test_anomaly_0004_noAttribute.wav', 'slider/test/section_00_source_test_anomaly_0005_noAttribute.wav']
../../../datasets/dev


Calculating features for slider: 100%|██████████| 1200/1200 [04:17<00:00,  4.66it/s]
Matching pairs in slider: 100%|██████████| 100/100 [00:00<00:00, 68523.18it/s]


Processing class: fan
['fan/test/section_00_source_test_anomaly_0001_n_A.wav', 'fan/test/section_00_source_test_anomaly_0002_n_A.wav', 'fan/test/section_00_source_test_anomaly_0003_n_A.wav', 'fan/test/section_00_source_test_anomaly_0004_n_A.wav', 'fan/test/section_00_source_test_anomaly_0005_n_A.wav']
../../../datasets/dev


Calculating features for fan: 100%|██████████| 1200/1200 [04:17<00:00,  4.66it/s]
Matching pairs in fan: 100%|██████████| 100/100 [00:00<00:00, 67770.30it/s]


Processing class: ToyTrain
['ToyTrain/test/section_00_source_test_anomaly_0001_noAttribute.wav', 'ToyTrain/test/section_00_source_test_anomaly_0002_noAttribute.wav', 'ToyTrain/test/section_00_source_test_anomaly_0003_noAttribute.wav', 'ToyTrain/test/section_00_source_test_anomaly_0004_noAttribute.wav', 'ToyTrain/test/section_00_source_test_anomaly_0005_noAttribute.wav']
../../../datasets/dev


Calculating features for ToyTrain: 100%|██████████| 1200/1200 [05:12<00:00,  3.85it/s]
Matching pairs in ToyTrain: 100%|██████████| 100/100 [00:00<00:00, 68300.02it/s]

Initial CSV file saved to all_class_matching_pairs_with_features.csv





##### 한번에 csv에 저장하기에는 매우 큰 폴더임.

- 클래스 별로 나누어서 저장한다면 훨씬 나을 것이라 판단. 

# WIP

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import librosa
import numpy as np
from skimage.metrics import structural_similarity as ssim
from scipy.fft import fft
import pywt
import warnings

# Suppress warnings (optional). NOTE: this filter silences every Python
# warning in the process, not only those raised by librosa.
warnings.filterwarnings('ignore')

# -----------------------------
# Configuration Parameters
# -----------------------------
DATASETS_DIR = "../../../datasets/dev"  # Root directory of the datasets
FEATURES_DIR = "features"              # Directory where the feature CSV files are stored
HANDWRITE_DIR = "handwrite"            # Directory where the handwrite (note) CSV files are stored

# Create the output directories if they do not exist yet
os.makedirs(FEATURES_DIR, exist_ok=True)
os.makedirs(HANDWRITE_DIR, exist_ok=True)

# -----------------------------
# Step 1: Load Dataset
# -----------------------------
def load_dataset(attributes_file, datasets_dir):
    """Read an attributes CSV and return the listed clip paths with labels.

    Args:
        attributes_file: Path to a CSV file that has a ``file_name`` column.
        datasets_dir: Directory that every ``file_name`` entry is joined onto.

    Returns:
        A ``(file_paths, labels)`` tuple of equally long lists. A label is
        ``'anomaly'`` when the lower-cased ``file_name`` entry contains the
        substring ``anomaly`` and ``'normal'`` otherwise.

    Raises:
        FileNotFoundError: If ``attributes_file`` is not an existing file.
    """
    if not os.path.isfile(attributes_file):
        raise FileNotFoundError(f"Attributes file not found: {attributes_file}")

    listed_names = pd.read_csv(attributes_file)['file_name'].tolist()

    file_paths = []
    labels = []
    for entry in listed_names:
        labels.append('anomaly' if 'anomaly' in entry.lower() else 'normal')
        # The path is datasets_dir joined with the CSV entry, nothing else.
        file_paths.append(os.path.join(datasets_dir, entry))

    return file_paths, labels

# -----------------------------
# Step 2: Find Matching Normal File Based on Filename Patterns
# -----------------------------
def find_matching_normal_file(anomaly_path, normal_paths):
    """Return the first normal path whose file name matches the anomaly's.

    File names are split on ``_``. Token 5 is treated as the clip number and
    token 6 (when present) as a code; adjust the indices if the naming scheme
    differs. A normal file matches when its number equals the anomaly's number
    and, if the anomaly has a code, its code is equal as well.

    Args:
        anomaly_path: Path of the anomaly clip.
        normal_paths: Iterable of candidate normal clip paths, searched in order.

    Returns:
        The first matching entry of ``normal_paths``, or ``None`` when the
        anomaly file name has fewer than six tokens or nothing matches.
    """
    anomaly_filename = os.path.basename(anomaly_path)
    anomaly_tokens = anomaly_filename.split("_")

    if len(anomaly_tokens) <= 5:
        print(f"Unexpected file name format: {anomaly_filename}. No direct match found for anomaly file: {anomaly_path}")
        return None

    wanted_number = anomaly_tokens[5]
    wanted_code = anomaly_tokens[6] if len(anomaly_tokens) > 6 else None

    for candidate in normal_paths:
        candidate_tokens = os.path.basename(candidate).split("_")

        candidate_number = candidate_tokens[5] if len(candidate_tokens) > 5 else None
        if candidate_number != wanted_number:
            continue

        # Without an anomaly code the number alone decides the match.
        if wanted_code is None:
            return candidate

        candidate_code = candidate_tokens[6] if len(candidate_tokens) > 6 else None
        if candidate_code == wanted_code:
            return candidate

    # No normal file matched.
    return None

# -----------------------------
# Step 3: Calculate Features
# -----------------------------
def calculate_features(file_path):
    """Load one audio file and compute its summary features.

    Args:
        file_path: Path of the audio file handed to ``librosa.load`` with
            ``sr=None``.

    Returns:
        A dict with scalar entries ``std``, ``avg``, ``rms``,
        ``spectral_centroid``, ``spectral_bandwidth``, ``spectral_flatness``,
        ``spectral_rolloff``, ``wavelet`` and ``powerspectrum``, plus the raw
        ``signal``, its sampling rate ``sr`` and the dB ``spectrogram``.
    """
    y, sr = librosa.load(file_path, sr=None)

    # Waveform statistics and frame-averaged spectral descriptors.
    features = dict(
        std=np.std(y),
        avg=np.mean(y),
        rms=librosa.feature.rms(y=y).mean(),
        spectral_centroid=librosa.feature.spectral_centroid(y=y, sr=sr).mean(),
        spectral_bandwidth=librosa.feature.spectral_bandwidth(y=y, sr=sr).mean(),
        spectral_flatness=librosa.feature.spectral_flatness(y=y).mean(),
        spectral_rolloff=librosa.feature.spectral_rolloff(y=y, sr=sr).mean(),
        wavelet=calculate_wavelet_energy(y, wavelet='cmor1.5-1.0', scales=np.arange(1, 31)),
    )

    # Mean of the squared FFT magnitude over all bins.
    squared_magnitude = np.abs(fft(y)) ** 2
    features['powerspectrum'] = squared_magnitude.mean()

    # Kept so callers can compute cross-correlation and SSIM later.
    features.update(signal=y, sr=sr, spectrogram=calculate_spectrogram(y, sr))
    return features

def calculate_wavelet_energy(signal, wavelet='cmor1.5-1.0', scales=np.arange(1, 31)):
    """Return the summed squared magnitude of the signal's CWT coefficients.

    Args:
        signal: Input samples passed to ``pywt.cwt``.
        wavelet: Wavelet name passed to ``pywt.cwt``.
        scales: Scales passed to ``pywt.cwt``.

    Returns:
        Sum over all coefficients of ``|coefficient| ** 2``.
    """
    # Continuous wavelet transform via PyWavelets; the second return value is unused.
    cwt_result = pywt.cwt(signal, scales, wavelet)
    squared_magnitudes = np.square(np.abs(cwt_result[0]))
    return np.sum(squared_magnitudes)

def calculate_spectrogram(y, sr, n_fft=512, hop_length=256):
    """Return the STFT magnitude of ``y`` converted to decibels.

    Args:
        y: Audio samples.
        sr: Sampling rate; accepted for call-site symmetry but not used here.
        n_fft: FFT size passed to ``librosa.stft``.
        hop_length: Hop length passed to ``librosa.stft``.

    Returns:
        The output of ``librosa.amplitude_to_db`` called with ``ref=np.max``.
    """
    return librosa.amplitude_to_db(
        np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length)),
        ref=np.max,
    )

# -----------------------------
# Step 4: Create Pairs and Save to CSV Per Class
# -----------------------------
def create_and_save_pairs_with_features(datasets_dir, features_dir, handwrite_dir):
    """Write one feature CSV (and one note CSV) per class directory.

    Every sub-directory of ``datasets_dir`` is treated as a class. Its
    ``attributes_00.csv`` lists the clips; features are computed for each
    listed clip that exists on disk, every anomaly clip is matched to a normal
    clip with ``find_matching_normal_file``, and one row per matched pair is
    appended to ``features_<class>.csv`` in ``features_dir``. The scalar
    feature columns come from the anomaly clip only; ``ssim`` and
    ``cross_correlation`` are written as empty strings.

    A ``handwrite_<class>.csv`` with ``pair_id``/``note`` columns is created in
    ``handwrite_dir`` only when it does not exist yet, so existing notes are
    never overwritten.

    Args:
        datasets_dir: Root directory containing one sub-directory per class.
        features_dir: Existing directory that receives the feature CSV files.
        handwrite_dir: Existing directory that receives the note CSV files.
    """
    # calculate_features() also returns the whole waveform and spectrogram.
    # Nothing below reads them, and caching them for every clip of a class
    # costs well over a gigabyte, so they are dropped before caching.
    bulky_keys = ('signal', 'spectrogram')

    # Column layout of the feature CSV; rows below use the same key order.
    columns = [
        "pair_id", "class", "normal", "anomaly", "handwrite", "std", "avg", "ssim",
        "powerspectrum", "cross_correlation", "Spectral Features",
        "Spectral Centroid", "Spectral Bandwidth", "Spectral Flatness",
        "Spectral Roll-off", "RMS (Root Mean Square)", "Wavelet Transform",
        "method"
    ]

    # Walk over every class directory of the dataset.
    for class_name in os.listdir(datasets_dir):
        class_dir = os.path.join(datasets_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        print(f"Processing class: {class_name}")

        attributes_file = os.path.join(class_dir, "attributes_00.csv")
        try:
            file_paths, labels = load_dataset(attributes_file, datasets_dir)
        except FileNotFoundError:
            print(f"Attributes file not found for class: {class_name}")
            continue

        anomaly_paths = [path for path, label in zip(file_paths, labels) if label == 'anomaly']
        normal_paths = [path for path, label in zip(file_paths, labels) if label == 'normal']

        # Pre-compute the features of every clip of this class.
        all_features = {}
        for path in tqdm(file_paths, desc=f"Calculating features for {class_name}"):
            if not os.path.isfile(path):
                print(f"File not found: {path}. Skipping.")
                continue
            try:
                features = calculate_features(path)
            except Exception as e:
                # Best effort: one unreadable clip must not abort the whole run.
                print(f"Error processing {path}: {e}. Skipping.")
                continue
            all_features[path] = {
                key: value for key, value in features.items() if key not in bulky_keys
            }

        features_output = os.path.join(features_dir, f"features_{class_name}.csv")
        handwrite_output = os.path.join(handwrite_dir, f"handwrite_{class_name}.csv")

        # Start the feature CSV from scratch with only the header row; the
        # default write mode replaces any file left over from an earlier run.
        pd.DataFrame(columns=columns).to_csv(features_output, index=False)

        # Create the (empty) note CSV only once so manual notes survive reruns.
        if not os.path.isfile(handwrite_output):
            pd.DataFrame(columns=["pair_id", "note"]).to_csv(handwrite_output, index=False)

        # Find the matching normal clip for every anomaly clip.
        for anomaly_path in tqdm(anomaly_paths, desc=f"Matching pairs in {class_name}"):
            matching_normal_path = find_matching_normal_file(anomaly_path, normal_paths)

            if not matching_normal_path:
                print(f"No matching normal file found for anomaly: {anomaly_path}")
                continue

            anomaly_features = all_features.get(anomaly_path)
            normal_features = all_features.get(matching_normal_path)

            if anomaly_features is None or normal_features is None:
                print(f"Features missing for pair: {anomaly_path}, {matching_normal_path}. Skipping.")
                continue

            # pair_id is the anomaly file name followed by the normal file name.
            pair_id = f"{os.path.basename(anomaly_path)}_{os.path.basename(matching_normal_path)}"

            pair_data = {
                "pair_id": pair_id,
                "class": class_name,
                "normal": matching_normal_path,
                "anomaly": anomaly_path,
                "handwrite": "",  # left blank for hand-written notes
                "std": anomaly_features['std'],
                "avg": anomaly_features['avg'],
                "ssim": "",  # computed later
                "powerspectrum": anomaly_features['powerspectrum'],
                "cross_correlation": "",  # computed later
                "Spectral Features": f"{anomaly_features['spectral_centroid']}, {anomaly_features['spectral_bandwidth']}, {anomaly_features['spectral_flatness']}, {anomaly_features['spectral_rolloff']}",
                "Spectral Centroid": anomaly_features['spectral_centroid'],
                "Spectral Bandwidth": anomaly_features['spectral_bandwidth'],
                "Spectral Flatness": anomaly_features['spectral_flatness'],
                "Spectral Roll-off": anomaly_features['spectral_rolloff'],
                "RMS (Root Mean Square)": anomaly_features['rms'],
                "Wavelet Transform": anomaly_features['wavelet'],
                "method": "Filename Matching",
            }

            # Append the row immediately so finished pairs survive an interruption.
            pd.DataFrame([pair_data]).to_csv(features_output, mode='a', header=False, index=False)

# -----------------------------
# Step 5: Calculate SSIM and Cross Correlation
# -----------------------------
def calculate_ssim_and_cross_corr(features_dir):
    """Fill the ``ssim`` and ``cross_correlation`` columns of each feature CSV.

    Every ``features_<class>.csv`` file directly inside ``features_dir`` is
    read. For each row the ``anomaly`` and ``normal`` audio files are loaded,
    the SSIM between their dB spectrograms and the cross-correlation of their
    waveforms (truncated to the shorter one) are computed, and the CSV is
    rewritten with both columns filled. A value that cannot be computed is
    stored as NaN.

    Args:
        features_dir: Directory holding the ``features_<class>.csv`` files.
    """
    prefix, suffix = "features_", ".csv"

    # os.listdir returns the CSV file names themselves, not class names, so
    # the class name is derived from the file name instead of being wrapped
    # in the prefix/suffix a second time (which never matched any file).
    for file_name in sorted(os.listdir(features_dir)):
        if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
            continue
        features_file = os.path.join(features_dir, file_name)
        if not os.path.isfile(features_file):
            continue
        class_name = file_name[len(prefix):-len(suffix)]

        df = pd.read_csv(features_file)

        ssim_list = []
        cross_corr_list = []

        for _, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Calculating ssim and cross_corr for {class_name}"):
            anomaly_path = row['anomaly']
            normal_path = row['normal']

            # Load both clips once; the waveforms are reused for the
            # cross-correlation further down.
            try:
                y_anomaly, sr_anomaly = librosa.load(anomaly_path, sr=None)
                y_normal, sr_normal = librosa.load(normal_path, sr=None)

                S_anomaly = calculate_spectrogram(y_anomaly, sr_anomaly)
                S_normal = calculate_spectrogram(y_normal, sr_normal)
            except Exception:
                # Best effort: record NaN for this row and keep going.
                print(f"Error loading spectrograms for pair: {anomaly_path}, {normal_path}. Setting ssim to NaN.")
                ssim_list.append(np.nan)
                cross_corr_list.append(np.nan)
                continue

            # SSIM; a ValueError (e.g. spectrograms of different size) becomes NaN.
            try:
                ssim_value = ssim(S_anomaly, S_normal, data_range=S_anomaly.max() - S_anomaly.min())
            except ValueError:
                ssim_value = np.nan

            ssim_list.append(ssim_value)

            # Cross-correlation of the two waveforms, truncated to equal length.
            try:
                min_length = min(len(y_anomaly), len(y_normal))
                cross_corr = np.correlate(
                    y_anomaly[:min_length], y_normal[:min_length], mode='valid'
                )[0]
            except Exception:
                print(f"Error calculating cross_correlation for pair: {anomaly_path}, {normal_path}. Setting cross_correlation to NaN.")
                cross_corr = np.nan

            cross_corr_list.append(cross_corr)

        # Replace the placeholder columns and write the table back.
        df['ssim'] = ssim_list
        df['cross_correlation'] = cross_corr_list

        df.to_csv(features_file, index=False)
        print(f"Updated {features_file} with ssim and cross_correlation")

# -----------------------------
# Main Execution
# -----------------------------
if __name__ == "__main__":
    # Pass 1: write one feature CSV per class (ssim / cross_correlation left empty).
    create_and_save_pairs_with_features(DATASETS_DIR, FEATURES_DIR, HANDWRITE_DIR)
    # Pass 2: revisit the CSVs in FEATURES_DIR, meant to fill in those two columns.
    calculate_ssim_and_cross_corr(FEATURES_DIR)
