In [None]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

In [None]:
def adjust_volume(y, factor):
    """Scale a signal by a constant gain.

    Args:
        y: Audio samples (any object that supports ``*`` with ``factor``).
        factor: Gain multiplier applied to every sample.

    Returns:
        The product ``y * factor``.
    """
    scaled = y * factor
    return scaled

In [None]:
def extract_mfcc(file_path, n_mfcc=40, hop_length=64, fixed_length=160, sr=22050, duration=0.5, augment=False):
    """Load an audio clip and turn it into a fixed-size MFCC feature matrix.

    The clip is loaded at ``sr`` Hz, limited to ``duration`` seconds and then
    padded/truncated to exactly ``int(sr * duration)`` samples. The feature
    computation itself is delegated to ``extract_mfcc_from_data`` so that the
    files used for training and the segments scored at inference time share
    one code path and cannot drift apart.

    Args:
        file_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients (rows of the result).
        hop_length: Hop size in samples between analysis frames.
        fixed_length: Number of frames (columns) of the result.
        sr: Sampling rate the file is resampled to on load.
        duration: Clip length in seconds.
        augment: When True, apply the random augmentation step below.

    Returns:
        ``np.float32`` array of shape ``(n_mfcc, fixed_length)``, or ``None``
        if the file could not be processed (the error is printed).
    """
    try:
        y, sr = librosa.load(file_path, sr=sr, duration=duration)
        y = librosa.util.fix_length(y, size=int(sr * duration))

        if augment:
            # NOTE(review): only the 'noise' draw modifies the signal, and it
            # does so by scaling the volume by 0.8. The 'pitch' and 'speed'
            # draws leave the clip unchanged, so roughly two thirds of the
            # "augmented" clips equal their originals. Behaviour is kept
            # as-is here; confirm what augmentation is actually intended.
            augmentation_methods = ['noise', 'pitch', 'speed']
            chosen_method = np.random.choice(augmentation_methods)
            if chosen_method == 'noise':
                y = adjust_volume(y, 0.8)

        # Single source of truth for MFCC + dB conversion + pad/truncate.
        mfcc = extract_mfcc_from_data(
            y, sr, n_mfcc=n_mfcc, hop_length=hop_length, fixed_length=fixed_length
        )
        if mfcc is None:
            # The helper already printed the cause; re-raise so the message
            # below also identifies which file was affected.
            raise RuntimeError("MFCC extraction failed")
        return mfcc

    except Exception as e:
        print(f"處理文件 {file_path} 時出現錯誤: {e}")
        return None

def extract_mfcc_from_data(y_segment, sr, n_mfcc=40, hop_length=64, fixed_length=160):
    """Compute a fixed-size MFCC matrix from an in-memory audio segment.

    Args:
        y_segment: Audio samples of the segment.
        sr: Sampling rate of ``y_segment``.
        n_mfcc: Number of MFCC coefficients (rows of the result).
        hop_length: Hop size in samples between analysis frames.
        fixed_length: Number of frames (columns) of the result; shorter
            outputs are right-padded with zeros, longer ones are truncated.

    Returns:
        ``np.float32`` array of shape ``(n_mfcc, fixed_length)``, or ``None``
        if anything fails (the error is printed).
    """
    try:
        coeffs = librosa.feature.mfcc(y=y_segment, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
        # NOTE(review): MFCCs are not a power spectrum, so applying
        # power_to_db to them is unusual. It is kept because any model
        # trained on these features depends on it.
        coeffs = librosa.power_to_db(coeffs, ref=np.max)

        # Force the time axis to exactly `fixed_length` frames.
        missing_frames = fixed_length - coeffs.shape[1]
        if missing_frames > 0:
            coeffs = np.pad(coeffs, pad_width=((0, 0), (0, missing_frames)), mode='constant')
        else:
            coeffs = coeffs[:, :fixed_length]

        return coeffs.astype(np.float32)

    except Exception as e:
        print(f"處理音訊片段時出現錯誤: {e}")
        return None

In [None]:
def create_dataset(swallow_dir, non_dir, n_mfcc=40, hop_length=64, fixed_length=160, sr=22050, duration=0.5, augment=False):
    """Build MFCC features and binary labels from two folders of WAV files.

    Files are visited in sorted name order (swallow folder first, then the
    non-swallow folder) so that repeated calls over the same folders yield
    samples in the same order. The ``.wav`` extension is matched
    case-insensitively. Files for which ``extract_mfcc`` returns ``None`` are
    skipped.

    Args:
        swallow_dir: Folder with positive (swallow) clips, labelled 1.
        non_dir: Folder with negative (non-swallow) clips, labelled 0.
        n_mfcc, hop_length, fixed_length, sr, duration, augment: Forwarded
            unchanged to ``extract_mfcc``.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays; ``data`` stacks one MFCC
        matrix per successfully processed file and ``labels`` holds the
        matching 0/1 class ids.
    """
    data = []
    labels = []

    for directory, label in ((swallow_dir, 1), (non_dir, 0)):
        # sorted(): os.listdir returns entries in arbitrary order.
        for file in sorted(os.listdir(directory)):
            if not file.lower().endswith('.wav'):
                continue
            mfcc = extract_mfcc(
                os.path.join(directory, file),
                n_mfcc=n_mfcc, hop_length=hop_length, fixed_length=fixed_length,
                sr=sr, duration=duration, augment=augment,
            )
            if mfcc is not None:
                data.append(mfcc)
                labels.append(label)

    return np.array(data), np.array(labels)


In [None]:
# Folders holding the two classes of clips (replace with the real locations).
swallow_dir = 'sound_split_data/swallow'
non_dir = 'sound_split_data/non'

# Fail fast if a folder is missing (isdir also rejects a plain file at that path).
assert os.path.isdir(swallow_dir), f"資料夾 '{swallow_dir}' 不存在。請確認路徑正確。"
assert os.path.isdir(non_dir), f"資料夾 '{non_dir}' 不存在。請確認路徑正確。"

# One definition of the feature parameters so both passes cannot diverge.
feature_params = dict(n_mfcc=40, hop_length=64, fixed_length=160, sr=22050, duration=0.5)

# Original (un-augmented) dataset.
X, y = create_dataset(swallow_dir, non_dir, augment=False, **feature_params)
print(f"原始資料集大小: {X.shape}, 標籤大小: {y.shape}")
assert len(X) > 0, "No .wav files could be processed; check the dataset folders."

# Augmented dataset. The augmentation draws from NumPy's global RNG, so seed
# it to make this cell reproducible under Restart & Run All.
np.random.seed(42)
X_aug, y_aug = create_dataset(swallow_dir, non_dir, augment=True, **feature_params)
print(f"增強後資料集大小: {X_aug.shape}, 標籤大小: {y_aug.shape}")

# Stack originals first, augmented copies second.
X_combined = np.concatenate((X, X_aug), axis=0)
y_combined = np.concatenate((y, y_aug), axis=0)
print(f"合併後資料集大小: {X_combined.shape}, 標籤大小: {y_combined.shape}")


In [None]:
# Standardise the MFCC features and build the train/validation split.
#
# The split is decided BEFORE the scaler is fitted, and the scaler is fitted on
# training rows only, so no validation statistics leak into preprocessing.
n_samples = X_combined.shape[0]
n_original = X.shape[0]

if X_aug.shape[0] == n_original and n_samples == 2 * n_original:
    # X_combined is [originals ; augmented copies]. Assumes row i + n_original
    # is the augmented version of row i (both passes walk the same folders) —
    # TODO confirm if create_dataset ever skips files differently per pass.
    # Splitting on the ORIGINAL clips keeps a clip and its augmented copy on
    # the same side; validation uses un-augmented clips only.
    train_orig_idx, val_orig_idx = train_test_split(
        np.arange(n_original), test_size=0.2, random_state=42, stratify=y
    )
    train_idx = np.concatenate((train_orig_idx, train_orig_idx + n_original))
    val_idx = val_orig_idx
else:
    print("Warning: original/augmented sets are not aligned; "
          "falling back to a plain stratified split (augmented copies may leak into validation).")
    train_idx, val_idx = train_test_split(
        np.arange(n_samples), test_size=0.2, random_state=42, stratify=y_combined
    )

# Flatten each MFCC matrix to one row: (num_samples, n_mfcc * n_frames).
X_reshaped = X_combined.reshape(n_samples, -1)
scaler = StandardScaler()
scaler.fit(X_reshaped[train_idx])
X_scaled = scaler.transform(X_reshaped)

# Persist the scaler for the inference stage (create the folder if needed).
os.makedirs('scaler', exist_ok=True)
joblib.dump(scaler, 'scaler/mfcc_scaler.pkl')
print("Scaler 已保存為 'scaler/mfcc_scaler.pkl'")

# Back to image-like tensors: (num_samples, n_mfcc, n_frames, 1).
n_coeffs, n_frames = X_combined.shape[1:]
X_scaled = X_scaled.reshape(n_samples, n_coeffs, n_frames, 1)

# Repeat the single channel three times to match the model's 3-channel input.
X_scaled = np.repeat(X_scaled, 3, axis=-1)
print(f"重塑後資料集大小: {X_scaled.shape}, 標籤大小: {y_combined.shape}")

X_train, y_train = X_scaled[train_idx], y_combined[train_idx]
X_val, y_val = X_scaled[val_idx], y_combined[val_idx]
print(f"訓練集大小: {X_train.shape}, 驗證集大小: {X_val.shape}")


In [None]:
def build_complete_regularized_cnn_lstm(input_shape):
    """Assemble the L2-regularised CNN + LSTM binary classifier.

    Three Conv2D/BatchNorm/MaxPool/Dropout stages (64, 128, 256 filters) feed
    a 256-unit dense layer whose output is reshaped into a one-step sequence
    for a 64-unit LSTM, followed by a single sigmoid output unit.

    Args:
        input_shape: Shape of one input sample, without the batch axis.

    Returns:
        The un-compiled Keras model.
    """
    l2_strength = 0.001
    inputs = layers.Input(shape=input_shape)

    # Convolutional stages. Each 2x2 pooling halves both spatial axes; for the
    # (40, 160, 3) input used in this notebook the feature maps become
    # (20, 80, 64) -> (10, 40, 128) -> (5, 20, 256).
    x = inputs
    for n_filters in (64, 128, 256):
        x = layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same',
                          kernel_regularizer=regularizers.l2(l2_strength))(x)
        x = layers.BatchNormalization()(x)
        x = layers.MaxPooling2D((2, 2))(x)
        x = layers.Dropout(0.25)(x)

    # Dense head (5 * 20 * 256 = 25600 flattened features for that input).
    x = layers.Flatten()(x)
    x = layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(l2_strength))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.5)(x)

    # The LSTM receives a sequence of length one built from the dense features.
    x = layers.Reshape((1, 256))(x)
    x = layers.LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2,
                    kernel_regularizer=regularizers.l2(l2_strength))(x)

    # Swallow probability.
    outputs = layers.Dense(1, activation='sigmoid')(x)

    return models.Model(inputs, outputs)

# Shape of one model input: 40 x 160 with 3 channels.
input_shape = (40, 160, 3)

# Instantiate the network for that input shape.
model = build_complete_regularized_cnn_lstm(input_shape)

# Binary classification set-up: Adam optimiser, cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)

# Print the layer-by-layer summary.
model.summary()


In [None]:
# Early-stopping callback: watches val_loss, waits 10 epochs, restores the best weights.
# NOTE(review): the callback is created but NOT passed to fit() below, so
# training always runs the full 100 epochs — confirm whether that is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_loss',
)

# Train the model.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=100,
)

# Plot training vs. validation curves: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 4))
for panel, metric in enumerate(('accuracy', 'loss'), start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric], label=f'Train_{metric}')
    plt.plot(history.history[f'val_{metric}'], label=f'Val_{metric}')
    plt.legend()
    plt.title(metric)
plt.show()

In [None]:
# Score the validation set and turn probabilities into hard 0/1 decisions.
decision_threshold = 0.9
y_pred_prob = model.predict(X_val)
# ravel(): the model outputs shape (n, 1); the metrics expect 1-D labels.
y_pred = (y_pred_prob.ravel() > decision_threshold).astype(int)

# Pin the label set so the matrix is always 2x2 and the names always line up,
# even if one class happens to be absent from the truths and predictions.
class_ids = [0, 1]
class_names = ['Non-Swallow', 'Swallow']

# Confusion matrix.
cm_best = confusion_matrix(y_val, y_pred, labels=class_ids)
print("混淆矩陣:")
print(cm_best)

# Confusion-matrix heat map.
plt.figure(figsize=(6, 4))
sns.heatmap(cm_best, annot=True, fmt='d', cmap='Greens', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predict')
plt.ylabel('GroundTruth')
plt.title('confusion_matrix')
plt.show()

# Per-class precision / recall / F1.
cr_best = classification_report(y_val, y_pred, labels=class_ids, target_names=class_names)
print("分類報告:")
print(cr_best)


In [None]:
# Persist the trained model; create the target folder first so that a fresh
# checkout (without a model/ directory) does not fail on save.
os.makedirs('model', exist_ok=True)
model.save('model/swallow_model.h5')
print("模型已保存為 'swallow_model.h5'")

# Persist the fitted scaler next to it, again making sure the folder exists.
os.makedirs('scaler', exist_ok=True)
joblib.dump(scaler, 'scaler/scaler.pkl')
print("Scaler 已保存為 'scaler.pkl'")


In [None]:
def predict_swallow_count(file_path, model, scaler, n_mfcc=40, hop_length=64, fixed_length=160, sr=22050, duration=0.5, overlap=0.25, threshold=0.9):
    """Count swallow sounds in an audio file using a sliding window.

    The file is cut into windows of ``duration`` seconds that advance by
    ``duration - overlap`` seconds. Each window is converted to MFCC features,
    standardised with ``scaler`` and scored by ``model``. A window scoring
    above ``threshold`` counts as a swallow unless the previous counted
    detection started less than ``duration`` seconds earlier, which prevents
    counting the same event twice in overlapping windows.

    Windows are scored in batches (one ``model.predict`` call per batch)
    instead of one call per window.

    Args:
        file_path (str): Path of the audio file.
        model: Trained Keras model taking ``(batch, n_mfcc, fixed_length, 3)``.
        scaler: Scaler fitted during training; must expose ``transform``.
        n_mfcc (int): Number of MFCC coefficients.
        hop_length (int): Hop size in samples between analysis frames.
        fixed_length (int): Number of frames per feature matrix.
        sr (int): Sampling rate used to load the file.
        duration (float): Window length in seconds.
        overlap (float): Overlap between consecutive windows in seconds;
            must be smaller than ``duration``.
        threshold (float): Probability above which a window is a detection.

    Returns:
        int: Number of detected swallows. Returns 0 (after printing the
        error) if anything goes wrong, including invalid window settings.
    """
    try:
        y, sr = librosa.load(file_path, sr=sr)

        # Window length and stride, in samples.
        segment_length = int(sr * duration)
        step = int(sr * (duration - overlap))
        if segment_length <= 0 or step <= 0:
            raise ValueError(
                f"duration ({duration}) must be positive and overlap ({overlap}) "
                "must be smaller than duration"
            )

        # Start sample of every full window; trailing partial windows are dropped.
        starts = range(0, len(y) - segment_length + 1, step)
        chunk_size = 256  # windows scored per predict() call; bounds memory use

        count = 0
        last_detected = -np.inf  # start time (s) of the last counted detection

        for chunk_begin in range(0, len(starts), chunk_size):
            features = []
            times = []
            for start in starts[chunk_begin:chunk_begin + chunk_size]:
                mfcc = extract_mfcc_from_data(
                    y[start:start + segment_length], sr,
                    n_mfcc=n_mfcc, hop_length=hop_length, fixed_length=fixed_length,
                )
                if mfcc is None:
                    continue
                features.append(mfcc.reshape(-1))
                times.append(start / sr)

            if not features:
                continue

            # Standardise, restore (n, n_mfcc, fixed_length, 1), then repeat
            # the channel three times to match the model input.
            scaled = scaler.transform(np.stack(features))
            batch = scaled.reshape(len(features), n_mfcc, fixed_length, 1)
            batch = np.repeat(batch, 3, axis=-1)

            probabilities = np.asarray(model.predict(batch, verbose=0)).reshape(-1)

            for current_time, prediction in zip(times, probabilities):
                # Count only if outside the refractory interval of the last hit.
                if prediction > threshold and current_time - last_detected >= duration:
                    print(f"Swallow Detected conf{prediction}")
                    count += 1
                    last_detected = current_time
        return count

    except Exception as e:
        print(f"處理文件 {file_path} 時出現錯誤: {e}")
        return 0


In [None]:
# Reload the persisted scaler.
# NOTE: joblib.load unpickles the file; only load scaler files you produced yourself.
scaler = joblib.load('scaler/scaler.pkl')
print("Scaler 已載入")

# Reload the persisted model.
model = tf.keras.models.load_model('model/swallow_model.h5')
print("模型已載入")

# Audio file to analyse (replace with the real path). Built with os.path.join
# so it works on every OS; the previous literal used a backslash, which is
# Windows-only and forms an invalid escape sequence in a normal string.
test_file_path = os.path.join('音檔', '測試RSST.wav')

# Only run the detector if the file is actually there.
if not os.path.exists(test_file_path):
    print(f"音訊文件 '{test_file_path}' 不存在。請確認路徑正確。")
else:
    # Count the swallows in the recording.
    num_swallow = predict_swallow_count(
        file_path=test_file_path,
        model=model,
        scaler=scaler,
        n_mfcc=40,
        hop_length=64,
        fixed_length=160,
        sr=22050,
        duration=0.5,
        overlap=0.25,
        threshold=0.9  # can be tuned, e.g. from a ROC curve
    )
    print(f"音訊文件 '{test_file_path}' 中檢測到的吞嚥聲數量: {num_swallow}")

    # # 繪製每個片段的預測概率
    # plt.figure(figsize=(12, 4))
    # plt.plot(predictions, marker='o', linestyle='-', label='預測概率')
    # plt.axhline(y=0.5, color='r', linestyle='--', label='閾值 (0.5)')
    # plt.xlabel('片段編號')
    # plt.ylabel('預測概率')
    # plt.title('每個片段的預測概率')
    # plt.legend()
    # plt.show()

    # # 打印高於閾值的片段索引
    # high_prob_indices = [i for i, prob in enumerate(predictions) if prob > 0.5]
    # print(f"高於閾值的片段索引: {high_prob_indices}")

    # # 載入音訊數據以進行可視化
    # y_test, sr_test = librosa.load(test_file_path, sr=22050)
    # for idx in high_prob_indices:
    #     start_time = idx * (0.5 - 0.25)
    #     end_time = start_time + 0.5
    #     start_sample = int(sr_test * start_time)
    #     end_sample = int(sr_test * end_time)
    #     y_segment = y_test[start_sample:end_sample]

    #     plt.figure(figsize=(10, 2))
    #     librosa.display.waveshow(y_segment, sr=sr_test)
    #     plt.title(f'片段 {idx} 預測概率: {predictions[idx]:.2f}')
    #     plt.xlabel('時間 (秒)')
    #     plt.ylabel('振幅')
    #     plt.show()
