<a href="https://colab.research.google.com/github/minjeon99/X10/blob/main/yawn_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import torch
import zipfile
import os
import json
import librosa
from IPython.display import Audio

# 하품 소리 감지 모델

## 데이터 준비

In [None]:
def unzip_file(zip_path, output_dir):
    """Extract every member of the zip archive at ``zip_path`` into ``output_dir``."""
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=output_dir)

# Extract the "사람.zip" archive of every split/folder combination from Drive
# into the matching /content/<split>/<folder> directory.
base_path = "/content/drive/MyDrive/start-10/비언어적 소리 데이터"
for split in ["Train", "Validation"]:
    for folder in ["source", "label"]:
        zip_path = os.path.join(base_path, split, folder, "사람.zip")
        output_dir = os.path.join('/content', split, folder)
        unzip_file(zip_path, output_dir)

In [None]:
# The extracted top-level folder name is unreadable (it looks like Korean text
# decoded with the wrong codec); rename it to '생리현상' in both Train trees.
base_path = '/content/Train/label/1.╗²╕«╟÷╗≤'
os.rename(base_path, '/content/Train/label/생리현상')
base_path = '/content/Train/source/1.╗²╕«╟÷╗≤'
os.rename(base_path, '/content/Train/source/생리현상')

base_paths = ['/content/Train/label/생리현상', '/content/Train/source/생리현상']
new_names = ['기침 소리', '하품 소리', '트림 소리', '헛기침 소리', '코고는 소리', '재채기 소리', '방귀 소리' ,'거친 호흡소리']

# Rename each class sub-folder to the readable name at the same position.
# NOTE(review): the pairing relies on the order returned by os.listdir, which
# is arbitrary - confirm every folder receives the intended class name, and
# that the label and source trees end up with the same mapping.
for base_path in base_paths:
    folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]
    for i, folder in enumerate(folders):
        # Folders beyond the number of known names are left untouched.
        if i < len(new_names):
            old_folder_path = os.path.join(base_path, folder)
            new_folder_path = os.path.join(base_path, new_names[i])
            os.rename(old_folder_path, new_folder_path)

In [None]:
# The extracted top-level folder name is unreadable (it looks like Korean text
# decoded with the wrong codec); rename it to '생리현상' in both Validation trees.
base_path = '/content/Validation/label/1.╗²╕«╟÷╗≤'
os.rename(base_path, '/content/Validation/label/생리현상')
base_path = '/content/Validation/source/1.╗²╕«╟÷╗≤'
os.rename(base_path, '/content/Validation/source/생리현상')

base_paths = ['/content/Validation/label/생리현상', '/content/Validation/source/생리현상']
new_names = ['기침 소리', '하품 소리', '트림 소리', '헛기침 소리', '코고는 소리', '재채기 소리', '방귀 소리' ,'거친 호흡소리']

# Rename each class sub-folder to the readable name at the same position.
# NOTE(review): the pairing relies on the order returned by os.listdir, which
# is arbitrary - confirm every folder receives the intended class name, and
# that the label and source trees end up with the same mapping.
for base_path in base_paths:
    folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]
    for i, folder in enumerate(folders):
        # Folders beyond the number of known names are left untouched.
        if i < len(new_names):
            old_folder_path = os.path.join(base_path, folder)
            new_folder_path = os.path.join(base_path, new_names[i])
            os.rename(old_folder_path, new_folder_path)

In [None]:
# Preview one yawn label file as a DataFrame to inspect the JSON structure.
pd.read_json('/content/Train/label/생리현상/하품 소리/S-211014_H_106_C_044_0001.json')

Unnamed: 0,RawDataInfo,SourceDataInfo,LabelDataInfo
RawDataId,S-211014_H_106_C_044,,
Copyrighter,㈜미디어그룹사람과숲,,
SampleRate(Hz),44100,,
Channel,1,,
BitDepth(bit),16,,
RecordingDevice,Android,,
BitRate(kbps),128,,
CollectionType,C,,
RecDateTime,2021-10-14 23:11:23,,
RecDataLength(sec),4,,


1. 라벨 정보 (LabelDataInfo):
- LabelID: 파일 ID
- Class: 소리의 종류 (예: "01" → 기침 소리)
- Segmentations: 소리 구간 (예: [[10.16, 11.67], [12.24, 14.06], [15.09, 16.43]])
2. 원천 데이터 (SourceDataInfo):
- SourceDataId: 원천 파일 이름
- Path: 파일 경로

In [None]:
import os
import json

def parse_individual_jsons(base_dir, label_dir_name='label', source_dir_name='source'):
    """
    Collect every label JSON under ``base_dir`` together with its audio file.

    The label tree ``<base_dir>/<label_dir_name>/생리현상`` is walked
    recursively. For each ``*.json`` file, the audio file is expected at
    ``<base_dir>/<source_dir_name>/생리현상/<class>/<same stem>.mp3``, where
    ``<class>`` is the name of the directory that contains the JSON file.
    Label files without a matching audio file are reported and skipped.

    - base_dir: Train/Validation directory
    - label_dir_name: name of the label directory
    - source_dir_name: name of the source (audio) directory
    - return (data_entries): list of dicts with the keys ``json_path``,
      ``source_path``, ``label_id``, ``segmentations``, ``class`` and
      ``description``, ordered by directory name and then file name
    """
    json_suffix = '.json'
    label_dir = os.path.join(base_dir, label_dir_name, '생리현상')
    source_dir = os.path.join(base_dir, source_dir_name, '생리현상')

    data_entries = []

    for root, dirs, files in os.walk(label_dir):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirs.sort()
        audio_class = os.path.basename(root)

        for file in sorted(files):
            if not file.endswith(json_suffix):
                continue

            json_path = os.path.join(root, file)
            with open(json_path, 'r', encoding='utf-8') as f:
                label_data = json.load(f)

            # Swap only the trailing extension; str.replace would also rewrite
            # a '.json' that happens to appear earlier in the file name.
            audio_file_name = file[:-len(json_suffix)] + '.mp3'
            source_path = os.path.join(source_dir, audio_class, audio_file_name)

            if not os.path.exists(source_path):
                print(f"Warning: Source file not found for {json_path}")
                continue

            # A missing or null "LabelDataInfo" falls back to the defaults.
            label_info = label_data.get('LabelDataInfo') or {}

            data_entries.append({
                'json_path': json_path,
                'source_path': source_path,
                'label_id': label_info.get('LabelID', ''),
                'segmentations': label_info.get('Segmentations', []),
                'class': audio_class,
                'description': label_info.get('Desc', ''),
            })

    return data_entries

In [None]:
import librosa
import soundfile as sf

def extract_audio_clips(data_entries, output_dir, target_sample_rate=16000):
    """
    Cut the labelled segments out of each source recording and save them as WAV.

    Each clip is written to ``<output_dir>/<class>/<label_id>_<index>.wav``.
    Processing is best-effort: a recording that cannot be decoded or written
    is reported and the remaining entries are still processed.

    - data_entries: list of dicts with the keys ``source_path``, ``label_id``,
      ``segmentations`` (list of ``[start_sec, end_sec]`` pairs) and ``class``
    - output_dir: directory the clips are saved under
    - target_sample_rate: sample rate the audio is resampled to on load
    """
    os.makedirs(output_dir, exist_ok=True)

    for entry in data_entries:
        audio_path = entry['source_path']
        segmentations = entry['segmentations']
        audio_class = entry['class']
        # Fall back to the audio file stem so clips from entries without a
        # label id do not overwrite each other as "_<index>.wav".
        label_id = entry['label_id'] or os.path.splitext(os.path.basename(audio_path))[0]

        # Nothing to cut: skip the (expensive) decode entirely.
        if not segmentations:
            continue

        try:
            waveform, sr = librosa.load(audio_path, sr=target_sample_rate)

            class_dir = os.path.join(output_dir, audio_class)
            os.makedirs(class_dir, exist_ok=True)

            for i, (start, end) in enumerate(segmentations):
                # Clamp to the decoded audio so negative or out-of-range
                # offsets cannot wrap around or silently yield an empty slice.
                start_frame = max(int(start * sr), 0)
                end_frame = min(int(end * sr), len(waveform))
                if end_frame <= start_frame:
                    print(f"Warning: skipping empty segment {i} of {audio_path}")
                    continue

                clip = waveform[start_frame:end_frame]
                output_path = os.path.join(class_dir, f"{label_id}_{i}.wav")
                sf.write(output_path, clip, sr)
                print(f"Saved: {output_path}")

        except Exception as e:
            # Deliberate best-effort boundary: report the failure and move on.
            print(f"Error processing {audio_path}: {e}")

In [None]:
# Relative paths: resolved against the current working directory.
# NOTE(review): later cells read '/content/processed_clips', so this presumably
# assumes the working directory is /content - confirm.
train_dir = "Train"
val_dir = "Validation"
output_dir = "processed_clips"

# Train data: parse the label JSONs, then cut the labelled segments into clips.
train_data_entries = parse_individual_jsons(train_dir)
extract_audio_clips(train_data_entries, os.path.join(output_dir, "Train"))

Saved: processed_clips/Train/하품 소리/S-211101_H_106_C_038_0001_0.wav
Saved: processed_clips/Train/하품 소리/S-211101_H_106_C_038_0001_1.wav
Saved: processed_clips/Train/하품 소리/S-211101_H_106_C_038_0001_2.wav
Saved: processed_clips/Train/하품 소리/S-211031_H_106_C_045_0001_0.wav
Saved: processed_clips/Train/하품 소리/S-211031_H_106_C_045_0001_1.wav
Saved: processed_clips/Train/하품 소리/S-211031_H_106_C_045_0001_2.wav
Saved: processed_clips/Train/하품 소리/S-211104_H_106_C_001_0001_0.wav
Saved: processed_clips/Train/하품 소리/S-211104_H_106_C_001_0001_1.wav
Saved: processed_clips/Train/하품 소리/S-211104_H_106_C_001_0001_2.wav
Saved: processed_clips/Train/하품 소리/S-211030_H_106_C_014_0001_0.wav
Saved: processed_clips/Train/하품 소리/S-211029_H_106_C_008_0001_0.wav
Saved: processed_clips/Train/하품 소리/S-211029_H_106_C_010_0001_0.wav
Saved: processed_clips/Train/하품 소리/S-211029_H_106_C_010_0001_1.wav
Saved: processed_clips/Train/하품 소리/S-211029_H_106_C_010_0001_2.wav
Saved: processed_clips/Train/하품 소리/S-211029_H_106_C_010_0001_3

In [None]:
# Validation data: parse the label JSONs, then cut the labelled segments into clips.
val_data_entries = parse_individual_jsons(val_dir)
extract_audio_clips(val_data_entries, os.path.join(output_dir, "Validation"))

Saved: processed_clips/Validation/하품 소리/S-210821_H_106_L_008_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-210822_H_106_L_001_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-210822_H_106_L_001_0001_1.wav
Saved: processed_clips/Validation/하품 소리/S-210822_H_106_L_001_0001_2.wav
Saved: processed_clips/Validation/하품 소리/S-210821_H_106_L_006_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-210821_H_106_L_001_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-211011_H_106_C_042_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-211011_H_106_C_042_0001_1.wav
Saved: processed_clips/Validation/하품 소리/S-210913_H_106_D_015_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-210913_H_106_D_015_0001_1.wav
Saved: processed_clips/Validation/하품 소리/S-210821_H_106_L_002_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-210913_H_106_D_022_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-210821_H_106_L_003_0001_0.wav
Saved: processed_clips/Validation/하품 소리/S-210821_H_106_L_003_000

In [None]:
# Train data check: play back one extracted yawn clip.
Audio('/content/processed_clips/Train/하품 소리/S-211016_H_106_C_018_0001_1.wav')

## Train, Validation, Test set
>reference: *TensorFlow - 환경 소리 분류를 위한 YAMNet을 사용한 전이 학습 [(링크)](https://www.tensorflow.org/tutorials/audio/transfer_learning_audio?hl=ko)*

In [None]:
#!pip install tensorflow_io
import os

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio

Collecting tensorflow_io
  Downloading tensorflow_io-0.37.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading tensorflow_io-0.37.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow_io
Successfully installed tensorflow_io-0.37.1


In [None]:
# Load the YAMNet model from TensorFlow Hub; it is used below to turn
# waveforms into embeddings.
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

In [None]:
# Utility functions for loading audio files and making sure the sample rate is correct.

@tf.function
def load_wav_16k_mono(filename):
    """Read a WAV file and return it as a single-channel float waveform at 16 kHz."""
    raw_bytes = tf.io.read_file(filename)
    # Decode as one channel, then drop the channel axis.
    audio, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono = tf.squeeze(audio, axis=-1)
    native_rate = tf.cast(native_rate, dtype=tf.int64)
    return tfio.audio.resample(mono, rate_in=native_rate, rate_out=16000)

In [None]:
from sklearn.model_selection import train_test_split

def collect_labeled_wavs(data_dir, class_names, positive_class='하품 소리'):
    """
    List the ``.wav`` files of every class directory with a binary label.

    - data_dir: directory that contains one sub-directory per class
    - class_names: class directory names to scan; repeated names are scanned
      only once so no file is listed twice
    - positive_class: class labelled 1; every other class is labelled 0
    - return: ``(paths, targets)`` as two parallel lists, ordered by class and
      then by file name
    """
    paths = []
    targets = []
    # dict.fromkeys drops repeated names while keeping the original order.
    for label in dict.fromkeys(class_names):
        label_dir = os.path.join(data_dir, label)
        if not os.path.isdir(label_dir):
            print(f"Warning: class directory not found: {label_dir}")
            continue
        target = 1 if label == positive_class else 0
        # Sorted so the listing (and any seeded split of it) is reproducible.
        for filename in sorted(os.listdir(label_dir)):
            if filename.endswith('.wav'):
                paths.append(os.path.join(label_dir, filename))
                targets.append(target)
    return paths, targets


# Label definition: each class appears exactly once.
my_class = ['하품 소리', '거친 호흡소리', '기침 소리', '방귀 소리', '재채기 소리', '코고는 소리', '트림 소리', '헛기침 소리']

# Train set
data_dir = '/content/processed_clips/Train'
file_paths, y_labels = collect_labeled_wavs(data_dir, my_class)

# Test set
data_dir = '/content/processed_clips/Validation'
test_file_paths, test_y_labels = collect_labeled_wavs(data_dir, my_class)

In [None]:
# Datasets of (file path, binary label) pairs; the audio itself is loaded by a later map.
main_ds = tf.data.Dataset.from_tensor_slices((file_paths, y_labels))
test_ds = tf.data.Dataset.from_tensor_slices((test_file_paths, test_y_labels))

In [None]:
def load_wav_for_map(filename, label):
  """Dataset.map adapter: decode ``filename`` into a waveform and pass ``label`` through."""
  waveform = load_wav_16k_mono(filename)
  return waveform, label

# Replace the file-path component of every element with its decoded waveform.
test_ds = test_ds.map(load_wav_for_map)
main_ds = main_ds.map(load_wav_for_map)

In [None]:
# applies the embedding extraction model to a wav data
def extract_embedding(wav_data, label):
  ''' Run YAMNet on one waveform and pair each embedding it returns with the label.

  - wav_data: waveform passed to ``yamnet_model``
  - label: label of the clip the waveform came from
  - return: ``(embeddings, labels)``, the label repeated once per embedding
  '''
  _, embeddings, _ = yamnet_model(wav_data)
  num_embeddings = tf.shape(embeddings)[0]
  return (embeddings, tf.repeat(label, num_embeddings))

# extract embedding
# Each split is mapped from its own waveform dataset, so the test embeddings
# come from the Validation clips and never from the training data.
# unbatch() turns the per-clip batches into one (embedding, label) per element.
main_ds = main_ds.map(extract_embedding).unbatch()
test_ds = test_ds.map(extract_embedding).unbatch()

In [None]:
# Materialize every (embedding, label) pair in memory, then separate the
# embeddings from the labels.
all_data = list(main_ds.as_numpy_iterator())
data, labels = zip(*all_data)

# Same for the test dataset.
test_data = list(test_ds.as_numpy_iterator())
test_X, test_y = zip(*test_data)

In [None]:
# Train, Validation split
from sklearn.model_selection import train_test_split
# Stratified 80/20 split with a fixed seed, applied to individual embeddings.
# NOTE(review): the split happens after unbatch(), so embeddings from the same
# clip can land in both train and validation - validation metrics may be
# optimistic; consider splitting by file instead.
train_data, val_data, train_labels, val_labels = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# TensorFlow Dataset
train_ds = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
val_ds = tf.data.Dataset.from_tensor_slices((val_data, val_labels))
test_ds = tf.data.Dataset.from_tensor_slices((list(test_X), list(test_y)))

# Cache, batch by 32 and prefetch; only the training set is shuffled.
train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

# Print the element specs to check shapes and dtypes of the three datasets.
print(train_ds.element_spec)
print(val_ds.element_spec)
print(test_ds.element_spec)

(TensorSpec(shape=(None, 1024), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))
(TensorSpec(shape=(None, 1024), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))
(TensorSpec(shape=(None, 1024), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))


In [None]:
# Classification head on top of 1024-dimensional embeddings:
# one hidden ReLU layer followed by two output logits (no activation).
classifier_layers = [
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32, name='input_embedding'),
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dense(units=2),
]
my_model = tf.keras.Sequential(layers=classifier_layers, name='my_model')

my_model.summary()

In [None]:
# Labels are integer class ids and the model emits raw logits, hence sparse
# categorical cross-entropy with from_logits=True.
my_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 optimizer="adam",
                 metrics=['accuracy'])

# Stop when the validation loss has not improved for 3 epochs and restore the
# weights of the best epoch. Monitoring the training loss instead would keep
# training (and keep weights) that merely fit the training set better.
# Requires validation data to be passed to fit().
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=3,
                                            restore_best_weights=True)

In [None]:
# Train for at most 10 epochs with validation after each epoch; the early-stopping callback may end training sooner.
history = my_model.fit(train_ds, epochs=10, validation_data=val_ds, callbacks=callback)

Epoch 1/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.8831 - loss: 0.3058 - val_accuracy: 0.9180 - val_loss: 0.2214
Epoch 2/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9293 - loss: 0.1862 - val_accuracy: 0.9333 - val_loss: 0.1733
Epoch 3/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.9380 - loss: 0.1638 - val_accuracy: 0.9248 - val_loss: 0.1866
Epoch 4/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9456 - loss: 0.1475 - val_accuracy: 0.9452 - val_loss: 0.1511
Epoch 5/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.9516 - loss: 0.1326 - val_accuracy: 0.9425 - val_loss: 0.1388
Epoch 6/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9589 - loss: 0.1075 - val_accuracy: 0.9480 - val_loss: 0.1308
Epoch 7/10
[1m368/368[

In [None]:
# Evaluate the trained classifier on the test dataset and report loss and accuracy.
loss, accuracy = my_model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m460/460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7519 - loss: 1.8345
Loss:  0.9637901782989502
Accuracy:  0.8697959184646606


# Sample

In [None]:
class ReduceMeanLayer(tf.keras.layers.Layer):
  """Keras layer that averages its input along a fixed axis."""

  def __init__(self, axis=0, **kwargs):
    super().__init__(**kwargs)
    # Axis that call() reduces over.
    self.axis = axis

  def call(self, input):
    averaged = tf.math.reduce_mean(input, axis=self.axis)
    return averaged

- 하품 소리

In [None]:
# Pick a yawn clip from the Validation clips and play it back.
sample_path = '/content/processed_clips/Validation/하품 소리/S-211011_H_106_C_042_0001_0.wav'
Audio(sample_path)

In [None]:
my_classes = ['그 외', '하품 소리']

# Embed the clip with YAMNet, classify every embedding, average the logits
# over the clip and turn the averaged logits into probabilities.
sample_wav = load_wav_16k_mono(sample_path)
scores, embeddings, spectrogram = yamnet_model(sample_wav)
sample_pred = my_model(embeddings)
serving_outputs = ReduceMeanLayer(axis=0, name='classifier')(sample_pred)
sample_prob = tf.nn.softmax(serving_outputs)

# Index of the most probable class, computed once and reused below.
best_idx = sample_prob.numpy().argmax()
print(f"예측된 클래스: {my_classes[best_idx]}, 확률: {sample_prob[best_idx]:.4f}")

예측된 클래스: 하품 소리, 확률: 0.8017


- 기타 소리

In [None]:
# Pick a non-yawn clip (heavy breathing) from the Validation clips and play it back.
sample_path = '/content/processed_clips/Validation/거친 호흡소리/S-210928_H_108_C_001_0001_0.wav'
Audio(sample_path)

In [None]:
my_classes = ['그 외', '하품 소리']

# Embed the clip with YAMNet, classify every embedding, average the logits
# over the clip and turn the averaged logits into probabilities.
sample_wav = load_wav_16k_mono(sample_path)
scores, embeddings, spectrogram = yamnet_model(sample_wav)
sample_pred = my_model(embeddings)
serving_outputs = ReduceMeanLayer(axis=0, name='classifier')(sample_pred)
sample_prob = tf.nn.softmax(serving_outputs)

# Index of the most probable class, computed once and reused below.
best_idx = sample_prob.numpy().argmax()
print(f"예측된 클래스: {my_classes[best_idx]}, 확률: {sample_prob[best_idx]:.4f}")

예측된 클래스: 그 외, 확률: 0.9998
