In [1]:
# List the conda environments available on this machine (IPython line magic).
%conda env list

# conda environments:
#
a103                     /home/j-i13a103/.conda/envs/a103
llm                      /home/j-i13a103/.conda/envs/llm
stt                      /home/j-i13a103/.conda/envs/stt
tts                      /home/j-i13a103/.conda/envs/tts
wakeword                 /home/j-i13a103/.conda/envs/wakeword
base                     /opt/tljh/user


Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys

# Show which interpreter binary and Python version back this kernel.
print(sys.executable, sys.version, sep="\n")

/home/j-i13a103/.conda/envs/wakeword/bin/python
3.9.23 | packaged by conda-forge | (main, Jun  4 2025, 17:57:12) 
[GCC 13.3.0]


In [3]:
import os

# Restrict this process to physical GPU 2. This runs before torch is imported
# in the next cell, so the selected GPU is the one addressed as device 0 below.
os.environ.update(CUDA_VISIBLE_DEVICES="2")

In [4]:
import torch

# Report whether CUDA is usable and, only if it is, the name of device 0.
# get_device_name() raises when no CUDA device is present, so it is guarded
# to keep this cell runnable on CPU-only machines.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

True
Tesla V100-PCIE-32GB


In [None]:
# 필요한 라이브러리 로드

from tqdm import tqdm
from natsort import natsorted
from IPython.display import Audio
import copy
import random
import cv2
import matplotlib.pyplot as plt

import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import accuracy_score
import wave
import shutil

In [6]:
# Pick the compute device: the first visible CUDA device when available,
# otherwise the CPU, and report the choice.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("현재 가상환경 cuda 설정 가능" if device == "cuda:0" else "현재 가상환경 cpu 사용")

현재 가상환경 cuda 설정 가능


In [None]:
# LSTM 웨이크워드 인식 모델 구조
import torch
import torch.nn as nn

class WakeWordLSTM(nn.Module):
    """Bidirectional LSTM binary classifier for wake-word detection.

    A stacked bidirectional LSTM reads a feature sequence; its output at the
    final time index is projected to a single value and squashed by a sigmoid,
    yielding one score in (0, 1) per input sequence.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout passed to ``nn.LSTM`` (applied between stacked layers).
    """

    def __init__(self, input_dim=40, hidden_dim=128, num_layers=4, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        # Both directions are concatenated in the LSTM output, hence 2 * hidden_dim.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: Tensor of shape [batch, time, input_dim].

        Returns:
            Tensor of shape [batch] holding sigmoid scores.
        """
        sequence_out, _state = self.lstm(x)  # [batch, time, 2 * hidden_dim]
        # Keep only the final time index.
        # NOTE(review): at this index the backward direction has processed just
        # one frame; confirm that using the last timestep alone is intended.
        last_step = sequence_out[:, -1]
        logit = self.fc(last_step)
        score = self.sigmoid(logit)
        return score.squeeze(1)

In [8]:
# Smoke test: run a freshly initialised (untrained) model on random input
# shaped [batch=16, time=80, features=40] and check the output shape.
model = WakeWordLSTM()
sample_input = torch.randn(16, 80, 40)
output = model(sample_input)
print(output.shape)  # one score per sequence in the batch

torch.Size([16])


In [9]:
# Display the raw scores the untrained model produced for the random batch.
output

tensor([0.5112, 0.5112, 0.5118, 0.5105, 0.5108, 0.5118, 0.5116, 0.5104, 0.5112,
        0.5118, 0.5115, 0.5113, 0.5119, 0.5103, 0.5110, 0.5119],
       grad_fn=<SqueezeBackward1>)

### 모델 학습을 위한 데이터 전처리

In [10]:
import librosa
import numpy as np

# Mel-spectrogram conversion helper
def wav_to_mel(path, sr=16000, n_mels=40, n_fft=512, hop_length=160, win_length=400):
    """Load an audio file and return its log-mel spectrogram.

    Args:
        path: Path of the audio file to load.
        sr: Sampling rate passed to ``librosa.load`` and the mel transform.
        n_mels: Number of mel bands (the feature dimension of the result).
        n_fft: FFT size used for the spectrogram.
        hop_length: Hop between successive frames, in samples.
        win_length: Analysis window length, in samples.

    Returns:
        2-D array of shape [time, n_mels] in decibels. ``ref=np.max`` makes
        the values relative to the clip's own peak, so the maximum is 0 dB.
    """
    y, _ = librosa.load(path, sr=sr)
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels,
        n_fft=n_fft, hop_length=hop_length, win_length=win_length,
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)
    # librosa returns [n_mels, time]; transpose to time-major [time, n_mels].
    return mel_db.T

In [11]:
# pytorch Dataset 클래스
import os
import torch
from torch.utils.data import Dataset

class WakeWordDataset(Dataset):
    """Binary wake-word dataset built from a directory of ``.wav`` files.

    ``root_dir`` must contain the sub-folders ``negative`` (label 0) and
    ``positive`` (label 1). Each item is the clip's mel spectrogram, cut or
    padded along time to exactly ``max_len`` frames, plus its label.

    Args:
        root_dir: Directory holding the ``negative`` and ``positive`` folders.
        max_len: Number of time frames every returned spectrogram has.
        pad_value: Value used to fill frames appended to short clips.
            NOTE(review): ``wav_to_mel`` returns dB values relative to the
            clip peak, so the default 0.0 equals the loudest level rather than
            silence. A floor such as -80.0 (librosa's default ``top_db``
            range, to be confirmed) may be more appropriate, provided
            inference pads the same way.
    """

    def __init__(self, root_dir, max_len=80, pad_value=0.0):
        self.data = []
        self.max_len = max_len
        self.pad_value = pad_value
        for label, subfolder in enumerate(['negative', 'positive']):
            folder = os.path.join(root_dir, subfolder)
            # os.listdir() order is arbitrary; sort so that the index -> file
            # mapping is reproducible across runs and machines.
            for fname in sorted(os.listdir(folder)):
                if fname.endswith(".wav"):
                    self.data.append((os.path.join(folder, fname), label))

    def __len__(self):
        """Return the number of (path, label) pairs found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(mel, label)`` as float32 tensors; mel is [max_len, n_mels]."""
        path, label = self.data[idx]
        mel = wav_to_mel(path)  # [T, n_mels]
        n_frames = mel.shape[0]
        # Fix the time dimension: truncate long clips, pad short ones at the end.
        if n_frames > self.max_len:
            mel = mel[:self.max_len]
        elif n_frames < self.max_len:
            mel = np.pad(
                mel,
                ((0, self.max_len - n_frames), (0, 0)),
                mode='constant',
                constant_values=self.pad_value,
            )

        mel_tensor = torch.tensor(mel, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.float32)
        return mel_tensor, label_tensor


In [None]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch

# Training loop
def train(model, dataset_path, epochs=3, batch_size=16, lr=1e-4,
          save_path="wakeword_finetuned.pth", device=None):
    """Train ``model`` on a wake-word dataset and save its weights.

    Args:
        model: Module whose forward pass returns one probability per sample;
            ``nn.BCELoss`` is applied directly to its output.
        dataset_path: Root directory handed to ``WakeWordDataset``.
        epochs: Number of passes over the dataset.
        batch_size: Mini-batch size for the shuffled ``DataLoader``.
        lr: Learning rate for the Adam optimizer.
        save_path: File the final ``state_dict`` is written to.
        device: Optional device (string or ``torch.device``). When given, the
            model is moved there. When ``None``, training runs on whatever
            device the model's parameters already live on.

    Returns:
        None. Prints the summed batch loss per epoch and writes ``save_path``.
    """
    dataset = WakeWordDataset(dataset_path)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    if device is None:
        # Follow the model so a model already placed on a GPU keeps working.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    else:
        device = torch.device(device)
        model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for x, y in loader:
            # The loader yields CPU tensors; without this move a model on a
            # CUDA device fails with a device-mismatch error.
            x, y = x.to(device), y.to(device)
            pred = model(x)
            loss = criterion(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Reported value is the sum of per-batch losses, not their mean.
        print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

    torch.save(model.state_dict(), save_path)
