In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

# 데이터셋 클래스 정의
class AudioFolderDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.file_paths = []
        self.labels = []
        self.label_map = {}

        self._load_dataset()
    def _load_dataset(self):
        label_names = os.listdir(self.root_dir)
        for idx, label_name in enumerate(label_names):
            label_dir = os.path.join(self.root_dir, label_name)
            if os.path.isdir(label_dir):
                self.label_map[label_name] = idx
                for file_name in os.listdir(label_dir):
                    if file_name.endswith('.wav'):
                        self.file_paths.append(os.path.join(label_dir, file_name))
                        self.labels.append(idx)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        waveform, sample_rate = torchaudio.load(file_path)
        if self.transform:
            waveform = self.transform(waveform)
        return waveform, label

# 전처리 함수 정의
def preprocess(waveform, new_sample_rate=8000):
    transform = torchaudio.transforms.Resample(orig_freq=waveform.shape[1], new_freq=new_sample_rate)
    return transform(waveform)

# 모델 정의
class CNNClassifier(nn.Module):
    def __init__(self, input_shape, num_label):
        super(CNNClassifier, self).__init__()
        self.norm_layer = nn.BatchNorm2d(1)
        self.resizing = nn.Upsample(size=(32, 32), mode='bilinear', align_corners=False)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout1 = nn.Dropout(0.25)
        self.flatten = nn.Flatten()

        # CNN 레이어 이후 출력 크기 계산
        with torch.no_grad():
            sample_input = torch.zeros(1, *input_shape).unsqueeze(0)
            sample_output = self._forward_features(sample_input)
            flatten_size = sample_output.numel()

        self.fc1 = nn.Linear(flatten_size, 128)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_label)

    def _forward_features(self, x):
        x = self.resizing(x)
        x = self.norm_layer(x)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = self.dropout1(x)
        return x

    def forward(self, x):
        x = self._forward_features(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

# 데이터 준비
data_dir = '/content/drive/MyDrive/살길찾기/rebalanced_dataset/'
dataset = AudioFolderDataset(data_dir, transform=preprocess)

train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 모델 초기화
input_shape = (1, 32, 32)  # 입력 크기를 (채널, 높이, 너비)로 설정
num_label = len(dataset.label_map)
model = CNNClassifier(input_shape, num_label)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 훈련 루프
num_epochs = 20
best_val_accuracy = 0.0
best_model_path = 'best_model.pth'
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.unsqueeze(1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')

    # 평가
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.unsqueeze(1)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    val_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Validation Accuracy: {val_accuracy}')

    # 최적 모델 저장
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), best_model_path)
        print(f'Saved Best Model with Accuracy: {val_accuracy}')

accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {accuracy}')



ValueError: Input and output must have the same number of spatial dimensions, but got input with spatial dimensions of [1, 32, 32] and output size of (32, 32). Please provide input tensor in (N, C, d1, d2, ...,dK) format and output size in (o1, o2, ...,oK) format.

In [None]:
# 모델 평가
model.load_state_dict(torch.load(best_model_path))
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.unsqueeze(1)  # (batch_size, 1, 32, 32) 형태로 변환
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Accuracy: {accuracy}')

일단 이코드로

In [4]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

# 폴더 구조를 기반으로 데이터셋 클래스를 정의
class AudioFolderDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.file_paths = []
        self.labels = []
        self.label_map = {}

        self._load_dataset()

    def _load_dataset(self):
        label_names = os.listdir(self.root_dir)
        for idx, label_name in enumerate(label_names):
            label_dir = os.path.join(self.root_dir, label_name)
            if os.path.isdir(label_dir):
                self.label_map[label_name] = idx
                for file_name in os.listdir(label_dir):
                    if file_name.endswith('.wav'):
                        self.file_paths.append(os.path.join(label_dir, file_name))
                        self.labels.append(idx)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        waveform, sample_rate = torchaudio.load(file_path)
        if self.transform:
            waveform = self.transform(waveform)
        return waveform, label

# 전처리 함수 정의
def preprocess(waveform, new_sample_rate=8000):
    transform = torchaudio.transforms.Resample(orig_freq=waveform.shape[1], new_freq=new_sample_rate)
    return transform(waveform)

# CNN 모델 정의
class CNNClassifier(nn.Module):
    def __init__(self, input_shape, num_label):
        super(CNNClassifier, self).__init__()
        self.norm_layer = nn.BatchNorm2d(1)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout1 = nn.Dropout(0.25)
        self.flatten = nn.Flatten()

        # CNN 레이어 이후 출력 크기 계산
        with torch.no_grad():
            sample_input = torch.zeros(1, *input_shape).unsqueeze(0)
            sample_output = self._forward_features(sample_input)
            flatten_size = sample_output.numel()

        self.fc1 = nn.Linear(flatten_size, 128)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_label)

    def _forward_features(self, x):
        x = F.interpolate(x, size=(32, 32), mode='bilinear', align_corners=False)
        x = self.norm_layer(x)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = self.dropout1(x)
        return x

    def forward(self, x):
        x = self._forward_features(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

# 데이터 준비
data_dir = '/content/drive/MyDrive/살길찾기/rebalanced_dataset/'  # 데이터 폴더 경로
dataset = AudioFolderDataset(data_dir, transform=preprocess)

train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 모델 초기화
input_shape = (1, 8000)  # 입력 크기를 (채널, 길이)로 설정
num_label = len(dataset.label_map)
model = CNNClassifier(input_shape, num_label)

# 손실 함수 및 옵티마이저 정의
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 모델 훈련 및 검증
num_epochs = 50
best_val_accuracy = 0.0
best_model_path = '/content/drive/MyDrive/살길찾기/model/model_1.pth'

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.unsqueeze(1)  # (batch_size, 1, 8000) 형태로 변환
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')

    # 검증 루프
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.unsqueeze(1)  # (batch_size, 1, 8000) 형태로 변환
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Validation Accuracy: {val_accuracy}')

    # 최적 모델 저장
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), best_model_path)
        print(f'Saved Best Model with Accuracy: {val_accuracy}')

Epoch 1, Loss: 1.60326968299018
Validation Accuracy: 0.39285714285714285
Saved Best Model with Accuracy: 0.39285714285714285
Epoch 2, Loss: 1.3726365168889363
Validation Accuracy: 0.39285714285714285
Epoch 3, Loss: 1.2301290300157335
Validation Accuracy: 0.39285714285714285
Epoch 4, Loss: 1.0946548117531671
Validation Accuracy: 0.35714285714285715
Epoch 5, Loss: 1.029762367407481
Validation Accuracy: 0.42857142857142855
Saved Best Model with Accuracy: 0.42857142857142855
Epoch 6, Loss: 0.8639911744329665
Validation Accuracy: 0.42857142857142855
Epoch 7, Loss: 0.8250906334982978
Validation Accuracy: 0.6071428571428571
Saved Best Model with Accuracy: 0.6071428571428571
Epoch 8, Loss: 0.7488950888315836
Validation Accuracy: 0.5
Epoch 9, Loss: 0.6382901966571808
Validation Accuracy: 0.6071428571428571
Epoch 10, Loss: 0.6185900304052565
Validation Accuracy: 0.5714285714285714
Epoch 11, Loss: 0.46140480869346195
Validation Accuracy: 0.5714285714285714
Epoch 12, Loss: 0.4423432730966144
Valid

In [5]:
best_model_path = '/content/drive/MyDrive/살길찾기/model/model_1.pth'
# 모델 평가
model.load_state_dict(torch.load(best_model_path))
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.unsqueeze(1)  # (batch_size, 1, 8000) 형태로 변환
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.6896551724137931


아래는 일단 패스

In [None]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

# 데이터셋 클래스 정의
class AudioFolderDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.file_paths = []
        self.labels = []
        self.label_map = {}

        self._load_dataset()

    def _load_dataset(self):
        label_names = os.listdir(self.root_dir)
        for idx, label_name in enumerate(label_names):
            label_dir = os.path.join(self.root_dir, label_name)
            if os.path.isdir(label_dir):
                self.label_map[label_name] = idx
                for file_name in os.listdir(label_dir):
                    if file_name.endswith('.wav'):
                        self.file_paths.append(os.path.join(label_dir, file_name))
                        self.labels.append(idx)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        waveform, sample_rate = torchaudio.load(file_path)
        if self.transform:
            waveform = self.transform(waveform)
        return waveform, label

# 전처리 함수 정의
def preprocess(waveform, new_sample_rate=8000):
    transform = torchaudio.transforms.Resample(orig_freq=waveform.shape[1], new_freq=new_sample_rate)
    return transform(waveform)

def mel_spectrogram(waveform, sample_rate=8000, n_mels=64, n_fft=1024, hop_length=512):
    mel_spec_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels
    )
    return mel_spec_transform(waveform)

# CNN 모델 정의
class CNNClassifier(nn.Module):
    def __init__(self, input_shape, num_classes):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)

        # Conv 레이어를 거친 후의 출력 크기를 계산하여 Flatten 레이어의 입력 크기로 설정
        self.flatten_size = self._get_flatten_size(input_shape)

        # self.fc1 = nn.Linear(self.flatten_size, 256)
        self.fc1 = nn.Linear(2048, 256)  # fc1의 입력 크기를 15360으로 수정
        self.fc2 = nn.Linear(256, num_classes)

    def _get_flatten_size(self, input_shape):
        with torch.no_grad():
            x = torch.zeros(1, *input_shape)
            x = self._forward_features(x)
            return x.view(1, -1).size(1)

    def _forward_features(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.dropout1(x)
        return x

    def forward(self, x):
        x = self._forward_features(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

# 데이터 준비
# data_dir = '/content/drive/MyDrive/살길찾기/donateacry_corpus'
data_dir = '/content/drive/MyDrive/살길찾기/rebalanced_dataset/'  # 데이터 폴더 경로
dataset = AudioFolderDataset(data_dir, transform=preprocess)

# 데이터셋을 멜-스펙트로그램으로 변환
class MelSpectrogramDataset(Dataset):
    def __init__(self, dataset, sample_rate=8000):
        self.dataset = dataset
        self.sample_rate = sample_rate

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        waveform, label = self.dataset[idx]
        mel_spec = mel_spectrogram(waveform, sample_rate=self.sample_rate)
        mel_spec = mel_spec.squeeze(0).unsqueeze(0)  # (1, n_mels, time) 형태로 변환
        return mel_spec, label

mel_dataset = MelSpectrogramDataset(dataset)

train_size = int(0.7 * len(mel_dataset))
val_size = int(0.15 * len(mel_dataset))
test_size = len(mel_dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(mel_dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 모델 초기화
input_shape = (1, 64, 126)  # (채널, n_mels, time)으로 설정
num_classes = len(dataset.label_map)
model = CNNClassifier(input_shape, num_classes)

# 손실 함수 및 옵티마이저 정의
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 모델 훈련 및 검증
num_epochs = 1000
best_val_accuracy = 0.0
best_model_path = '/content/drive/MyDrive/살길찾기/model/best_rebalanced_model_1.pth'

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')

    # 검증 루프
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Validation Accuracy: {val_accuracy}')

    # 최적 모델 저장
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), best_model_path)
        print(f'Saved Best Model with Accuracy: {val_accuracy}')



In [None]:
# 모델 평가
model.load_state_dict(torch.load(best_model_path))
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Accuracy: {accuracy}')