<a href="https://colab.research.google.com/github/ShogoNoguchi/Comparison_Using-TPU-and-GPU-in-Audio-Multi-Class-Classification/blob/main/%E9%9F%B3%E5%A3%B0%E5%88%86%E9%A1%9EonGPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import torchaudio
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

# Create the directory the datasets are downloaded into.
# exist_ok=True makes this idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# Target label definitions: ten command words get their own class, and every
# other word is folded into a single trailing "unknown" class.
target_labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
unknown_label = 'unknown'
unique_labels = [*target_labels, unknown_label]
# Class id == position of the label in unique_labels.
label_map = dict(zip(unique_labels, range(len(unique_labels))))

# Load the dataset using the official training / validation / testing split.
# The splits are constructed in that order, each with download=True.
train_dataset, validation_dataset, test_dataset = (
    torchaudio.datasets.SPEECHCOMMANDS(root=data_dir, subset=split_name, download=True)
    for split_name in ('training', 'validation', 'testing')
)

class SpeechCommandsDataset(Dataset):
    """Serve ``(mfcc, label_id)`` pairs from a wrapped speech-command dataset.

    Every item of the wrapped dataset must unpack to
    ``(waveform, sample_rate, label, _, _)``. Each waveform is resampled to
    ``sample_rate`` when needed, standardised, padded or truncated to
    ``max_length`` samples along dimension 1, and converted to MFCC features.

    Args:
        dataset: Indexable source dataset with ``__len__``.
        sample_rate: Target sample rate passed to the MFCC transform.
        n_mfcc: Number of MFCC coefficients (also used as ``n_mels``).
        max_length: Number of samples every waveform is padded/cut to.
        label_map: Optional mapping from label string to integer id. When it
            is ``None`` (or empty) the label string itself is returned.
        known_labels: Labels that keep their own class. Defaults to the
            module-level ``target_labels``.
        fallback_label: Label substituted for anything not in
            ``known_labels``. Defaults to the module-level ``unknown_label``.
    """

    def __init__(self, dataset, sample_rate=16000, n_mfcc=40, max_length=16000,
                 label_map=None, known_labels=None, fallback_label=None):
        self.dataset = dataset
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.max_length = max_length
        self.label_map = label_map

        # Snapshot the label policy once; a frozenset gives O(1) membership
        # tests instead of scanning a list for every item.
        self.known_labels = frozenset(
            target_labels if known_labels is None else known_labels
        )
        self.fallback_label = (
            unknown_label if fallback_label is None else fallback_label
        )

        # Preprocessing transforms. The 16 kHz resampler is kept as a public
        # attribute; resamplers for other source rates are created lazily.
        self.resample_transform = torchaudio.transforms.Resample(orig_freq=16000, new_freq=sample_rate)
        self._resamplers = {16000: self.resample_transform}
        self.mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=sample_rate,
            n_mfcc=n_mfcc,
            melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": n_mfcc},
        )

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        return len(self.dataset)

    def _resample(self, waveform, orig_rate):
        """Resample ``waveform`` from ``orig_rate`` to ``self.sample_rate``."""
        resampler = self._resamplers.get(orig_rate)
        if resampler is None:
            # Build the transform with the clip's real source rate so the
            # resampling ratio is correct for non-16 kHz input.
            resampler = torchaudio.transforms.Resample(
                orig_freq=orig_rate, new_freq=self.sample_rate
            )
            self._resamplers[orig_rate] = resampler
        return resampler(waveform)

    def __getitem__(self, idx):
        """Return ``(mfcc, label_id)`` for the item at ``idx``."""
        # Fetch the raw item.
        waveform, original_sample_rate, label, _, _ = self.dataset[idx]

        # Map every non-target label onto the fallback ("unknown") label.
        if label not in self.known_labels:
            label = self.fallback_label

        # Resample only when the source rate differs from the target rate.
        if original_sample_rate != self.sample_rate:
            waveform = self._resample(waveform, original_sample_rate)

        # Standardise. A silent clip has std == 0 and a one-sample clip has
        # std == NaN; dividing by either would fill the features with
        # NaN/inf, so in those cases only the mean is removed.
        centered = waveform - waveform.mean()
        std = waveform.std()
        if torch.isfinite(std) and std > 0:
            waveform = centered / std
        else:
            waveform = centered

        # Force a fixed length: zero-pad on the right or truncate.
        if waveform.size(1) < self.max_length:
            padding = self.max_length - waveform.size(1)
            waveform = torch.nn.functional.pad(waveform, (0, padding))
        else:
            waveform = waveform[:, :self.max_length]

        # Convert to MFCC features.
        mfcc = self.mfcc_transform(waveform)

        # Convert the label to its numeric id when a mapping was supplied.
        label_id = self.label_map[label] if self.label_map else label

        return mfcc, label_id

class SpeechCommandClassifier(nn.Module):
    """Two-layer CNN followed by two fully connected layers.

    Input is a batch shaped ``(batch, 1, n_mfcc, n_frames)``; output is a
    batch of ``num_classes`` unnormalised scores.

    Args:
        n_mfcc: Height of the input feature map.
        num_classes: Number of output classes. ``None`` (the default) uses
            ``len(label_map)`` of the module-level ``label_map``, looked up
            when the model is constructed.
        n_frames: Width (number of time frames) of the input feature map.
            The default of 101 is the value the original code hard-coded;
            presumably the MFCC frame count for 16000 samples at hop length
            160 (TODO confirm).
    """

    def __init__(self, n_mfcc=40, num_classes=None, n_frames=101):
        super().__init__()
        if num_classes is None:
            # Resolved at call time so the default follows the current
            # label_map instead of the one that existed at class definition.
            num_classes = len(label_map)

        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # Push a dummy input through the conv stack to learn how many
        # features remain after flattening.
        with torch.no_grad():
            sample_input = torch.zeros(1, 1, n_mfcc, n_frames)
            out = self.pool(F.relu(self.conv1(sample_input)))
            out = self.pool(F.relu(self.conv2(out)))
            flattened_size = out.numel()

        self.fc1 = nn.Linear(flattened_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class scores for a ``(batch, 1, n_mfcc, n_frames)`` input."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # flatten everything but the batch dim
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Wrap each official split in the MFCC-producing dataset, sharing one label map.
train_data, validation_data, test_data = (
    SpeechCommandsDataset(split, label_map=label_map)
    for split in (train_dataset, validation_dataset, test_dataset)
)

# Build the data loaders; only the training loader shuffles.
train_loader, validation_loader, test_loader = (
    DataLoader(wrapped, batch_size=64, shuffle=reshuffle)
    for wrapped, reshuffle in (
        (train_data, True),
        (validation_data, False),
        (test_data, False),
    )
)

# Device selection: use CUDA when available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Instantiate the model, then move it onto the selected device.
model = SpeechCommandClassifier(n_mfcc=40, num_classes=len(label_map))
model = model.to(device)

# Loss function and optimizer.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader`` and report metrics.

    Args:
        dataloader: Iterable of ``(X, y)`` batches.
        model: Module called as ``model(X)``; put into training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.

    Returns:
        ``(avg_loss, accuracy)``: sample-weighted mean loss and the fraction
        of correct predictions. Both are NaN when no samples were seen.
    """
    model.train()
    # Send batches to wherever the model's parameters live rather than
    # relying on a module-level device variable. A parameter-free model
    # leaves the batches where they are.
    first_param = next(model.parameters(), None)

    total_loss = 0
    correct = 0
    total_samples = 0
    for batch, (X, y) in enumerate(dataloader):
        if first_param is not None:
            X = X.to(first_param.device)
            y = y.to(first_param.device)
        # X is passed to the model as-is; no extra channel dimension is added.
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller batch does not skew the average.
        total_loss += loss.item() * X.size(0)
        correct += (pred.argmax(1) == y).sum().item()
        total_samples += X.size(0)

        if batch % 100 == 0:
            print(f"Batch {batch}, Loss: {loss.item():>7f}")

    # An empty dataloader has undefined metrics; report NaN instead of
    # raising ZeroDivisionError.
    avg_loss = total_loss / total_samples if total_samples else float('nan')
    accuracy = correct / total_samples if total_samples else float('nan')
    print(f"Train - Avg loss: {avg_loss:>8f}, Accuracy: {(100 * accuracy):>0.1f}%")
    return avg_loss, accuracy

def test_loop(dataloader, model, loss_fn):
    model.eval()
    total_loss = 0
    correct = 0
    total_samples = 0
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            # 不要な次元の追加を避ける
            # X = X.unsqueeze(1)  # この行は不要
            pred = model(X)
            total_loss += loss_fn(pred, y).item() * X.size(0)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            total_samples += X.size(0)

    avg_loss = total_loss / total_samples
    accuracy = correct / total_samples
    print(f"Test - Avg loss: {avg_loss:>8f}, Accuracy: {(100 * accuracy):>0.1f}%")

# Training driver: each epoch trains on the training loader, then evaluates
# on the validation loader.
epochs = 5
for epoch in range(epochs):
    epoch_banner = f"Epoch {epoch+1}\n-------------------------------"
    print(epoch_banner)
    train_loop(train_loader, model, loss_fn, optimizer)
    test_loop(validation_loader, model, loss_fn)
print("訓練完了！")


100%|██████████| 2.26G/2.26G [00:09<00:00, 245MB/s]


Epoch 1
-------------------------------
Batch 0, Loss: 4.286769
Batch 100, Loss: 1.445020
Batch 200, Loss: 1.190274
Batch 300, Loss: 1.205915
Batch 400, Loss: 0.936719
Batch 500, Loss: 0.794121
Batch 600, Loss: 0.670147
Batch 700, Loss: 0.869510
Batch 800, Loss: 0.909492
Batch 900, Loss: 0.641240
Batch 1000, Loss: 0.605613
Batch 1100, Loss: 0.526469
Batch 1200, Loss: 0.308218
Batch 1300, Loss: 0.542874
Train - Avg loss: 0.932779, Accuracy: 72.6%
Test - Avg loss: 0.563725, Accuracy: 82.4%
Epoch 2
-------------------------------
Batch 0, Loss: 0.541320
Batch 100, Loss: 0.618990
Batch 200, Loss: 0.392678
Batch 300, Loss: 0.439378
Batch 400, Loss: 0.643299
Batch 500, Loss: 0.655211
Batch 600, Loss: 0.520213
Batch 700, Loss: 0.239718
Batch 800, Loss: 0.486059
Batch 900, Loss: 0.494806
Batch 1000, Loss: 0.546382
Batch 1100, Loss: 0.394878
Batch 1200, Loss: 0.261218
Batch 1300, Loss: 0.227475
Train - Avg loss: 0.470116, Accuracy: 84.9%
Test - Avg loss: 0.458786, Accuracy: 84.8%
Epoch 3
------