# CNN Model

# 0. Hyperparameters

In [1]:
class Config:
    batch_size = 64
    learning_rate = 1e-3
    num_epochs = 20
    scheduler_gamma = 0.9
    train_loader = None
    test_loader = None

In [2]:
config = Config

## 0.1. Load Processor

In [3]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# 1. Preprocessing

## 1.1. Load Dataset

In [4]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
from datasets import load_dataset
prepared_dataset = load_dataset("./prepared_dataset")

Resolving data files:   0%|          | 0/129 [00:00<?, ?it/s]

In [6]:
# 제대로 불러왔는지 테스트해보자.
prepared_dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels', 'class'],
        num_rows: 66514
    })
})

## 1.2. Split Train Dataset and Validation Dataset

In [7]:
# train dataset과 test dataset으로 분리한다.
split_dataset = prepared_dataset['train'].train_test_split(test_size=0.01, seed=42)

In [8]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels', 'class'],
        num_rows: 65848
    })
    test: Dataset({
        features: ['input_features', 'labels', 'class'],
        num_rows: 666
    })
})

In [9]:
split_dataset.set_format("torch")

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [10]:
next(iter(train_dataset))

{'input_features': tensor([[ 0.3997,  0.5925,  0.5563,  ...,  0.4520,  0.4180,  0.5115],
         [ 0.7106,  0.7089,  0.4134,  ...,  0.4954,  0.3103,  0.6480],
         [ 0.6471,  0.6571,  0.6065,  ...,  0.4870,  0.4367,  0.5973],
         ...,
         [-0.3370, -0.3724, -0.3525,  ..., -0.2762, -0.3248, -0.2963],
         [-0.3280, -0.3598, -0.4122,  ..., -0.3429, -0.3176, -0.2677],
         [-0.2644, -0.5422, -0.5633,  ..., -0.3347, -0.3671, -0.3838]]),
 'labels': tensor([50258, 50264, 50359, 50363, 50257]),
 'class': tensor(16)}

## 1.3. Data Collate Function

In [11]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # 인풋 데이터와 라벨 데이터의 길이가 다르며, 따라서 서로 다른 패딩 방법이 적용되어야 한다. 그러므로 두 데이터를 분리해야 한다.
        # 먼저 오디오 인풋 데이터를 간단히 토치 텐서로 반환하는 작업을 수행한다.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Tokenize된 레이블 시퀀스를 가져온다.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # 레이블 시퀀스에 대해 최대 길이만큼 패딩 작업을 실시한다.
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # 패딩 토큰을 -100으로 치환하여 loss 계산 과정에서 무시되도록 한다.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # 이전 토크나이즈 과정에서 bos 토큰이 추가되었다면 bos 토큰을 잘라낸다.
        # 해당 토큰은 이후 언제든 추가할 수 있다.
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        class_list = [feature["class"] for feature in features]
        classes = torch.stack(class_list)
        classes -= 1 # torch.tensor([1, 2, 3, ..., 19])를 torch.tensor([0, 1, 2, ..., 18])로 변경

        return batch, classes

In [12]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## 1.4. DataLoader

In [13]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=Config.batch_size, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=Config.batch_size, collate_fn=data_collator)

- config에 train_loader와 test_loader를 추가한다.

In [14]:
config.train_loader = train_dataloader
config.test_loader = test_dataloader

In [15]:
next(iter(train_dataloader))

({'input_features': tensor([[[ 0.3997,  0.5925,  0.5563,  ...,  0.4520,  0.4180,  0.5115],
          [ 0.7106,  0.7089,  0.4134,  ...,  0.4954,  0.3103,  0.6480],
          [ 0.6471,  0.6571,  0.6065,  ...,  0.4870,  0.4367,  0.5973],
          ...,
          [-0.3370, -0.3724, -0.3525,  ..., -0.2762, -0.3248, -0.2963],
          [-0.3280, -0.3598, -0.4122,  ..., -0.3429, -0.3176, -0.2677],
          [-0.2644, -0.5422, -0.5633,  ..., -0.3347, -0.3671, -0.3838]],
 
         [[ 0.0030, -0.3602, -0.4643,  ..., -0.4643, -0.4643, -0.4643],
          [ 0.0299, -0.1788, -0.2602,  ..., -0.4643, -0.4643, -0.4643],
          [ 0.1383, -0.0552, -0.3880,  ..., -0.4643, -0.4643, -0.4643],
          ...,
          [-0.4643, -0.4643, -0.4643,  ..., -0.4643, -0.4643, -0.4643],
          [-0.4643, -0.4643, -0.4643,  ..., -0.4643, -0.4643, -0.4643],
          [-0.4643, -0.4643, -0.4643,  ..., -0.4643, -0.4643, -0.4643]],
 
         [[-0.4043,  0.1011,  0.2221,  ..., -0.6095, -0.6095, -0.6095],
         

# 2. Model Architecture

In [16]:
import torch
import torch.nn as nn

class MyNet(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, stride=(1, 2), padding=(1, 1)), # (?, 1, 80, 3000) -> (?, 8, 80, 1500)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)) # (?, 8, 80, 1500) -> (?, 8, 40, 750)
        self.conv2 = nn.Sequential(
            nn.Conv2d(8, 16, kernel_size=3, stride=(1, 2), padding=(1, 2)), # (?, 8, 40, 750) -> (?, 16, 40, 376)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)) # (?, 16, 40, 376) -> (?, 16, 20, 188)
        self.conv3 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=(1, 2), padding=(1, 1)), # (?, 16, 20, 188) -> (?, 32, 20, 94)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)) # (?, 32, 20, 94) -> (?, 32, 10, 47)
        self.conv4 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=(1, 2), padding=(1, 2)), # (?, 32, 10, 47) -> (?, 64, 10, 24)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)) # (?, 64, 10, 24) -> (?, 64, 5, 12)
        self.conv5 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=(1, 2), padding=(1, 1)), # (?, 64, 5, 12) -> (?, 128, 5, 6)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2))) # (?, 128, 5, 6) -> (?, 128, 5, 3)

        self.fc1 = nn.Linear(128*5*3, 128, bias=True) # (?, 256, 5, 3) -> (?, 128)
        nn.init.kaiming_normal_(self.fc1.weight, nonlinearity='relu') # He Initialization
        self.linear1 = nn.Sequential(
            self.fc1,
            nn.ReLU(),
            nn.Dropout(p=0.2)) # 20% dropout

        self.fc2 = nn.Linear(128, 64, bias=True) # (?, 128) -> (?, 64)
        nn.init.kaiming_normal_(self.fc2.weight, nonlinearity='relu')
        self.linear2 = nn.Sequential(
            self.fc2,
            nn.ReLU(),
            nn.Dropout(p=0.2)) # 20% dropout

        self.fc = nn.Linear(64, 19, bias=True) # (?, 64), (?, 19)

    def forward(self, x):
        x = x.view(-1, 1, 80, 3000)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = x.view(-1, 128*5*3)
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.fc(x)
        return x

model = MyNet().to(device)

# 적절한 optimizer, Scheduler를 선택: torch.optim을 사용
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=1e-4) # Adam Optimizer + L2 Regularization(weight_decay)
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=config.scheduler_gamma) # Exponential Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

# 적절한 Loss function을 선택: nn 모듈의 Function을 사용
criterion = nn.CrossEntropyLoss()

# 3. Training

## 3.1. Define Train, Evaluate, Predict Functions

In [17]:
from tqdm import tqdm
import numpy as np

def train(model, train_loader):
    model.train() # 모델을 훈련 모드로 설정합니다.
    train_loss = 0
    correct = 0

    with tqdm(total=len(train_loader), desc="Training") as pbar:
        for features, labels in train_loader:
            input_features = features['input_features']
            input_features = input_features.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad() # optimizer의 gradient를 초기화합니다.
            output = model(input_features) # 모델의 출력 결과를 저장합니다.
            
            loss = criterion(output, labels) # 손실함수를 계산합니다.
            loss.backward() # 역전파를 실행합니다.
            optimizer.step() # optimizer로 가중치를 업데이트합니다.
    
            train_loss += loss.item()
            prediction = output.max(1, keepdim = True)[1]
            correct += prediction.eq(labels.view_as(prediction)).sum().item()
            pbar.update(1)
      
    train_loss /= len(train_loader)
    train_accuracy = 100. * correct / len(train_loader.dataset)
    return train_loss, train_accuracy

def evaluate(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        with tqdm(total=len(test_loader), desc="Evaluating") as pbar:
            for features, labels in test_loader:
                input_features = features['input_features']
                input_features = input_features.to(device)
                labels = labels.to(device)
                
                output = model(input_features)
                
                test_loss += criterion(output, labels).item()
                prediction = output.max(1, keepdim = True)[1]
                correct += prediction.eq(labels.view_as(prediction)).sum().item()
                pbar.update(1)

    test_loss /= len(test_loader)
    test_accuracy = 100. * correct / len(test_loader.dataset)

    scheduler.step(test_loss)
    return test_loss, test_accuracy

def pred(model, test_loader):
    model.eval()
    pred_li = [] # 예측 결과를 모두 저장하는 리스트입니다.

    with torch.no_grad():
        with tqdm(total=len(test_loader), desc="Predicting") as pbar:
            for features, labels in test_loader:
                input_features = features['input_features']
                input_features = input_features.to(device)
                output = model(input_features)
                pred_li.append(output.cpu().numpy().argmax(axis=1))
                pbar.update(1)

    return np.concatenate(pred_li) # 예측 결과

## 3.2. Define Main Function

In [18]:
import os
import pandas as pd

def main(model, config:Config, save_dir):
    result_list = []

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    num_epochs = config.num_epochs
    for epoch in range(1, num_epochs+1):
        train_loss, train_accuracy = train(model, config.train_loader)
        val_loss, val_accuracy = evaluate(model, config.test_loader)
        print(f"[EPOCH: {epoch}], \tTrain Loss: {train_loss:.4f}, \tTrain Accuracy: {train_accuracy:.2f} %, \tVal Loss: {val_loss:.4f}, \tVal Accuracy: {val_accuracy:.2f} % \n")
        result = {
            'epoch': epoch,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy
        }
        scheduler.step(val_loss)
        result_list.append(result)
        
        torch.save({
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch
        }, os.path.join(save_dir, f'ckpt-{epoch}.pt'))
            

    result_df = pd.DataFrame(result_list)
    result_df.to_csv(f'{save_dir}/log_history.csv', index=False)

- 주의!! 아래는 실행에 주의할 것.

- 주의!! 아래는 실행에 주의할 것.

- 주의!! 아래는 실행에 주의할 것.

In [19]:
main(model, config, './save06')
"""
class Config:
    batch_size = 64
    learning_rate = 1e-3
    num_epochs = 20
scheuler: torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patient=2) (적용 됨)
"""
# EPOCH 13 : Val Accuracy: 96.40%

Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [07:29<00:00,  2.29it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:05<00:00,  1.95it/s]


[EPOCH: 1], 	Train Loss: 1.4472, 	Train Accuracy: 48.77 %, 	Val Loss: 0.8116, 	Val Accuracy: 71.77 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:06<00:00,  1.88it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.24it/s]


[EPOCH: 2], 	Train Loss: 0.7715, 	Train Accuracy: 72.24 %, 	Val Loss: 0.5108, 	Val Accuracy: 83.03 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:02<00:00,  1.90it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.30it/s]


[EPOCH: 3], 	Train Loss: 0.5421, 	Train Accuracy: 81.07 %, 	Val Loss: 0.4080, 	Val Accuracy: 85.89 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:47<00:00,  1.95it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:05<00:00,  2.13it/s]


[EPOCH: 4], 	Train Loss: 0.4115, 	Train Accuracy: 85.95 %, 	Val Loss: 0.2852, 	Val Accuracy: 89.79 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:50<00:00,  1.94it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.32it/s]


[EPOCH: 5], 	Train Loss: 0.3368, 	Train Accuracy: 88.64 %, 	Val Loss: 0.2689, 	Val Accuracy: 90.54 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:44<00:00,  1.96it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:05<00:00,  1.86it/s]


[EPOCH: 6], 	Train Loss: 0.2844, 	Train Accuracy: 90.37 %, 	Val Loss: 0.2147, 	Val Accuracy: 92.34 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:34<00:00,  2.00it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:05<00:00,  2.10it/s]


[EPOCH: 7], 	Train Loss: 0.2442, 	Train Accuracy: 91.72 %, 	Val Loss: 0.1742, 	Val Accuracy: 93.84 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:12<00:00,  1.86it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.27it/s]


[EPOCH: 8], 	Train Loss: 0.2173, 	Train Accuracy: 92.59 %, 	Val Loss: 0.1593, 	Val Accuracy: 94.29 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:12<00:00,  1.86it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:05<00:00,  2.13it/s]


[EPOCH: 9], 	Train Loss: 0.1937, 	Train Accuracy: 93.45 %, 	Val Loss: 0.1480, 	Val Accuracy: 94.29 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:13<00:00,  1.86it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:05<00:00,  1.85it/s]


[EPOCH: 10], 	Train Loss: 0.1782, 	Train Accuracy: 93.91 %, 	Val Loss: 0.1565, 	Val Accuracy: 94.14 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:08<00:00,  1.88it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:06<00:00,  1.82it/s]


[EPOCH: 11], 	Train Loss: 0.1176, 	Train Accuracy: 95.98 %, 	Val Loss: 0.1125, 	Val Accuracy: 95.80 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:10<00:00,  1.87it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.24it/s]


[EPOCH: 12], 	Train Loss: 0.1038, 	Train Accuracy: 96.40 %, 	Val Loss: 0.1109, 	Val Accuracy: 96.10 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:04<00:00,  1.89it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.62it/s]


[EPOCH: 13], 	Train Loss: 0.0943, 	Train Accuracy: 96.71 %, 	Val Loss: 0.1055, 	Val Accuracy: 96.40 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:59<00:00,  1.91it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:05<00:00,  2.06it/s]


[EPOCH: 14], 	Train Loss: 0.0896, 	Train Accuracy: 96.90 %, 	Val Loss: 0.1061, 	Val Accuracy: 95.80 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [09:01<00:00,  1.90it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.42it/s]


[EPOCH: 15], 	Train Loss: 0.0818, 	Train Accuracy: 97.17 %, 	Val Loss: 0.1012, 	Val Accuracy: 96.25 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:48<00:00,  1.95it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.56it/s]


[EPOCH: 16], 	Train Loss: 0.0803, 	Train Accuracy: 97.13 %, 	Val Loss: 0.1011, 	Val Accuracy: 96.10 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:58<00:00,  1.91it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.32it/s]


[EPOCH: 17], 	Train Loss: 0.0798, 	Train Accuracy: 97.20 %, 	Val Loss: 0.1017, 	Val Accuracy: 96.25 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:39<00:00,  1.98it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.59it/s]


[EPOCH: 18], 	Train Loss: 0.0784, 	Train Accuracy: 97.27 %, 	Val Loss: 0.1017, 	Val Accuracy: 96.25 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:46<00:00,  1.95it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.43it/s]


[EPOCH: 19], 	Train Loss: 0.0771, 	Train Accuracy: 97.36 %, 	Val Loss: 0.1018, 	Val Accuracy: 96.10 % 



Training: 100%|████████████████████████████████████████████████████████████████████| 1029/1029 [08:50<00:00,  1.94it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.50it/s]


[EPOCH: 20], 	Train Loss: 0.0781, 	Train Accuracy: 97.28 %, 	Val Loss: 0.1017, 	Val Accuracy: 96.10 % 



'\nclass Config:\n    batch_size = 64\n    learning_rate = 1e-3\n    num_epochs = 20\nscheuler: torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patient=2) (적용 됨)\n'

# Practice

In [20]:
p = torch.tensor([[0.2, 0.3, 0.4], [0.3, 0.4, 0.2]]).max(1, keepdim = True)[1]
p

tensor([[2],
        [1]])

In [21]:
l = torch.tensor([1, 1])
p.eq(l.view_as(p)).sum().item()

1