## import

In [4]:
import PIL

In [5]:
import librosa

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
from PIL import Image
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import torch
import torchmetrics
import os
import warnings

warnings.filterwarnings('ignore')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [18]:
import torch

print("PyTorch 버전:", torch.__version__)
print("CUDA 버전:", torch.version.cuda)
print("CUDA 사용 가능:", torch.cuda.is_available())
print("cuDNN 사용 가능:", torch.backends.cudnn.enabled)

if torch.cuda.is_available():
    print("GPU 수:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i} 이름:", torch.cuda.get_device_name(i))
else:
    print("GPU를 찾을 수 없습니다. CPU를 사용합니다.")

print(device)
print("cuDNN 버전:", torch.backends.cudnn.version())

PyTorch 버전: 1.12.1+cu116
CUDA 버전: 11.6
CUDA 사용 가능: True
cuDNN 사용 가능: True
GPU 수: 1
GPU 0 이름: NVIDIA GeForce RTX 2060
cuda
cuDNN 버전: 8302


## Config

In [88]:
class Config:
    #SR = 32000
    SR = 16000
    N_MFCC = 13
    # Dataset
    ROOT_FOLDER = './'
    # Training
    N_CLASSES = 2
    BATCH_SIZE = 96
    N_EPOCHS = 5
    LR = 3e-4
    # Others
    SEED = 42
    
CONFIG = Config()

## Seed

In [8]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED) # Seed 고정

## Train & Validation Split - 전처리

In [258]:
print(df.head())

         id                  path label
0  RUNQPNJF  ./train/RUNQPNJF.ogg  real
1  JFAWUOGJ  ./train/JFAWUOGJ.ogg  fake
2  RDKEKEVX  ./train/RDKEKEVX.ogg  real
3  QYHJDOFK  ./train/QYHJDOFK.ogg  real
4  RSPQNHAO  ./train/RSPQNHAO.ogg  real


In [259]:
print(df.shape)

(55438, 3)


In [52]:
df = pd.read_csv('./train.csv')
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CONFIG.SEED)

In [48]:
import inspect
print(inspect.signature(librosa.feature.mfcc))

(*, y: Optional[numpy.ndarray] = None, sr: float = 22050, S: Optional[numpy.ndarray] = None, n_mfcc: int = 20, dct_type: int = 2, norm: Optional[str] = 'ortho', lifter: float = 0, **kwargs: Any) -> numpy.ndarray


In [56]:
def add_noise(data, noise_factor=0.005):
    noise = np.random.randn(len(data))
    augmented_data = data + noise_factor * noise
    # Cast back to same data type
    augmented_data = augmented_data.astype(type(data[0]))
    return augmented_data

def change_pitch(data, sampling_rate = CONFIG.SR, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data,  sr=CONFIG.SR, n_steps=pitch_factor)

def stretch_time(data, stretch_factor=0.8):
    return librosa.effects.time_stretch(data, rate=stretch_factor)

def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        path = row['path']
        y, sr = librosa.load(path, sr=CONFIG.SR)
        
        if train_mode:
            y = add_noise(y)
            y = change_pitch(y)
            y = stretch_time(y)
        
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)

        if train_mode:
            label = row['label']
            label_vector = np.zeros(CONFIG.N_CLASSES, dtype=float)
            label_vector[0 if label == 'fake' else 1] = 1
            labels.append(label_vector)

    if train_mode:
        return np.array(features), np.array(labels)
    return np.array(features)

# Augmented training data
train_data = pd.read_csv('./train.csv')

train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CONFIG.SEED)

# Extract MFCC features for training and validation sets
train_mfcc, train_labels = get_mfcc_feature(train, train_mode=True)
val_mfcc, val_labels = get_mfcc_feature(val, train_mode=True) 

44350it [1:07:12, 11.00it/s]
11088it [16:48, 10.99it/s]


## Data Pre-processing : MFCC

In [265]:
def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)

        if train_mode:
            label = row['label']
            label_vector = np.zeros(CONFIG.N_CLASSES, dtype=float)
            label_vector[0 if label == 'fake' else 1] = 1
            labels.append(label_vector)

    if train_mode:
        return features, labels
    return features
    
train_mfcc, train_labels = get_mfcc_feature(train, True)
val_mfcc, val_labels = get_mfcc_feature(val, True)

AttributeError: 'function' object has no attribute 'iterrows'

## CustomDataset

In [266]:
# MFCC 형태 확인
print(f"MFCC shape: {np.array(train_mfcc).shape}")
print(f"First MFCC feature shape: {np.array(train_mfcc[0]).shape}")

MFCC shape: (44350, 13)
First MFCC feature shape: (13,)


In [60]:
class CustomDataset(Dataset):
    def __init__(self, mfcc, label):
        self.mfcc = mfcc
        self.label = label

    def __len__(self):
        return len(self.mfcc)

    def __getitem__(self, index):
        if self.label is not None:
            return self.mfcc[index], self.label[index]
        return self.mfcc[index]
   
train_dataset = CustomDataset(train_mfcc, train_labels)
val_dataset = CustomDataset(val_mfcc, val_labels)
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=False
)

In [108]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchaudio

# Custom Dataset 클래스
class CustomDataset(Dataset):
    def __init__(self, mfcc, labels):
        self.mfcc = mfcc
        self.labels = labels

    def __len__(self):
        return len(self.mfcc)

    def __getitem__(self, index):
        mfcc = self.mfcc[index]
        label = self.labels[index]
        # reshape to (seq_len, feature_dim)
        mfcc = mfcc.reshape(-1, mfcc.shape[-1])
        return mfcc, label

train_dataset = CustomDataset(train_mfcc, train_labels)
val_dataset = CustomDataset(val_mfcc, val_labels)
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=False
)

## Define Model

In [61]:
class MLP(nn.Module):
    def __init__(self, input_dim=CONFIG.N_MFCC, hidden_dim=128, output_dim=CONFIG.N_CLASSES):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        x = torch.sigmoid(x)
        return x

In [262]:
class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        # fc1의 입력 크기를 일단 placeholder 값으로 설정합니다.
        self.fc1 = nn.Linear(64 * 10 * 43, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        print(f"Input shape: {x.shape}")
        x = self.pool(F.relu(self.conv1(x)))
        print(f"After conv1 and pool: {x.shape}")
        x = self.pool(F.relu(self.conv2(x)))
        print(f"After conv2 and pool: {x.shape}")
        x = x.view(x.size(0), -1)  # Flatten the tensor
        print(f"After flatten: {x.shape}")
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

dummy_input = torch.randn(1, 1, 40, 174)  # 배치 크기 1, 채널 1, MFCC 크기 (40, 174)
model = AudioCNN()
output = model(dummy_input)

Input shape: torch.Size([1, 1, 40, 174])
After conv1 and pool: torch.Size([1, 32, 20, 87])
After conv2 and pool: torch.Size([1, 64, 10, 43])
After flatten: torch.Size([1, 27520])


## Train & Validation

In [3]:
# 학습 루프
for epoch in range(CONFIG.N_EPOCHS):
    model.train()
    running_loss = 0.0
    num_batches = len(train_loader)
    correct = 0
    total = 0
    
    # tqdm을 사용하여 진행률 막대 추가
    progress_bar = tqdm(enumerate(train_loader), total=num_batches, desc=f"Epoch {epoch+1}/{CONFIG.N_EPOCHS}")
    
    for i, (features, labels) in progress_bar:
        features = features.unsqueeze(1).float().to(device)  # 채널 차원 추가 및 FloatTensor로 변환
        labels = labels.float().to(device)  # FloatTensor로 변환
        
        optimizer.zero_grad()
        output = model(features)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        # 정확도 계산
        predicted = (output.squeeze() > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        accuracy = 100 * correct / total
        
        # 진행률 막대에 손실 값과 정확도 업데이트
        progress_bar.set_postfix(loss=running_loss/(i+1), accuracy=accuracy)
    
    epoch_loss = running_loss / num_batches
    print(f"Epoch [{epoch+1}/{CONFIG.N_EPOCHS}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%")

# 평가 루프 (선택적)
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for features, labels in val_loader:
        features = features.unsqueeze(1).float().to(device)
        labels = labels.float().to(device)
        output = model(features)
        predicted = (output.squeeze() > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    
    accuracy = 100 * correct / total
    print(f'Validation Accuracy: {accuracy:.2f}%')

NameError: name 'CONFIG' is not defined

In [1]:
from sklearn.metrics import roc_auc_score

def train(model, optimizer, train_loader, val_loader, device, num_epochs=10):
    model.to(device)
    criterion = nn.BCELoss().to(device)
    
    best_val_score = 0
    best_model = None
    
    for epoch in range(1, num_epochs+1):
        model.train()
        train_loss = []
        for features, labels in tqdm(iter(train_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            optimizer.zero_grad()
            
            output = model(features)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}]')
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    
    return best_model

def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss, all_labels, all_probs = [], [], []
    
    with torch.no_grad():
        for features, labels in tqdm(iter(val_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            probs = model(features)
            
            loss = criterion(probs, labels)
            val_loss.append(loss.item())

            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())
        
        _val_loss = np.mean(val_loss)
        all_labels = np.concatenate(all_labels, axis=0)
        all_probs = np.concatenate(all_probs, axis=0)
        
        auc_score = multiLabel_AUC(all_labels, all_probs)
    
    return _val_loss, auc_score

def multiLabel_AUC(y_true, y_scores):
    auc_scores = []
    for i in range(y_true.shape[1]):
        auc = roc_auc_score(y_true[:, i], y_scores[:, i])
        auc_scores.append(auc)
    mean_auc_score = np.mean(auc_scores)
    return mean_auc_score


## Run

In [257]:
model = SimpleCNN()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CONFIG.LR)

infer_model = train(model, optimizer, train_loader, val_loader, device)

  0%|                                                                        | 0/462 [00:00<?, ?it/s]


RuntimeError: Given groups=1, weight of size [32, 1, 3, 3], expected input[1, 96, 1, 13] to have 1 channels, but got 96 channels instead

In [63]:
model = MLP()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CONFIG.LR)

infer_model = train(model, optimizer, train_loader, val_loader, device)

100%|█████████████████████████████████████████████████████████████| 462/462 [00:01<00:00, 245.89it/s]
100%|█████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 840.16it/s]


Epoch [1], Train Loss : [0.51068] Val Loss : [0.39299] Val AUC : [0.90903]


100%|█████████████████████████████████████████████████████████████| 462/462 [00:01<00:00, 265.30it/s]
100%|█████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 871.08it/s]


Epoch [2], Train Loss : [0.36436] Val Loss : [0.35884] Val AUC : [0.93616]


100%|█████████████████████████████████████████████████████████████| 462/462 [00:01<00:00, 270.36it/s]
100%|█████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 853.40it/s]


Epoch [3], Train Loss : [0.31575] Val Loss : [0.29110] Val AUC : [0.95032]


100%|█████████████████████████████████████████████████████████████| 462/462 [00:01<00:00, 274.46it/s]
100%|█████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 818.35it/s]


Epoch [4], Train Loss : [0.28430] Val Loss : [0.27155] Val AUC : [0.95696]


100%|█████████████████████████████████████████████████████████████| 462/462 [00:01<00:00, 268.26it/s]
100%|█████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 864.56it/s]

Epoch [5], Train Loss : [0.26676] Val Loss : [0.25988] Val AUC : [0.95934]





## Inference

In [None]:
# 학습 루프
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for mfcc, label in train_loader:
        mfcc, label = mfcc.float(), label.float()
        optimizer.zero_grad()
        output = model(mfcc)
        loss = criterion(output.squeeze(), label)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# 평가 루프
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for mfcc, label in val_loader:
        mfcc, label = mfcc.float(), label.float()
        output = model(mfcc)
        predicted = (output.squeeze() > 0.5).float()
        total += label.size(0)
        correct += (predicted == label).sum().item()
    
    accuracy = 100 * correct / total
    print(f'Validation Accuracy: {accuracy:.2f}%')


In [64]:
test = pd.read_csv('./test.csv')
test_mfcc = get_mfcc_feature(test, False)
test_dataset = CustomDataset(test_mfcc, None)
test_loader = DataLoader(
    test_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=False
)
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for features in tqdm(iter(test_loader)):
            features = features.float().to(device)
            
            probs = model(features)

            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions
preds = inference(infer_model, test_loader, device)

50000it [15:33, 53.59it/s]
100%|█████████████████████████████████████████████████████████████| 521/521 [00:01<00:00, 262.26it/s]


## Submission

In [65]:
submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = preds
submit.head()
submit.to_csv('./submit_data_1.csv', index=False)

In [None]:
submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = preds
submit.head()
submit.to_csv('./submit_data_1.csv', index=False)

In [72]:
print(inspect.signature(DataLoader))

(dataset: torch.utils.data.dataset.Dataset[+T_co], batch_size: Optional[int] = 1, shuffle: Optional[bool] = None, sampler: Union[torch.utils.data.sampler.Sampler, Iterable, NoneType] = None, batch_sampler: Union[torch.utils.data.sampler.Sampler[Sequence], Iterable[Sequence], NoneType] = None, num_workers: int = 0, collate_fn: Optional[Callable[[List[~T]], Any]] = None, pin_memory: bool = False, drop_last: bool = False, timeout: float = 0, worker_init_fn: Optional[Callable[[int], NoneType]] = None, multiprocessing_context=None, generator=None, *, prefetch_factor: int = 2, persistent_workers: bool = False, pin_memory_device: str = '')
