# 사전 환경 만들기

## 구글 드라이브 마운트

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 필요한 라이브러리 설치 및 임포트

In [2]:
!pip install fsspec==2024.10.0
!pip install transformers datasets

import os
import torch
import torchaudio
import torchaudio.transforms as T
import numpy as np
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from datasets import Dataset
!pip install evaluate
from evaluate import load as load_metric


Collecting fsspec==2024.10.0
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2024.10.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.9.0
    Uninstalling fsspec-2024.9.0:
      Successfully uninstalled fsspec-2024.9.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.1.0 requires fsspec[http]<=2024.9.0,>=2023.1.0, but you have fsspec 2024.10.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2024.10.0
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2024.9.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing install

## GPU 사용 가능 여부 확인

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# 데이터 준비

## 데이터셋 경로 설정 (경로는 사용자마다 다름)

In [4]:
real_path = "/content/drive/MyDrive/Colab Notebooks/wav2vec classification/real"  # real 폴더 경로
fake_path = "/content/drive/MyDrive/Colab Notebooks/wav2vec classification/fake"  # fake 폴더 경로

## 데이터셋 불러오기 및 라벨링

In [5]:
# 데이터셋 불러오기 및 라벨링
def load_data(folder_path, label, max_files=None):
    files = []
    labels = []
    for i, file in enumerate(os.listdir(folder_path)):
        if file.endswith('.ogg'):
            files.append(os.path.join(folder_path, file))
            labels.append(label)
        if max_files is not None and i + 1 >= max_files:
            break
    return files, labels

# 데이터셋을 적은 수의 샘플로 불러오기 (각 폴더에서 최대 5개 파일만 사용)
real_files, real_labels = load_data(real_path, 1, max_files=5)  # 진짜 음성 라벨: 1, 최대 5개 파일
fake_files, fake_labels = load_data(fake_path, 0, max_files=5)  # 가짜 음성 라벨: 0, 최대 5개 파일

all_files = real_files + fake_files
all_labels = real_labels + fake_labels

## 학습용 데이터와 검증용 데이터로 나누기

In [6]:
train_files, test_files, train_labels, test_labels = train_test_split(all_files, all_labels, test_size=0.2, random_state=42)


## Wav2Vec2 모델과 프로세서 불러오기

In [7]:
model_name = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 데이터셋 생성

In [8]:
# Resampler 정의 (32,000 Hz -> 16,000 Hz)
resampler = T.Resample(orig_freq=32000, new_freq=16000)

# 데이터셋 생성 및 처리 - 재샘플링 포함
def preprocess(file_path):
    speech, sample_rate = torchaudio.load(file_path)
    if sample_rate != 16000:
        speech = resampler(speech)
    return {'speech': speech.squeeze().numpy()}

data = {'file_path': train_files, 'label': train_labels}
dataset = Dataset.from_dict(data)
dataset = dataset.map(lambda example: {
    'input_values': processor(preprocess(example['file_path'])['speech'], sampling_rate=16000, return_tensors="pt", padding=True).input_values.squeeze(),
    'label': example['label']
})

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

# 모델 만들고 학습하기

## 1. 양자화 (Quantization)

In [9]:
## 양자화는 나중에

## 2. Low-Rank Approximation

In [10]:
def apply_low_rank_approximation(model, rank=16):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            # 저랭크 근사 적용
            weight = module.weight.data.cpu().numpy()
            U, S, Vt = np.linalg.svd(weight, full_matrices=False)
            U = U[:, :rank]
            S = np.diag(S[:rank])
            Vt = Vt[:rank, :]
            new_weight = torch.tensor(U @ S @ Vt, dtype=module.weight.dtype)
            module.weight.data = new_weight.to(module.weight.device)

apply_low_rank_approximation(model)


## 모델 학습을 위한 파라미터 설정

In [11]:
from torch.utils.data import DataLoader
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=10,
    save_total_limit=2,
    report_to='none'  # W&B 사용 비활성화
)




## DataLoader 준비

In [12]:
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


## 학습 루프 구현

In [13]:
import numpy as np
from transformers import Trainer

metric = load_metric("accuracy")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return metric.compute(predictions=preds, references=labels)

# DataCollatorWithPadding을 사용하여 데이터의 길이를 맞춤
data_collator = DataCollatorWithPadding(tokenizer=processor, padding=True)

# 트레이너 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    tokenizer=processor,
    compute_metrics=compute_metrics
)



## 모델 학습

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.687864,0.625
2,No log,0.662841,0.875
3,No log,0.652606,0.875


TrainOutput(global_step=3, training_loss=0.6820816993713379, metrics={'train_runtime': 12.7311, 'train_samples_per_second': 1.885, 'train_steps_per_second': 0.236, 'total_flos': 1045858480128000.0, 'train_loss': 0.6820816993713379, 'epoch': 3.0})

# 모델 평가

In [15]:
# 테스트 데이터셋 평가함수 생성
def evaluate(model, files, labels):
    model.eval()
    correct = 0
    total = len(files)

    with torch.no_grad():
        for i, file_path in enumerate(files):
            speech, sample_rate = torchaudio.load(file_path)

            # 샘플링 속도가 16,000이 아닌 경우 재샘플링 적용
            if sample_rate != 16000:
                speech = resampler(speech)

            # 모델의 입력 형식에 맞게 전처리
            inputs = processor(speech.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True).to(device)

            logits = model(**inputs).logits
            prediction = torch.argmax(logits, dim=-1).item()

            if prediction == labels[i]:
                correct += 1

    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100:.2f}%")



# 평가
evaluate(model, test_files, test_labels)


Accuracy: 50.00%
