In [4]:
import os
import zipfile
import librosa
import numpy as np
import pandas as pd
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification,Wav2Vec2ForCTC
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader


In [5]:

# Path configuration
dataset_path = './output'
audio_path = os.path.join(dataset_path, 'audio')

# Load the metadata table
metadata = pd.read_csv(os.path.join(dataset_path, '314_TRANSCRIPT.csv'))

# Choose the device before loading the model so the cell also runs on
# CPU-only machines (a hard-coded .to('cuda') raises when CUDA is unavailable).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the Wav2Vec2 processor and model
processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
# NOTE(review): this loads a CTC head, while num_labels=2 and the scalar
# `label` produced by DepressionDataset suggest binary classification.
# Wav2Vec2ForSequenceClassification (already imported) looks like the
# intended class — confirm before training.
model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean", num_labels = 2).to(device)

# Custom dataset class
class DepressionDataset(Dataset):
    """Map-style dataset that yields one processed audio clip and its label.

    Args:
        metadata: DataFrame with a 'file' column (audio file name without the
            '.wav' extension) and a 'label' column.
        processor: Callable invoked as
            ``processor(waveform, sampling_rate=..., return_tensors="pt", padding=True)``
            whose result exposes ``input_values`` and ``attention_mask``.
        audio_path: Directory that contains the '.wav' files.
    """

    def __init__(self, metadata, processor, audio_path):
        self.metadata = metadata
        self.processor = processor
        self.audio_path = audio_path

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load, resample and featurize the clip at positional index ``idx``.

        Returns:
            dict with 'input_values', 'attention_mask' and 'labels' tensors.
        """
        # Fetch the row once instead of running a separate iloc lookup per column.
        row = self.metadata.iloc[idx]
        file_path = os.path.join(self.audio_path, row['file'] + '.wav')
        label = row['label']
        # librosa resamples to 16 kHz on load.
        y, sr = librosa.load(file_path, sr=16000)
        inputs = self.processor(y, sampling_rate=sr, return_tensors="pt", padding=True)
        # Drop only the leading axis (assumed to be a batch axis of size 1 —
        # TODO confirm for the processor in use). A bare squeeze() would also
        # remove the time axis of a one-sample clip and return a 0-d tensor.
        return {
            'input_values': inputs.input_values.squeeze(0),
            'attention_mask': inputs.attention_mask.squeeze(0),
            'labels': torch.tensor(label),
        }

# Build the dataset and split it 80/20 into train and test subsets
dataset = DepressionDataset(metadata, processor, audio_path)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Seed the split so the train/test partition is identical after a kernel restart.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)


def pad_collate(batch):
    """Collate dataset items of differing length into one padded batch.

    The default collate function stacks tensors and raises when the clips in
    a batch have different numbers of samples. Here each waveform is
    right-padded with zeros to the longest clip in the batch, and the
    attention mask is padded with zeros so padded positions are marked.

    Args:
        batch: list of dicts with 'input_values', 'attention_mask' and
            'labels' tensors, as produced by DepressionDataset.__getitem__.

    Returns:
        dict with the same keys, each holding one batched tensor.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence
    return {
        'input_values': pad_sequence(
            [item['input_values'] for item in batch], batch_first=True, padding_value=0.0
        ),
        'attention_mask': pad_sequence(
            [item['attention_mask'] for item in batch], batch_first=True, padding_value=0
        ),
        'labels': torch.stack([item['labels'] for item in batch]),
    }


train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pad_collate)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=pad_collate)


Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRA