In [None]:
# @title Setup

competition = "audio understanding"  # @param
# @markdown ---

from google.colab import userdata
import json

# Get the Kaggle credentials from Colab's userdata
username = userdata.get("KAGGLE_USER")
key = userdata.get("KAGGLE_KEY")

# Echo the credentials into the kaggle.json file
!mkdir -p ~/.kaggle
!echo '{{"username":"{username}","key":"{key}"}}' > ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

competition_name = "audio-understanding"
dir_name = "audio_understand"

!kaggle competitions download -c {competition_name}
!mkdir {dir_name}
!unzip /content/{competition_name}.zip -d {dir_name}
!rm -rf audio-understanding.zip

Downloading audio-understanding.zip to /content
 68% 270M/395M [00:00<00:00, 1.41GB/s]
100% 395M/395M [00:00<00:00, 1.42GB/s]
Archive:  /content/audio-understanding.zip
  inflating: audio_understand/speechs/speechs/test/015a21ca-7e23-4b70-93e6-da8a1bb8eaab.wav  
  inflating: audio_understand/speechs/speechs/test/02164b42-47ac-4d45-aa3a-0288ede4380e.wav  
  inflating: audio_understand/speechs/speechs/test/0225a06d-7949-49a6-84a0-0f4c2d9005aa.wav  
  inflating: audio_understand/speechs/speechs/test/02270177-6507-4730-b034-404aea8f8a5c.wav  
  inflating: audio_understand/speechs/speechs/test/02c4081a-4df6-45bc-a24d-58efa0d70c1b.wav  
  inflating: audio_understand/speechs/speechs/test/03d93186-3c38-4e6e-925f-152d489ae8ef.wav  
  inflating: audio_understand/speechs/speechs/test/05d3d878-5ecf-4dd8-9663-ccfa7afaa9da.wav  
  inflating: audio_understand/speechs/speechs/test/0620b87a-52ed-46ea-b0bb-8d722b46e55a.wav  
  inflating: audio_understand/speechs/speechs/test/085f8dec-23c1-433c-a3a6-007f

In [None]:
import pandas as pd
import random
import os

import torchaudio
import torchaudio.transforms as T

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset, Subset


from transformers import WhisperProcessor, WhisperModel
from tqdm import tqdm

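## Overview

Inference-only notebook: a Whisper encoder (`openai/whisper-large-v3`) with a small attention-pooled classification head scores each test recording against six labels, and the thresholded predictions are written to a submission CSV.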

In [None]:
# --- Model -----------------------------------------------------------------
# Hugging Face checkpoint used for both the processor and the encoder.
MODEL_NAME = "openai/whisper-large-v3"

# --- Test data locations -----------------------------------------------------
TEST_CSV_PATH = "/content/audio_understand/test.csv"
TEST_AUDIO_DIR = "/content/audio_understand/speechs/speechs/test/"

# --- Inference settings ------------------------------------------------------
NUM_LABELS = 6         # width of the classifier output
SAMPLING_RATE = 16000  # Hz; audio with a different rate is resampled to this
THRESHOLD = 0.4        # a label is predicted when its sigmoid score exceeds this

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Pretrained feature extractor/tokenizer bundle and the Whisper model itself.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
whisper_model = WhisperModel.from_pretrained(MODEL_NAME)

# Model WhisperClassifier
class WhisperClassifier(nn.Module):
    """Whisper-encoder classifier over two feature chunks of one recording.

    Each chunk is encoded with the shared Whisper encoder, the two hidden-state
    sequences are concatenated along the time axis, refined by a small
    Transformer encoder, attention-pooled into one vector and projected to
    `num_labels` logits.

    Args:
        whisper_model: object exposing `.encoder` (the Whisper encoder) and,
            ideally, `.config.d_model`. NOTE: the default is bound when this
            `class` statement runs, so the global `whisper_model` must already
            be defined at that point.
        num_labels: number of output logits.

    Submodule names and creation order are intentionally left as they were so
    that existing `state_dict` checkpoints keep the same keys.
    """

    def __init__(self, whisper_model=whisper_model, num_labels=NUM_LABELS):
        super().__init__()
        self.encoder = whisper_model.encoder
        # Take the hidden width from the model config instead of hard-coding
        # it, so a different Whisper checkpoint does not cause a shape
        # mismatch. 1280 (the previously hard-coded value) is the fallback when
        # the object exposes no `config.d_model`.
        d_model = getattr(getattr(whisper_model, "config", None), "d_model", 1280)
        self.encoder_block = nn.TransformerEncoder(
                        nn.TransformerEncoderLayer(d_model=d_model, nhead=8, dropout=0.1, batch_first=True, activation='gelu'),
                        num_layers=2
                        )
        # Kept as nn.Sequential so parameter names stay `classifier.0.*`.
        self.classifier = nn.Sequential(
            nn.Linear(d_model, num_labels),
        )
        # Produces one scalar score per time step for attention pooling.
        self.weight_proj = nn.Linear(d_model, 1)

    def forward(self, input_features_1, input_features_2):
        """Return raw logits (no sigmoid applied) for a batch of chunk pairs.

        Args:
            input_features_1: Whisper input features of the first chunk.
            input_features_2: Whisper input features of the second chunk.

        Returns:
            Tensor of logits with `num_labels` values per batch item.
        """
        outputs_1 = self.encoder(input_features=input_features_1).last_hidden_state
        outputs_2 = self.encoder(input_features=input_features_2).last_hidden_state
        # Join the two chunks along dim 1 (the sequence axis, batch_first=True).
        cat_outputs = torch.cat([outputs_1, outputs_2], dim=1)
        x_attn = self.encoder_block(cat_outputs)
        # Softmax over the sequence axis turns per-step scores into weights
        # that sum to 1, then the weighted sum collapses the sequence.
        weights = torch.softmax(self.weight_proj(x_attn), dim=1)
        pooled = (x_attn * weights).sum(dim=1)
        logits = self.classifier(pooled)
        return logits


# Data loader for Test Data
class WhisperTestDataset(Dataset):
    """Test-set dataset yielding two Whisper feature chunks per recording.

    The CSV at `csv_path` must have an `id` column; the audio for each row is
    read from `<audio_dir>/<id>.wav`. Each recording is resampled to
    `sampling_rate` if needed, down-mixed to mono and cut into two consecutive
    windows of `CHUNK_SECONDS` seconds, each passed through `processor`.
    Audio beyond the second window is ignored.

    Args:
        csv_path: path to the test CSV.
        audio_dir: directory containing the `.wav` files.
        processor: callable invoked as
            `processor(waveform, sampling_rate=..., return_tensors="pt")` whose
            result exposes `.input_features`.
        sampling_rate: target sampling rate in Hz.
    """

    # Length of one chunk in seconds (30 s * 16000 Hz = 480000 samples).
    CHUNK_SECONDS = 30

    def __init__(self, csv_path, audio_dir, processor, sampling_rate=16000):
        self.df = pd.read_csv(csv_path)
        self.audio_dir = audio_dir
        self.processor = processor
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Number of rows in the test CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return `(id, {"input_features_1": ..., "input_features_2": ...})`."""
        row = self.df.iloc[idx]
        audio_path = os.path.join(self.audio_dir, row['id'] + '.wav')
        waveform, sr = torchaudio.load(audio_path)
        if sr != self.sampling_rate:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.sampling_rate)(waveform)
        # Down-mix to a 1-D mono signal. The previous `squeeze(0)` only worked
        # for single-channel files: with 2+ channels it was a no-op and the
        # slices below cut along the channel axis instead of time. For mono
        # input the mean over one channel is the same values as squeeze(0).
        waveform = waveform.mean(dim=0)
        chunk_samples = self.CHUNK_SECONDS * self.sampling_rate
        chunk_features = []
        for chunk_idx in range(2):
            start = chunk_idx * chunk_samples
            segment = waveform[start:start + chunk_samples]
            # NOTE(review): for recordings shorter than one chunk the second
            # segment is empty; this relies on the processor padding it.
            inputs = self.processor(segment, sampling_rate=self.sampling_rate, return_tensors="pt")
            # Drop the batch dimension the processor adds for a single input.
            chunk_features.append(inputs.input_features.squeeze(0))
        return row['id'], {"input_features_1": chunk_features[0], "input_features_2": chunk_features[1]}

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

In [None]:
# NOTE(review): no visible cell loads fine-tuned weights, so `encoder_block`,
# `classifier` and `weight_proj` would be randomly initialised (and unseeded),
# making the predictions meaningless and non-reproducible. Set CHECKPOINT_PATH
# to the trained `state_dict` file to load it here.
CHECKPOINT_PATH = None

model = WhisperClassifier(num_labels=NUM_LABELS).to(DEVICE)
if CHECKPOINT_PATH is not None:
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=DEVICE))
else:
    print("WARNING: CHECKPOINT_PATH is not set; the classification head is randomly initialised.")

# Submission columns: one boolean column per label, in model-output order.
label_columns = [
    'กล่าวสวัสดี',
    'แนะนำชื่อและนามสกุล',
    'บอกประเภทใบอนุญาตและเลขที่ใบอนุญาตที่ยังไม่หมดอายุ',
    'บอกวัตถุประสงค์ของการเข้าพบครั้งนี้',
    'เน้นประโยชน์ว่าลูกค้าได้ประโยชน์อะไรจากการเข้าพบครั้งนี้',
    'บอกระยะเวลาที่ใช้ในการเข้าพบ'
]
# Fail before the long inference loop if the head width and columns disagree.
assert len(label_columns) == NUM_LABELS, "label_columns must match NUM_LABELS"

# Test Loader
test_dataset = WhisperTestDataset(TEST_CSV_PATH, TEST_AUDIO_DIR, processor, sampling_rate=SAMPLING_RATE)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Inference
model.eval()
results = []

with torch.no_grad():
    for audio_ids, input_features in tqdm(test_loader):
        input_features_1 = input_features['input_features_1'].to(DEVICE)
        input_features_2 = input_features['input_features_2'].to(DEVICE)
        logits = model(input_features_1, input_features_2)
        probs = torch.sigmoid(logits).cpu().numpy()
        # The comparison already yields booleans; one inner list per batch item.
        preds = (probs > THRESHOLD).tolist()
        # Iterate over the batch so the loop is correct for any batch_size,
        # not only batch_size=1.
        for audio_id, row_preds in zip(audio_ids, preds):
            results.append([audio_id] + row_preds)

columns = ['id'] + label_columns
submission_df = pd.DataFrame(results, columns=columns)
assert len(submission_df) == len(test_dataset), "one prediction row per test item expected"

100%|██████████| 300/300 [06:54<00:00,  1.38s/it]


In [None]:
# Persist the predictions as the submission file; the DataFrame's positional
# index is not a submission column, so it is left out.
submission_path = 'LuangPuuThree.csv'
submission_df.to_csv(submission_path, index=False)