In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the archives read below live under it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Data Preparation

In [None]:
import zipfile
import os

# Source archives on Google Drive and the local folders they are unpacked into
train_zip_path = '/content/drive/MyDrive/YoungCon/train.zip'
test_zip_path = '/content/drive/MyDrive/YoungCon/test.zip'
train_extract_path = '/content/train'
test_extract_path = '/content/test'
labels_folder = '/content/labels'

# Unpack train.zip and test.zip; each archive is closed as soon as it has been extracted
for zip_path, extract_path in (
    (train_zip_path, train_extract_path),
    (test_zip_path, test_extract_path),
):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# Create the labels folder (this cell only creates it; nothing is moved here).
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(labels_folder, exist_ok=True)

In [None]:
# Move targets.tsv out of the extracted train folder into the labels folder.
# Guarded so that re-running the cell after the move is a no-op rather than a FileNotFoundError.
targets_src = '/content/train/train/targets.tsv'
targets_dst = os.path.join(labels_folder, 'targets.tsv')
if os.path.exists(targets_src):
    os.rename(targets_src, targets_dst)
elif not os.path.exists(targets_dst):
    raise FileNotFoundError(f"targets.tsv not found at {targets_src} or {targets_dst}")

In [None]:
import os
import pandas as pd

def check_files_in_targets(train_folder, targets_file):
    """
    Check if every file in the train folder is listed in the targets file.

    The targets file is read as a tab-separated file WITHOUT a header row (the same
    way the later cells of this notebook read it). Its first column holds the audio
    ids; ".wav" is appended to each id before it is compared with the file names.

    Args:
        train_folder (str): Path to the train folder containing audio files.
        targets_file (str): Path to the targets.tsv file.

    Returns:
        bool: True if all files in the train folder are listed in the targets file, False otherwise.
        list: List of files in the train folder not listed in the targets file.
    """
    # header=None is essential: with pandas' default header inference the first record
    # would be consumed as column names and its audio file reported as "not listed".
    targets_df = pd.read_csv(targets_file, sep='\t', header=None)

    # Build the expected file names once, as a set, so each lookup below is O(1)
    target_files = {f"{audio_id}.wav" for audio_id in targets_df.iloc[:, 0]}

    # Regular files only; sub-directories of the train folder are ignored
    train_files = [f for f in os.listdir(train_folder) if os.path.isfile(os.path.join(train_folder, f))]

    missing_files = [f for f in train_files if f not in target_files]
    return not missing_files, missing_files

# Run the consistency check on the extracted training data
train_folder = '/content/train/train'
targets_file = '/content/labels/targets.tsv'

all_files_present, missing_files = check_files_in_targets(train_folder, targets_file)
if not all_files_present:
    print("The following files in the train folder are not listed in the targets file:")
    for unlisted_name in missing_files:
        print(unlisted_name)
else:
    print("All files in the train folder are listed in the targets file.")


The following files in the train folder are not listed in the targets file:
5d1f7e43366513a1d0a6ec5640c3dc24.wav


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torch

# Function to delete a specific file
def delete_file(file_path):
    """Remove `file_path` from disk if it exists and print what happened."""
    if not os.path.exists(file_path):
        print(f"File {file_path} not found.")
        return
    os.remove(file_path)
    print(f"Deleted file: {file_path}")

# Load the targets first so the deletion below can be checked against them.
# header=None: targets.tsv is read without a header row (as the later cells do); with
# pandas' default header inference the first record is swallowed and its file looks unlabeled.
targets_path = '/content/labels/targets.tsv'
labels_df = pd.read_csv(targets_path, sep='\t', header=None, names=['audio_id', 'true_label'])
labels_df['audio_id'] = labels_df['audio_id'].astype(str) + '.wav'

# Delete the flagged file from the train folder only if it really has no label
train_audio_dir = '/content/train/train'
file_to_delete = '5d1f7e43366513a1d0a6ec5640c3dc24.wav'
if labels_df['audio_id'].eq(file_to_delete).any():
    print(f"{file_to_delete} is listed in the targets file; keeping it.")
else:
    delete_file(os.path.join(train_audio_dir, file_to_delete))

Deleted file: /content/train/train/5d1f7e43366513a1d0a6ec5640c3dc24.wav


# Pretrained model validation

In [None]:
import os
import random
from glob import glob
from typing import List, Optional, Union, Dict

import tqdm
import torch
import torchaudio
import numpy as np
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    Wav2Vec2Processor
)


In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """
    Dataset of audio files served as fixed-length, single-channel waveforms.

    Every item is loaded with torchaudio, averaged down to one channel, resampled to
    `sampling_rate` when the file was stored at another rate, and then zero-padded or
    truncated to exactly `max_audio_len * sampling_rate` samples.
    """

    def __init__(
        self,
        dataset: List,
        basedir: Optional[str] = None,
        sampling_rate: int = 16000,
        max_audio_len: int = 5,
    ):
        """
        Args:
            dataset: audio file paths (relative to `basedir` when it is given).
            basedir: optional folder that is joined in front of every entry.
            sampling_rate: sample rate every waveform is brought to.
            max_audio_len: clip length; multiplied by `sampling_rate` to get the
                number of samples each item is padded/truncated to.
        """
        self.dataset = dataset
        self.basedir = basedir
        self.sampling_rate = sampling_rate
        self.max_audio_len = max_audio_len

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Load item `index` and return {"input_values": 1-D numpy waveform, "attention_mask": None}."""
        entry = self.dataset[index]
        filepath = entry if self.basedir is None else os.path.join(self.basedir, entry)

        waveform, source_rate = torchaudio.load(filepath)

        # More than one channel: average them into a single channel
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if source_rate != self.sampling_rate:
            resampler = torchaudio.transforms.Resample(source_rate, self.sampling_rate)
            waveform = resampler(waveform)

        # Bring the clip to the fixed target length: zero-pad at the end or cut the tail
        target_len = self.max_audio_len * self.sampling_rate
        shortfall = target_len - waveform.shape[1]
        if shortfall > 0:
            waveform = torch.cat([waveform, torch.zeros(1, shortfall)], dim=1)
        else:
            waveform = waveform[:, :target_len]

        return {"input_values": waveform.squeeze().numpy(), "attention_mask": None}


class CollateFunc:
    """
    Collate function that turns a list of dataset items into one model-ready batch
    by running their waveforms through `processor`.
    """

    def __init__(
        self,
        processor: Wav2Vec2Processor,
        padding: Union[bool, str] = True,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: bool = True,
        sampling_rate: int = 16000,
        max_length: Optional[int] = None,
    ):
        """Store the processor and the options that are forwarded to it on every call."""
        self.processor = processor
        self.sampling_rate = sampling_rate
        self.padding = padding
        self.max_length = max_length
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_attention_mask = return_attention_mask

    def __call__(self, batch: List[Dict[str, np.ndarray]]):
        """
        Args:
            batch: dataset items; only their "input_values" entry is used.

        Returns:
            dict with "input_values" and "attention_mask"; the mask is None when
            the collator was built with return_attention_mask=False.
        """
        waveforms = [sample["input_values"] for sample in batch]

        features = self.processor(
            waveforms,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask
        )

        mask = features.attention_mask if self.return_attention_mask else None
        return {"input_values": features.input_values, "attention_mask": mask}

In [None]:
def predict(test_dataloader, model, device: torch.device):
    """
    Predict the class of the audio

    Moves `model` to `device`, puts it in eval mode and runs every batch of
    `test_dataloader` through it with gradients disabled.

    Args:
        test_dataloader: iterable of batches; each batch is indexed with
            "input_values" (tensor) and "attention_mask" (tensor or None).
        model: called as model(input_values, attention_mask=...); the result must
            expose a `.logits` tensor.
        device: device the model and the batches are moved to.

    Returns:
        list: predicted class index for every sample, in dataloader order.
    """
    model.to(device)
    model.eval()
    preds = []

    with torch.no_grad():
        for batch in tqdm.tqdm(test_dataloader):
            input_values = batch['input_values'].to(device)

            # The mask is optional: a collator built with return_attention_mask=False
            # hands over None, which must not be moved to a device.
            attention_mask = batch['attention_mask']
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            logits = model(input_values, attention_mask=attention_mask).logits

            # argmax directly on the logits: softmax is monotonic, so it cannot change the winner
            preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())

    return preds


def get_gender(model_name_or_path: str, audio_paths: List[str], label2id: Dict, id2label: Dict, device: torch.device):
    """
    Load the two-class audio classifier from `model_name_or_path` and predict a
    class index for every file in `audio_paths`.

    Args:
        model_name_or_path: checkpoint location for both the feature extractor and the model.
        audio_paths: audio files to classify.
        label2id: class-name to class-index mapping passed to the model config.
        id2label: class-index to class-name mapping passed to the model config.
        device: device inference runs on.

    Returns:
        The list of predicted class indices produced by `predict`, in `audio_paths` order.
    """
    extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
    classifier = AutoModelForAudioClassification.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        use_safetensors=True,
        num_labels=2,
        label2id=label2id,
        id2label=id2label,
    )

    # 5-second clips, batches of 16, file order preserved so predictions line up with audio_paths
    loader = DataLoader(
        dataset=CustomDataset(audio_paths, max_audio_len=5),
        batch_size=16,
        collate_fn=CollateFunc(processor=extractor, padding=True, sampling_rate=16000),
        shuffle=False,
        num_workers=2
    )

    return predict(test_dataloader=loader, model=classifier, device=device)

def get_audio_file_paths(directory, extensions=('.wav', '.mp3', '.flac')):
    """
    Recursively collect the absolute paths of the audio files under `directory`.

    Args:
        directory (str): root folder that is walked.
        extensions: iterable of file-name suffixes to accept; matching ignores case.

    Returns:
        list: absolute paths in sorted order. os.walk yields entries in arbitrary
        order, so sorting is what makes the result identical from run to run.
    """
    # str.endswith accepts a tuple of suffixes, so one call checks all of them
    suffixes = tuple(ext.lower() for ext in extensions)
    audio_files = [
        os.path.abspath(os.path.join(root, name))
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith(suffixes)
    ]
    audio_files.sort()
    return audio_files

In [None]:
# Checkpoint location and the folder whose audio files are scored
model_name_or_path = "/content"
directory = '/content/train/train'
# Inference works on a list of absolute audio paths
audio_paths = get_audio_file_paths(directory)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Class-name <-> class-index mappings handed to the model config
label2id = {"female": 1, "male": 0}
id2label = {index: name for name, index in label2id.items()}

num_labels = 2

In [None]:
num_labels = 2

# Feature extractor and classifier are both loaded from the same checkpoint location
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
model_kwargs = dict(
    pretrained_model_name_or_path=model_name_or_path,
    use_safetensors=True,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
model = AutoModelForAudioClassification.from_pretrained(**model_kwargs)

In [None]:
# Fixed 5-second clips, batched in file order (shuffle is off so predictions line up with audio_paths)
test_dataset = CustomDataset(audio_paths, max_audio_len=5)
data_collator = CollateFunc(processor=feature_extractor, padding=True, sampling_rate=16000)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
    collate_fn=data_collator,
)

In [None]:
# Run the pretrained model over every training clip
preds = predict(test_dataloader, model, device)

100%|██████████| 871/871 [13:45<00:00,  1.05it/s]


In [None]:
# Persist the raw predictions. np.savetxt writes plain text, so despite the ".npy"
# suffix this file has to be read back with np.loadtxt, not np.load.
np.savetxt("preds_train.npy", np.asarray(preds))

In [None]:
import os
import pandas as pd

# Ground-truth labels keyed by audio file name ("<audio_id>.wav")
targets_file_path = "/content/labels/targets.tsv"
targets_df = pd.read_csv(targets_file_path, sep='\t', header=None, names=['audio_id', 'true_label'])
targets_df['audio_id'] = targets_df['audio_id'] + '.wav'
true_labels_dict = dict(zip(targets_df['audio_id'], targets_df['true_label']))


correct_predictions = 0
# Count only files that have a ground-truth label; dividing by len(audio_paths) would
# understate the accuracy whenever some audio files are missing from targets.tsv.
total_predictions = 0
output_data = []

for audio_path, pred in zip(audio_paths, preds):
    audio_id = os.path.basename(audio_path)
    true_label = true_labels_dict.get(audio_id)
    if true_label is None:
        continue
    # Swap 0 <-> 1 so the prediction uses the same encoding as the targets file
    pred = 1 if pred == 0 else 0
    total_predictions += 1
    if pred == true_label:
        correct_predictions += 1
    # Output rows carry the audio id without the ".wav" extension
    audio_id_without_extension = os.path.splitext(audio_id)[0]
    output_data.append([audio_id_without_extension, pred])

# Calculate accuracy (0.0 when nothing could be scored, instead of a ZeroDivisionError)
accuracy = correct_predictions / total_predictions if total_predictions else 0.0
print(f"Total predictions: {total_predictions}")
print(f"Correct predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.2f}")

output_file_path = "/content/output.tsv"
output_df = pd.DataFrame(output_data, columns=['audio_id', 'pred'])
output_df.to_csv(output_file_path, sep='\t', index=False, header=False)
print(f"Output file saved to: {output_file_path}")

Total predictions: 13935
Correct predictions: 13604
Accuracy: 0.98
Output file saved to: /content/output.tsv


## Inference on the test set

In [None]:
# Checkpoint location and the folder holding the test audio
model_name_or_path = "/content"
directory = '/content/test/test'
# Inference works on a list of absolute audio paths
audio_paths = get_audio_file_paths(directory)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Class-name <-> class-index mappings handed to the model config
label2id = {"female": 1, "male": 0}
id2label = {index: name for name, index in label2id.items()}

num_labels = 2

In [None]:
num_labels = 2

# Feature extractor and classifier are both loaded from the same checkpoint location
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
model_kwargs = dict(
    pretrained_model_name_or_path=model_name_or_path,
    use_safetensors=True,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
model = AutoModelForAudioClassification.from_pretrained(**model_kwargs)

Some weights of the model checkpoint at /content were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at /content and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']

In [None]:
# Fixed 5-second clips, batched in file order (shuffle is off so predictions line up with audio_paths)
test_dataset = CustomDataset(audio_paths, max_audio_len=5)
data_collator = CollateFunc(processor=feature_extractor, padding=True, sampling_rate=16000)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=2,
    collate_fn=data_collator,
)

In [None]:
# Run the pretrained model over every test clip
preds = predict(test_dataloader, model, device)

100%|██████████| 214/214 [03:30<00:00,  1.02it/s]


In [None]:
# Persist the raw predictions. np.savetxt writes plain text, so despite the ".npy"
# suffix this file has to be read back with np.loadtxt, not np.load.
np.savetxt("preds_test.npy", np.asarray(preds))

In [None]:
import os
import pandas as pd

# Turn every test prediction into an output row: [audio id without extension, label with 0 and 1 swapped]
output_data = []
for audio_path, pred in zip(audio_paths, preds):
    audio_stem, _extension = os.path.splitext(os.path.basename(audio_path))
    output_data.append([audio_stem, 1 if pred == 0 else 0])

# Headerless, tab-separated output file
output_file_path = "/content/output4.tsv"
output_df = pd.DataFrame(output_data, columns=['audio_id', 'pred'])
output_df.to_csv(output_file_path, sep='\t', index=False, header=False)

print(f"Output file saved to: {output_file_path}")

Output file saved to: /content/output4.tsv


# Training Loop

In [None]:
import os
import torch
import torchaudio
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import Wav2Vec2Processor, AutoModelForAudioClassification, AutoFeatureExtractor, get_scheduler
from torch.optim import Adam
from typing import List, Optional, Union, Dict
import numpy as np
import tqdm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from torch.cuda.amp import autocast, GradScaler

# Seed the torch and NumPy random number generators so repeated runs behave the same
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

class CustomDataset(Dataset):
    """
    Labelled audio dataset: fixed-length, single-channel waveforms plus one label each.

    `labels[i]` is returned together with the waveform loaded from `dataset[i]`,
    so the two lists must be in the same order.
    """

    def __init__(self, dataset: List, labels: List, basedir: Optional[str] = None, sampling_rate: int = 16000, max_audio_len: int = 5):
        """
        Args:
            dataset: audio file paths (relative to `basedir` when it is given).
            labels: one label per entry of `dataset`, matched by position.
            basedir: optional folder that is joined in front of every entry.
            sampling_rate: sample rate every waveform is brought to.
            max_audio_len: clip length; multiplied by `sampling_rate` to get the
                number of samples each item is padded/truncated to.
        """
        self.dataset = dataset
        self.labels = labels
        self.basedir = basedir
        self.sampling_rate = sampling_rate
        self.max_audio_len = max_audio_len

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Load item `index`; returns "input_values" (1-D numpy waveform), "attention_mask" (None) and "labels"."""
        entry = self.dataset[index]
        filepath = entry if self.basedir is None else os.path.join(self.basedir, entry)

        waveform, source_rate = torchaudio.load(filepath)

        # More than one channel: average them into a single channel
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if source_rate != self.sampling_rate:
            resampler = torchaudio.transforms.Resample(source_rate, self.sampling_rate)
            waveform = resampler(waveform)

        # Zero-pad at the end, or cut the tail, to reach the fixed target length
        target_len = self.max_audio_len * self.sampling_rate
        shortfall = target_len - waveform.shape[1]
        if shortfall > 0:
            waveform = torch.cat([waveform, torch.zeros(1, shortfall)], dim=1)
        else:
            waveform = waveform[:, :target_len]

        return {
            "input_values": waveform.squeeze().numpy(),
            "attention_mask": None,
            "labels": self.labels[index],
        }

class CollateFunc:
    """
    Collate function that runs the waveforms of a batch through `processor` and,
    when the items carry labels, attaches them as a long tensor.

    Labels are optional so the same collator serves both the labelled train/eval
    datasets and the unlabelled test dataset.
    """

    # The annotation is quoted because it is only a hint; any callable with the
    # processor's call signature (e.g. a feature extractor) is accepted.
    def __init__(self, processor: "Wav2Vec2Processor", padding: Union[bool, str] = True, pad_to_multiple_of: Optional[int] = None,
                 return_attention_mask: bool = True, sampling_rate: int = 16000, max_length: Optional[int] = None):
        """Store the processor and the options that are forwarded to it on every call."""
        self.sampling_rate = sampling_rate
        self.processor = processor
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_attention_mask = return_attention_mask
        self.max_length = max_length

    def __call__(self, batch: List[Dict[str, np.ndarray]]):
        """
        Args:
            batch: dataset items with an "input_values" entry and, optionally, a "labels" entry.

        Returns:
            The processor's output, with a "labels" long tensor added when every
            item in the batch has a label.
        """
        input_values = [item["input_values"] for item in batch]

        features = self.processor(
            input_values,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask
        )

        # Unlabelled (inference) items have no "labels" key; requiring it raised KeyError
        # inside the DataLoader worker when the test set was collated.
        if all("labels" in item for item in batch):
            features["labels"] = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
        return features

def train(train_dataloader, model, optimizer, lr_scheduler, device, gradient_accumulation_steps):
    """
    Run one training epoch and return its average loss and accuracy.

    Args:
        train_dataloader: sized iterable of batches with "input_values",
            "attention_mask" and "labels" tensors.
        model: called as model(input_values, attention_mask=..., labels=...); the
            result must expose `.loss` and `.logits`.
        optimizer: optimizer stepped once per accumulation group.
        lr_scheduler: scheduler stepped right after every optimizer step.
        device: device the batches are moved to (torch.device or device string).
        gradient_accumulation_steps: number of batches whose gradients are summed
            before the optimizer steps.

    Returns:
        tuple: (average unscaled loss per batch, accuracy over all samples seen).
    """
    model.train()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    # Mixed precision is only used on CUDA; on CPU both helpers become pass-throughs
    use_amp = torch.device(device).type == "cuda"
    scaler = GradScaler(enabled=use_amp)
    num_batches = len(train_dataloader)

    # Gradients are cleared once here and then only after each optimizer step.
    # Clearing them at the top of every iteration would throw away the gradients
    # accumulated from the earlier batches of the group.
    optimizer.zero_grad()

    for batch_idx, batch in enumerate(tqdm.tqdm(train_dataloader)):
        # Tensors keep their own dtypes: autocast picks the reduced precision per op,
        # and the 0/1 attention mask must stay exact (float16 represents integers
        # exactly only up to 2048, far below the sample count of an audio clip).
        input_values = batch['input_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with autocast(enabled=use_amp):
            outputs = model(input_values, attention_mask=attention_mask, labels=labels)

        # Scale the loss so the summed gradients of a group equal the group average
        loss = outputs.loss / gradient_accumulation_steps
        scaler.scale(loss).backward()

        # Step at the end of every group, and also on the last batch so a trailing
        # partial group is not silently dropped.
        is_last_batch = batch_idx + 1 == num_batches
        if (batch_idx + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Report the unscaled loss so the average does not depend on the accumulation setting
        total_loss += outputs.loss.item()
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / num_batches
    accuracy = accuracy_score(all_labels, all_preds)
    return avg_loss, accuracy

def evaluate(eval_dataloader, model, device):
    """
    Evaluate the model on a labelled dataloader without updating it.

    Args:
        eval_dataloader: sized iterable of batches with "input_values",
            "attention_mask" and "labels" tensors.
        model: called as model(input_values, attention_mask=..., labels=...); the
            result must expose `.loss` and `.logits`.
        device: device the batches are moved to (torch.device or device string).

    Returns:
        tuple: (average loss per batch, accuracy over all samples).
    """
    model.eval()
    total_eval_loss = 0
    all_preds = []
    all_labels = []

    # Mixed precision only on CUDA; on CPU autocast is disabled
    use_amp = torch.device(device).type == "cuda"

    with torch.no_grad():
        for batch in tqdm.tqdm(eval_dataloader):
            # Keep the tensors' own dtypes: autocast handles precision per op, and the
            # 0/1 attention mask must stay exact (float16 represents integers exactly
            # only up to 2048, far below the sample count of an audio clip).
            input_values = batch['input_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast(enabled=use_amp):
                outputs = model(input_values, attention_mask=attention_mask, labels=labels)

            total_eval_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    accuracy = accuracy_score(all_labels, all_preds)
    return avg_eval_loss, accuracy

# Function to plot training and validation loss and accuracy
def plot_metrics(train_losses, val_losses, train_accuracies, val_accuracies):
    """
    Draw two side-by-side panels: loss per epoch and accuracy per epoch.

    Each panel shows the training series in blue and the validation series in red.
    Epochs are numbered from 1 to len(train_losses).
    """
    epochs = range(1, len(train_losses) + 1)
    panels = (
        ('Loss', train_losses, val_losses),
        ('Accuracy', train_accuracies, val_accuracies),
    )

    plt.figure(figsize=(14, 5))

    for position, (metric, train_values, val_values) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, 'b', label=f'Training {metric}')
        plt.plot(epochs, val_values, 'r', label=f'Validation {metric}')
        plt.title(f'Training and Validation {metric}')
        plt.xlabel('Epochs')
        plt.ylabel(metric)
        plt.legend()

    plt.show()

# Initialize model, processor, dataloaders, etc.
model_name_or_path = "/content"
directory = '/content/train/train'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to the targets.tsv file
targets_file_path = "/content/labels/targets.tsv"

# Load the targets.tsv file into a pandas DataFrame
targets_df = pd.read_csv(targets_file_path, sep='\t', header=None, names=['audio_id', 'true_label'])

# Append ".wav" to audio_id to match the audio file names
targets_df['audio_id'] = targets_df['audio_id'].astype(str) + '.wav'

# Map file name -> training label, swapping 0 <-> 1 in the target values
label_by_file = dict(zip(targets_df['audio_id'], targets_df['true_label'].map({0: 1, 1: 0})))

# Keep only the .wav files that have a label, in a fixed (sorted) order, and look every
# label up by file name. os.listdir order is arbitrary and unrelated to the row order of
# targets.tsv, so pairing the two lists by position attaches the wrong label to most files.
audio_paths = sorted(
    os.path.join(directory, fname)
    for fname in os.listdir(directory)
    if fname.endswith('.wav') and fname in label_by_file
)
labels = [int(label_by_file[os.path.basename(path)]) for path in audio_paths]

label2id = {"female": 1, "male": 0}
id2label = {1: "female", 0: "male"}

num_labels = len(label2id)

feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
model = AutoModelForAudioClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    use_safetensors=True,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
).to(device)

# Split the dataset into training and evaluation.
# Only the first 1000 files are used; the labels are sliced identically so they stay paired.
dataset = CustomDataset(audio_paths[:1000], labels[:1000], max_audio_len=5)
train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])

train_dataloader = DataLoader(dataset=train_dataset, batch_size=2, collate_fn=CollateFunc(processor=feature_extractor, padding=True, sampling_rate=16000), shuffle=True, num_workers=2)
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=2, collate_fn=CollateFunc(processor=feature_extractor, padding=True, sampling_rate=16000), shuffle=False, num_workers=2)

##############################################

class CustomDataset2(torch.utils.data.Dataset):
    """
    Unlabelled audio dataset: each item is a fixed-length, single-channel waveform.

    Files are loaded with torchaudio, averaged to one channel, resampled to
    `sampling_rate` if needed, then zero-padded or truncated to
    `max_audio_len * sampling_rate` samples.
    """

    def __init__(
        self,
        dataset: List,
        basedir: Optional[str] = None,
        sampling_rate: int = 16000,
        max_audio_len: int = 5,
    ):
        """
        Args:
            dataset: audio file paths (relative to `basedir` when it is given).
            basedir: optional folder that is joined in front of every entry.
            sampling_rate: sample rate every waveform is brought to.
            max_audio_len: clip length; multiplied by `sampling_rate` to get the
                number of samples each item is padded/truncated to.
        """
        self.dataset = dataset
        self.basedir = basedir
        self.sampling_rate = sampling_rate
        self.max_audio_len = max_audio_len

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.dataset)

    def _resolve_path(self, index):
        """Return the path of item `index`, prefixed with `basedir` when one was given."""
        if self.basedir is None:
            return self.dataset[index]
        return os.path.join(self.basedir, self.dataset[index])

    def __getitem__(self, index):
        """Load item `index` and return {"input_values": 1-D numpy waveform, "attention_mask": None}."""
        clip, clip_rate = torchaudio.load(self._resolve_path(index))

        # Collapse multi-channel audio to a single channel by averaging
        if clip.shape[0] > 1:
            clip = torch.mean(clip, dim=0, keepdim=True)

        if clip_rate != self.sampling_rate:
            clip = torchaudio.transforms.Resample(clip_rate, self.sampling_rate)(clip)

        wanted = self.max_audio_len * self.sampling_rate
        if clip.shape[1] >= wanted:
            # Long enough already: keep only the first `wanted` samples
            clip = clip[:, :wanted]
        else:
            # Too short: append zeros up to the wanted length
            clip = torch.cat([clip, torch.zeros(1, wanted - clip.shape[1])], dim=1)

        return {"input_values": clip.squeeze().numpy(), "attention_mask": None}

def predict(test_dataloader, model, device: torch.device):
    """
    Predict the class of the audio

    Moves `model` to `device`, puts it in eval mode and runs every batch of
    `test_dataloader` through it with gradients disabled. The model is left in
    eval mode when the function returns.

    Args:
        test_dataloader: iterable of batches; each batch is indexed with
            "input_values" (tensor) and "attention_mask" (tensor or None).
        model: called as model(input_values, attention_mask=...); the result must
            expose a `.logits` tensor.
        device: device the model and the batches are moved to.

    Returns:
        list: predicted class index for every sample, in dataloader order.
    """
    model.to(device)
    model.eval()
    preds = []

    with torch.no_grad():
        for batch in tqdm.tqdm(test_dataloader):
            input_values = batch['input_values'].to(device)

            # The mask may be None (collator without attention masks); only move real tensors
            attention_mask = batch['attention_mask']
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            logits = model(input_values, attention_mask=attention_mask).logits

            # argmax directly on the logits: softmax is monotonic, so it cannot change the winner
            preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())

    return preds

def get_audio_file_paths(directory, extensions=('.wav', '.mp3', '.flac')):
    """
    Recursively collect the absolute paths of the audio files under `directory`.

    Args:
        directory (str): root folder that is walked.
        extensions: iterable of file-name suffixes to accept; matching ignores case.

    Returns:
        list: absolute paths in sorted order. os.walk yields entries in arbitrary
        order, so sorting is what makes the result identical from run to run.
    """
    # str.endswith accepts a tuple of suffixes, so one call checks all of them
    suffixes = tuple(ext.lower() for ext in extensions)
    audio_files = [
        os.path.abspath(os.path.join(root, name))
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith(suffixes)
    ]
    audio_files.sort()
    return audio_files

# Unlabelled test clips, batched in file order so predictions line up with audio_paths_test
audio_paths_test = get_audio_file_paths('/content/test/test')
test_dataset = CustomDataset2(audio_paths_test, max_audio_len=5)

data_collator = CollateFunc(processor=feature_extractor, padding=True, sampling_rate=16000)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=2,
    collate_fn=data_collator,
)
#######################################################

# Training schedule (defined before the scheduler, which is sized from it)
num_epochs = 10
gradient_accumulation_steps = 1

# Optimizer and scheduler
optimizer = Adam(model.parameters(), lr=5e-5)
# The scheduler is stepped once per optimizer update, so its length is derived from the
# number of updates per epoch (rounded up) times the number of epochs, not a literal 10.
updates_per_epoch = (len(train_dataloader) + gradient_accumulation_steps - 1) // gradient_accumulation_steps
num_training_steps = updates_per_epoch * num_epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

for epoch in range(num_epochs):
    train_loss, train_acc = train(train_dataloader, model, optimizer, lr_scheduler, device, gradient_accumulation_steps)
    val_loss, val_acc = evaluate(eval_dataloader, model, device)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

    # Score the unlabelled test set with the current weights and write one answer file per epoch
    preds = predict(test_dataloader=test_dataloader, model=model, device=device)
    output_data = []
    for audio_path, pred in zip(audio_paths_test, preds):
        audio_id = os.path.basename(audio_path)  # Get the audio_id from the file path
        # Swap 0 <-> 1 before writing (inverse of the swap applied to the training labels)
        pred = 1 if pred == 0 else 0
        audio_id_without_extension = os.path.splitext(audio_id)[0]
        output_data.append([audio_id_without_extension, pred])

    output_df = pd.DataFrame(output_data, columns=['audio_id', 'pred'])
    output_df.to_csv(f"/content/answer{epoch}.tsv", sep='\t', index=False, header=False)

    # Checkpoint after every epoch
    model.save_pretrained(f"/content/epoch_{epoch}.pt")


Some weights of the model checkpoint at /content were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at /content and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']

Epoch 1/10
Train Loss: 0.7743, Train Accuracy: 0.5088
Val Loss: 0.6953, Val Accuracy: 0.5150


  0%|          | 0/214 [00:00<?, ?it/s]


KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "<ipython-input-4-a8c647be100f>", line 70, in __call__
    labels = [item["labels"] for item in batch]
  File "<ipython-input-4-a8c647be100f>", line 70, in <listcomp>
    labels = [item["labels"] for item in batch]
KeyError: 'labels'
