In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# AutoModelForAudioClassification - pretrained

In [2]:
import os
import random
from glob import glob
from typing import List, Optional, Union, Dict

import tqdm
import torch
import torchaudio
import numpy as np
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    Wav2Vec2Processor
)


In [None]:
import zipfile
import os

# Define paths
train_zip_path = '/content/drive/MyDrive/YoungCon/train.zip'
test_zip_path = '/content/drive/MyDrive/YoungCon/test.zip'
train_extract_path = '/content/train'
test_extract_path = '/content/test'
labels_folder = '/content/labels'

# Unpack train.zip
with zipfile.ZipFile(train_zip_path, 'r') as zip_ref:
    zip_ref.extractall(train_extract_path)

# Unpack test.zip
with zipfile.ZipFile(test_zip_path, 'r') as zip_ref:
    zip_ref.extractall(test_extract_path)

# Move targets.tsv to labels folder
if not os.path.exists(labels_folder):
    os.makedirs(labels_folder)

In [None]:
os.rename(os.path.join('/content/train/train/targets.tsv'), os.path.join(labels_folder, 'targets.tsv'))

In [None]:
import os
import pandas as pd

def check_files_in_targets(train_folder, targets_file):
    """
    Check if every file in the train folder is listed in the targets file.
    Add ".wav" extension to every record in the targets file.

    Args:
        train_folder (str): Path to the train folder containing audio files.
        targets_file (str): Path to the targets.tsv file.

    Returns:
        bool: True if all files in the train folder are listed in the targets file, False otherwise.
        list: List of files in the train folder not listed in the targets file.
    """
    # Read the targets file
    targets_df = pd.read_csv(targets_file, sep='\t')

    # Add ".wav" extension to each record in the targets file
    targets_df.iloc[:, 0] = targets_df.iloc[:, 0].apply(lambda x: f"{x}.wav")

    # Get the list of files in the train folder
    train_files = [f for f in os.listdir(train_folder) if os.path.isfile(os.path.join(train_folder, f))]

    # Get the list of files from the targets file
    target_files = targets_df.iloc[:, 0].tolist()

    # Check for files in the train folder not in the targets file
    missing_files = [f for f in train_files if f not in target_files]

    # Return the result
    if missing_files:
        return False, missing_files
    else:
        return True, []

# Example usage
train_folder = '/content/train/train'
targets_file = '/content/labels/targets.tsv'

all_files_present, missing_files = check_files_in_targets(train_folder, targets_file)
if all_files_present:
    print("All files in the train folder are listed in the targets file.")
else:
    print("The following files in the train folder are not listed in the targets file:")
    for file in missing_files:
        print(file)


The following files in the train folder are not listed in the targets file:
5d1f7e43366513a1d0a6ec5640c3dc24.wav


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torch

# Function to delete a specific file
def delete_file(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Deleted file: {file_path}")
    else:
        print(f"File {file_path} not found.")

# Delete the specified file from the train folder
train_audio_dir = '/content/train/train'
file_to_delete = '5d1f7e43366513a1d0a6ec5640c3dc24.wav'
delete_file(os.path.join(train_audio_dir, file_to_delete))

# Load and modify targets
targets_path = '/content/labels/targets.tsv'
labels_df = pd.read_csv(targets_path, sep='\t')
labels_df.iloc[:, 0] = labels_df.iloc[:, 0].apply(lambda x: f"{x}.wav")

Deleted file: /content/train/train/5d1f7e43366513a1d0a6ec5640c3dc24.wav


In [21]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        dataset: List,
        basedir: Optional[str] = None,
        sampling_rate: int = 16000,
        max_audio_len: int = 5,
    ):
        self.dataset = dataset
        self.basedir = basedir

        self.sampling_rate = sampling_rate
        self.max_audio_len = max_audio_len

    def __len__(self):
        """
        Return the length of the dataset
        """
        return len(self.dataset)

    def __getitem__(self, index):
        if self.basedir is None:
            filepath = self.dataset[index]
        else:
            filepath = os.path.join(self.basedir, self.dataset[index])

        speech_array, sr = torchaudio.load(filepath)

        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)

        if sr != self.sampling_rate:
            transform = torchaudio.transforms.Resample(sr, self.sampling_rate)
            speech_array = transform(speech_array)
            sr = self.sampling_rate

        len_audio = speech_array.shape[1]

        # Pad or truncate the audio to match the desired length
        if len_audio < self.max_audio_len * self.sampling_rate:
            # Pad the audio if it's shorter than the desired length
            padding = torch.zeros(1, self.max_audio_len * self.sampling_rate - len_audio)
            speech_array = torch.cat([speech_array, padding], dim=1)
        else:
            # Truncate the audio if it's longer than the desired length
            speech_array = speech_array[:, :self.max_audio_len * self.sampling_rate]

        speech_array = speech_array.squeeze().numpy()

        return {"input_values": speech_array, "attention_mask": None}


class CollateFunc:
    def __init__(
        self,
        processor: Wav2Vec2Processor,
        padding: Union[bool, str] = True,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: bool = True,
        sampling_rate: int = 16000,
        max_length: Optional[int] = None,
    ):
        self.sampling_rate = sampling_rate
        self.processor = processor
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_attention_mask = return_attention_mask
        self.max_length = max_length

    def __call__(self, batch: List[Dict[str, np.ndarray]]):
        # Extract input_values from the batch
        input_values = [item["input_values"] for item in batch]

        batch = self.processor(
            input_values,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask
        )

        return {
            "input_values": batch.input_values,
            "attention_mask": batch.attention_mask if self.return_attention_mask else None
        }

In [24]:
import os
from typing import List, Optional, Dict, Union
import torch
import torchaudio
import numpy as np

class CustomDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        dataset: List,
        basedir: Optional[str] = None,
        sampling_rate: int = 16000,
        max_audio_len: int = 5,
    ):
        self.dataset = dataset
        self.basedir = basedir
        self.sampling_rate = sampling_rate
        self.max_audio_len = max_audio_len
        self.audio_file_names = [os.path.basename(f) for f in dataset]  # Store file names

    def __len__(self):
        """
        Return the length of the dataset
        """
        return len(self.dataset)

    def __getitem__(self, index):
        if self.basedir is None:
            filepath = self.dataset[index]
        else:
            filepath = os.path.join(self.basedir, self.dataset[index])

        speech_array, sr = torchaudio.load(filepath)

        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)

        if sr != self.sampling_rate:
            transform = torchaudio.transforms.Resample(sr, self.sampling_rate)
            speech_array = transform(speech_array)
            sr = self.sampling_rate

        len_audio = speech_array.shape[1]

        # Pad or truncate the audio to match the desired length
        if len_audio < self.max_audio_len * self.sampling_rate:
            # Pad the audio if it's shorter than the desired length
            padding = torch.zeros(1, self.max_audio_len * self.sampling_rate - len_audio)
            speech_array = torch.cat([speech_array, padding], dim=1)
        else:
            # Truncate the audio if it's longer than the desired length
            speech_array = speech_array[:, :self.max_audio_len * self.sampling_rate]

        speech_array = speech_array.squeeze().numpy()
        audio_file_name = self.audio_file_names[index]

        return {"input_values": speech_array, "attention_mask": None, "audio_file_name": audio_file_name}


In [69]:
import torch.nn.functional as F
import tqdm

def predict(test_dataloader, model, device: torch.device):
    """
    Predict the class of the audio and get filenames of uncertain predictions
    """
    model.to(device)
    model.eval()
    preds = []
    uncertain_filenames = []

    with torch.no_grad():
        for batch in tqdm.tqdm(test_dataloader):
            #print(batch['audio_file_name'])
            input_values, attention_mask = batch['input_values'].to(device), batch['attention_mask'].to(device)
            logits = model(input_values, attention_mask=attention_mask).logits
            scores = F.softmax(logits, dim=-1)
            max_scores, pred = torch.max(scores, dim=1)

            # Track filenames with predictions in the confidence range (0.45, 0.55)
            for i in range(len(pred)):
                if 0.01 < max_scores[i].item() < 0.99:
                    print(max_scores[i].item(), batch['audio_file_name'][i])
                    uncertain_filenames.append(batch['audio_file_name'][i])

            preds.extend(pred.cpu().detach().numpy())

    return preds, uncertain_filenames

In [26]:
def get_audio_file_paths(directory, extensions=['.wav', '.mp3', '.flac']):
    audio_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if any(file.lower().endswith(ext) for ext in extensions):
                audio_files.append(os.path.abspath(os.path.join(root, file)))
    return audio_files

In [49]:
# Specify the directory containing the audio files
model_name_or_path = "/content"
directory = '/content/train/train'
audio_paths = get_audio_file_paths(directory)[:1000] # Must be a list with absolute paths of the audios that will be used in inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

label2id = {
    "female": 1,
    "male": 0
}

id2label = {
    1: "female",
    0: "male"
}

num_labels = 2

In [36]:
from transformers import Wav2Vec2Processor

class CollateFunc:
    def __init__(
        self,
        processor: Wav2Vec2Processor,
        padding: Union[bool, str] = True,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: bool = True,
        sampling_rate: int = 16000,
        max_length: Optional[int] = None,
    ):
        self.sampling_rate = sampling_rate
        self.processor = processor
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_attention_mask = return_attention_mask
        self.max_length = max_length

    def __call__(self, batch: List[Dict[str, np.ndarray]]):
        # Extract input_values and audio_file_names from the batch
        input_values = [item["input_values"] for item in batch]
        audio_file_names = [item["audio_file_name"] for item in batch]

        batch = self.processor(
            input_values,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask
        )

        # Return the batch along with audio file names
        return {
            "input_values": batch.input_values,
            "attention_mask": batch.attention_mask if self.return_attention_mask else None,
            "audio_file_name": audio_file_names
        }


In [50]:
test_dataset = CustomDataset(audio_paths, max_audio_len=5)  # for 5-second audio

data_collator = CollateFunc(
    processor=feature_extractor,
    padding=True,
    sampling_rate=16000,
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    collate_fn=data_collator,
    shuffle=False,
    num_workers=2
)

In [29]:
num_labels = 2

feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
model = AutoModelForAudioClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    use_safetensors=True,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at /content were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at /content and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']

In [52]:
preds = predict(test_dataloader=test_dataloader, model=model, device=device)

  3%|▎         | 2/63 [00:09<04:08,  4.07s/it]

5536decc9056cd290e16bc3773c24f98.wav


 13%|█▎        | 8/63 [00:15<01:04,  1.17s/it]

9d8d8f1c55253f8b756c6cfe6880a90d.wav


 25%|██▌       | 16/63 [00:23<00:47,  1.01s/it]

6e0e2fdb0bc1fb3d8b38b5eb71808c8a.wav


 33%|███▎      | 21/63 [00:28<00:43,  1.04s/it]

3c79898446b0cb965122339c17ea6e1c.wav


 67%|██████▋   | 42/63 [00:50<00:21,  1.04s/it]

874be2c6eb2123c77d338eb82638f88a.wav


 79%|███████▉  | 50/63 [00:58<00:12,  1.00it/s]

ce82fece3130d24a2717e8832fd36f25.wav


 92%|█████████▏| 58/63 [01:06<00:04,  1.01it/s]

6fce7d9b657c09f7cc8b7fc99eebd93d.wav


100%|██████████| 63/63 [01:10<00:00,  1.12s/it]


In [None]:
np.savetxt("preds_train.npy", np.array(preds))

In [54]:
import os
import pandas as pd

# Path to the targets.tsv file
targets_file_path = "/content/labels/targets.tsv"

# Sample paths to audio files and predictions from the model
#audio_paths = ["/path/to/audio1.wav", "/path/to/audio2.wav"]  # Example list of audio paths
#preds = [1, 0]  # Example predictions, 1 for female, 0 for male

# Load the targets.tsv file into a pandas DataFrame
targets_df = pd.read_csv(targets_file_path, sep='\t', header=None, names=['audio_id', 'true_label'])

# Append ".wav" to audio_id to match the audio file paths
targets_df['audio_id'] = targets_df['audio_id'] + '.wav'

# Create a dictionary from audio_id to true_label
true_labels_dict = dict(zip(targets_df['audio_id'], targets_df['true_label']))

# Initialize counters for correct and total predictions
correct_predictions = 0
total_predictions = len(audio_paths)

# Prepare data for writing to a new file
output_data = []

# Compare predictions with true labels and prepare the output data
for audio_path, pred in zip(audio_paths, preds):
    audio_id = os.path.basename(audio_path)  # Get the audio_id from the file path
    true_label = true_labels_dict.get(audio_id, None)
    if pred==0:
        pred=1
    else:
        pred=0
    if true_label is not None:
        if pred == true_label:
            correct_predictions += 1
        else:
            print(audio_id, true_label)
        # Remove the ".wav" extension
        audio_id_without_extension = os.path.splitext(audio_id)[0]
        output_data.append([audio_id_without_extension, pred])

# Calculate accuracy
accuracy = correct_predictions / total_predictions

# Print the results
print(f"Total predictions: {total_predictions}")
print(f"Correct predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.2f}")

# Write the output data to a file
output_file_path = "/content/output.tsv"
output_df = pd.DataFrame(output_data, columns=['audio_id', 'pred'])
output_df.to_csv(output_file_path, sep='\t', index=False, header=False)

print(f"Output file saved to: {output_file_path}")

5536decc9056cd290e16bc3773c24f98.wav 1
741e22b830ff92f7c0af06605ab8dfed.wav 1
9d8d8f1c55253f8b756c6cfe6880a90d.wav 0
a235871885a6d84b14d64e339780b00d.wav 0
d8ea8257efbf192f7fd270ec2ff73692.wav 1
21509c8dd2f987bdce3863d1de090c80.wav 1
4c12626db2221dc2ddd7d47caa0dd362.wav 0
b70e9b26685b84191770825d2864b3be.wav 0
02810575edce233e5cf84094e9064d17.wav 1
f9c18bbb063b97077d913c3644d2b4f5.wav 0
f06dad3ad84a40077faedfb8bb86b035.wav 0
48083b0583c332a444d102efad1d1518.wav 1
ce82fece3130d24a2717e8832fd36f25.wav 0
85d0352307efd1eaea120c562c869e06.wav 1
Total predictions: 1000
Correct predictions: 986
Accuracy: 0.99
Output file saved to: /content/output.tsv


## for test

In [77]:
# Specify the directory containing the audio files
model_name_or_path = "/content"
directory = '/content/test/test'
audio_paths = get_audio_file_paths(directory) # Must be a list with absolute paths of the audios that will be used in inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

label2id = {
    "female": 1,
    "male": 0
}

id2label = {
    1: "female",
    0: "male"
}

num_labels = 2

In [41]:
num_labels = 2

feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
model = AutoModelForAudioClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    use_safetensors=True,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at /content were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at /content and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']

In [99]:
test_dataset = CustomDataset(audio_paths, max_audio_len=5)  # for 5-second audio

data_collator = CollateFunc(
    processor=feature_extractor,
    padding=True,
    sampling_rate=16000,
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=1,
    collate_fn=data_collator,
    shuffle=False,
    num_workers=2
)

In [70]:
preds, uncertain_filenames = predict(test_dataloader=test_dataloader, model=model, device=device)

  1%|▏         | 3/214 [00:09<08:46,  2.49s/it]

0.7999496459960938 d5a5ab87926092c416645d33bee193a5.wav


  7%|▋         | 14/214 [00:20<03:21,  1.01s/it]

0.6254121661186218 c2b319ab0bf9704971154340b79b9ce3.wav


 10%|█         | 22/214 [00:28<03:16,  1.02s/it]

0.9846464991569519 9b90b9bde03ef660d65af477e498e6ad.wav


 12%|█▏        | 26/214 [00:32<03:16,  1.04s/it]

0.602676272392273 c87b9cc7b3f8fbde84ceb93c9ca97283.wav


 16%|█▋        | 35/214 [00:42<03:03,  1.03s/it]

0.9759393930435181 1759c61f11986f671031e8d93126793a.wav


 17%|█▋        | 36/214 [00:43<03:01,  1.02s/it]

0.9838926792144775 4e32567c043d6ff2c63744a6e22eb291.wav


 19%|█▊        | 40/214 [00:47<02:55,  1.01s/it]

0.9150171279907227 76422187e39c778e71c655d289587f2e.wav


 23%|██▎       | 50/214 [00:57<02:41,  1.02it/s]

0.9855166077613831 f0aa30658a99d82ac7096d464a66406b.wav


 28%|██▊       | 59/214 [01:05<02:30,  1.03it/s]

0.9458574056625366 ec7ac8b411f14ecc398b915521b3e63e.wav


 30%|██▉       | 64/214 [01:10<02:25,  1.03it/s]

0.9447989463806152 e7e10da633145756a79c39409fd16263.wav


 43%|████▎     | 93/214 [01:38<01:58,  1.02it/s]

0.9847645163536072 78b2f161520ea69b74e9061c0503ff00.wav


 44%|████▍     | 94/214 [01:39<01:57,  1.02it/s]

0.9876993894577026 23eb3ab33bf2066d3f95d9922b6a77a7.wav


 51%|█████     | 109/214 [01:54<01:43,  1.02it/s]

0.8612250685691833 b04c4b0c90c3bbadbd4b6c4294edf312.wav


 57%|█████▋    | 121/214 [02:06<01:31,  1.02it/s]

0.6437684297561646 89587d0b96b84797fddf679a4341f197.wav


 64%|██████▎   | 136/214 [02:21<01:16,  1.02it/s]

0.928317129611969 0ea9f37dbef6d9bc98c7afda4cffc51e.wav


 69%|██████▊   | 147/214 [02:31<01:05,  1.02it/s]

0.7563365697860718 87a9f759db5b1a8c8e6c259535ab2d1c.wav


 74%|███████▍  | 158/214 [02:42<00:54,  1.03it/s]

0.9277634620666504 3745b62e313c79963ec0de66df55ab22.wav


 93%|█████████▎| 199/214 [03:22<00:14,  1.02it/s]

0.9884082674980164 16013e5be8c44f2240cfa4275af88bb3.wav


100%|█████████▉| 213/214 [03:36<00:00,  1.02it/s]

0.9365003108978271 50230f9aab6da4fcf197a72e8e1573ea.wav


100%|██████████| 214/214 [03:36<00:00,  1.01s/it]


In [75]:
output_data = []

uncertain_filenames2 = ['d5a5ab87926092c416645d33bee193a5.wav', 'c2b319ab0bf9704971154340b79b9ce3.wav', 'c87b9cc7b3f8fbde84ceb93c9ca97283.wav']
for audio_path, pred in zip(audio_paths, preds):
    audio_id = os.path.basename(audio_path)
    audio_id_without_extension = os.path.splitext(audio_id)[0]
    if pred==0:
        pred=1
    else:
        pred=0
    output_data.append([audio_id_without_extension, pred])

# Write the output data to a file
output_file_path = "/content/output23.tsv"
output_df = pd.DataFrame(output_data, columns=['audio_id', 'pred'])
output_df.to_csv(output_file_path, sep='\t', index=False, header=False)

print(f"Output file saved to: {output_file_path}")

Output file saved to: /content/output23.tsv
