In [5]:
import os
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from transformers import AdamW, get_scheduler

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Wav2Vec2 processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Fixed audio length (e.g., 10 seconds)
fixed_length = 10 * 16000  # 10 seconds * 16000 Hz

# Custom dataset class
class BirdSoundDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        self.label_encoder = LabelEncoder()
        self.data["Encoded Labels"] = self.label_encoder.fit_transform(self.data["Common Name"])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]["File Path"]
        label = self.data.iloc[idx]["Encoded Labels"]
        
        # Load and preprocess the audio file
        waveform, sample_rate = torchaudio.load(file_path)
        
        # Ensure the audio is mono
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        # Ensure the audio is exactly 10 seconds
        if waveform.size(1) > fixed_length:
            waveform = waveform[:, :fixed_length]
        else:
            padding = fixed_length - waveform.size(1)
            waveform = torch.nn.functional.pad(waveform, (0, padding))
        
        inputs = processor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
        
        return {"input_values": inputs.input_values.squeeze(0), "labels": torch.tensor(label)}

# Load the data to fit the label encoder
train_df = pd.read_csv("dataset/train_wav.csv")
test_df = pd.read_csv("dataset/test_wav.csv")
combined_df = pd.concat([train_df, test_df])

# Encode the labels
label_encoder = LabelEncoder()
combined_df["Encoded Labels"] = label_encoder.fit_transform(combined_df["Common Name"])

# Save the encoded labels back to the CSV files
train_df["Encoded Labels"] = label_encoder.transform(train_df["Common Name"])
test_df["Encoded Labels"] = label_encoder.transform(test_df["Common Name"])
train_df.to_csv("dataset/train_final.csv", index=False)
test_df.to_csv("dataset/test_final.csv", index=False)

# Load the Wav2Vec2 model with the number of labels
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base-960h", num_labels=len(label_encoder.classes_)).to(device)

# Load datasets
train_dataset = BirdSoundDataset(csv_file="dataset/train_final.csv")
test_dataset = BirdSoundDataset(csv_file="dataset/test_final.csv")

# Custom collate function to handle padding
def collate_fn(batch):
    input_values = [item['input_values'].squeeze(0) for item in batch]  # Remove the channel dimension
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.long)  # Convert labels to LongTensor
    
    # Pad the input values
    input_values_padded = torch.nn.utils.rnn.pad_sequence(input_values, batch_first=True, padding_value=0.0)
    return {"input_values": input_values_padded, "labels": labels}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  # Initial learning rate
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
)

# Initialize optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.feature_extractor,
    data_collator=collate_fn,
    optimizers=(optimizer, lr_scheduler),  # Pass the optimizer and scheduler
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./fine_tuned_model")
processor.save_pretrained("./fine_tuned_model")

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,6.801,6.682422
2,6.6311,6.678906
3,6.5184,6.682031


[]

In [12]:
# Save the model and processor
trainer.save_model("./fine_tuned_model")
processor.save_pretrained("./fine_tuned_model")

[]

In [6]:
import torch

# Clear GPU cache
torch.cuda.empty_cache()


In [3]:
import os
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the processor and the fine-tuned model
processor = Wav2Vec2Processor.from_pretrained("./fine_tuned_model")
model = Wav2Vec2ForSequenceClassification.from_pretrained("./fine_tuned_model").to(device)

# Fixed audio length (e.g., 10 seconds)
fixed_length = 10 * 16000  # 10 seconds * 16000 Hz

# Custom dataset class
class BirdSoundDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        self.label_encoder = LabelEncoder()
        self.data["Encoded Labels"] = self.label_encoder.fit_transform(self.data["Common Name"])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]["File Path"]
        label = self.data.iloc[idx]["Encoded Labels"]
        
        # Load and preprocess the audio file
        waveform, sample_rate = torchaudio.load(file_path)
        
        # Ensure the audio is mono
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        # Ensure the audio is exactly 10 seconds
        if waveform.size(1) > fixed_length:
            waveform = waveform[:, :fixed_length]
        else:
            padding = fixed_length - waveform.size(1)
            waveform = torch.nn.functional.pad(waveform, (0, padding))
        
        inputs = processor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
        
        return {"input_values": inputs.input_values.squeeze(0), "labels": torch.tensor(label)}

# Load the test dataset
# Load the data to fit the label encoder
train_df = pd.read_csv("dataset/train_wav.csv")
test_df = pd.read_csv("dataset/test_wav.csv")
combined_df = pd.concat([train_df, test_df])

# Encode the labels
label_encoder = LabelEncoder()
combined_df["Encoded Labels"] = label_encoder.fit_transform(combined_df["Common Name"])

test_dataset = BirdSoundDataset(csv_file="dataset/test_final.csv")

# Custom collate function to handle padding
def collate_fn(batch):
    input_values = [item['input_values'].squeeze(0) for item in batch]  # Remove the channel dimension
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.long)  # Convert labels to LongTensor
    
    # Pad the input values
    input_values_padded = torch.nn.utils.rnn.pad_sequence(input_values, batch_first=True, padding_value=0.0)
    return {"input_values": input_values_padded, "labels": labels}

# Create DataLoader for test dataset
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn)

# Function to evaluate the model
def evaluate_model(model, data_loader):
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_values = batch["input_values"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_values).logits
            preds = torch.argmax(outputs, dim=-1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    return predictions, true_labels

# Evaluate the model on the test set
predictions, true_labels = evaluate_model(model, test_loader)

# Decode the labels
label_encoder = test_dataset.label_encoder
print(predictions)
decoded_predictions = label_encoder.inverse_transform(predictions)
decoded_true_labels = label_encoder.inverse_transform(true_labels)

# Print the predicted and true labels
for i in range(len(decoded_true_labels)):
    print(f"True label: {decoded_true_labels[i]}, Predicted label: {decoded_predictions[i]}")

# Calculate accuracy
accuracy = (decoded_predictions == decoded_true_labels).mean()
print(f"Test Accuracy: {accuracy:.4f}")

# Create a DataFrame to save the results
results_df = pd.DataFrame({
    "File Path": test_dataset.data["File Path"],
    "True Label": decoded_true_labels,
    "Predicted Label": decoded_predictions
})

# Save the results to a CSV file
results_df.to_csv("test_results.csv", index=False)
print("Results saved to test_results.csv")


[958, 958, 958, 958, 958, 958, 958, 958, 958, 958]


ValueError: y contains previously unseen labels: [958]