In [5]:
import os
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from transformers import AdamW, get_scheduler

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Wav2Vec2 processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Fixed audio length (e.g., 10 seconds)
fixed_length = 10 * 16000  # 10 seconds * 16000 Hz

# Custom dataset class
class BirdSoundDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        self.label_encoder = LabelEncoder()
        self.data["Encoded Labels"] = self.label_encoder.fit_transform(self.data["Common Name"])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]["File Path"]
        label = self.data.iloc[idx]["Encoded Labels"]
        
        # Load and preprocess the audio file
        waveform, sample_rate = torchaudio.load(file_path)
        
        # Ensure the audio is mono
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        # Ensure the audio is exactly 10 seconds
        if waveform.size(1) > fixed_length:
            waveform = waveform[:, :fixed_length]
        else:
            padding = fixed_length - waveform.size(1)
            waveform = torch.nn.functional.pad(waveform, (0, padding))
        
        inputs = processor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
        
        return {"input_values": inputs.input_values.squeeze(0), "labels": torch.tensor(label)}

# Load the data to fit the label encoder
train_df = pd.read_csv("dataset/train_wav.csv")
test_df = pd.read_csv("dataset/test_wav.csv")
combined_df = pd.concat([train_df, test_df])

# Encode the labels
label_encoder = LabelEncoder()
combined_df["Encoded Labels"] = label_encoder.fit_transform(combined_df["Common Name"])

# Save the encoded labels back to the CSV files
train_df["Encoded Labels"] = label_encoder.transform(train_df["Common Name"])
test_df["Encoded Labels"] = label_encoder.transform(test_df["Common Name"])
train_df.to_csv("dataset/train_final.csv", index=False)
test_df.to_csv("dataset/test_final.csv", index=False)

# Load the Wav2Vec2 model with the number of labels
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base-960h", num_labels=len(label_encoder.classes_)).to(device)

# Load datasets
train_dataset = BirdSoundDataset(csv_file="dataset/train_final.csv")
test_dataset = BirdSoundDataset(csv_file="dataset/test_final.csv")

# Custom collate function to handle padding
def collate_fn(batch):
    input_values = [item['input_values'].squeeze(0) for item in batch]  # Remove the channel dimension
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.long)  # Convert labels to LongTensor
    
    # Pad the input values
    input_values_padded = torch.nn.utils.rnn.pad_sequence(input_values, batch_first=True, padding_value=0.0)
    return {"input_values": input_values_padded, "labels": labels}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  # Initial learning rate
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
)

# Initialize optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.feature_extractor,
    data_collator=collate_fn,
    optimizers=(optimizer, lr_scheduler),  # Pass the optimizer and scheduler
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./fine_tuned_model")
processor.save_pretrained("./fine_tuned_model")

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,6.801,6.682422
2,6.6311,6.678906
3,6.5184,6.682031


[]

In [12]:
# Save the model and processor
trainer.save_model("./fine_tuned_model")
processor.save_pretrained("./fine_tuned_model")

[]

In [6]:
import torch

# Clear GPU cache
torch.cuda.empty_cache()


In [27]:
import os
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import joblib
import xgboost as xgb
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a custom dataset class
class BirdSoundDataset(Dataset):
    def __init__(self, csv_file, processor, label_encoder):
        self.data = pd.read_csv(csv_file)
        self.processor = processor
        self.label_encoder = label_encoder
        self.data["Encoded Labels"] = self.label_encoder.transform(self.data["Common Name"])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]["File Path"]
        label = self.data.iloc[idx]["Encoded Labels"]
        
        # Load and preprocess the audio file
        waveform, sample_rate = torchaudio.load(file_path)
        
        # Ensure the audio is mono
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        # Ensure the audio is exactly 10 seconds (or pad if shorter)
        fixed_length = 10 * 16000  # 10 seconds * 16000 Hz
        if waveform.size(1) > fixed_length:
            waveform = waveform[:, :fixed_length]
        else:
            padding = fixed_length - waveform.size(1)
            waveform = torch.nn.functional.pad(waveform, (0, padding))
        
        inputs = self.processor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
        
        return {"input_values": inputs.input_values.squeeze(0), "labels": torch.tensor(label)}

# Load the datasets
train_df = pd.read_csv("./dataset/train_final.csv")
test_df = pd.read_csv("./dataset/test_final.csv")

# Combine the datasets to fit the LabelEncoder on all unique classes
combined_df = pd.concat([train_df, test_df])

# Initialize and fit the LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(combined_df["Common Name"])

# Instantiate the processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Load datasets directly using the fitted label encoder
train_dataset = BirdSoundDataset(csv_file="./dataset/train_final.csv", processor=processor, label_encoder=label_encoder)
test_dataset = BirdSoundDataset(csv_file="./dataset/test_final.csv", processor=processor, label_encoder=label_encoder)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Load the pre-trained Wav2Vec2 model
feature_extractor = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)

# Function to extract features
def extract_features(data_loader, model):
    model.eval()
    features = []
    labels = []
    
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            input_values = batch["input_values"].to(device)
            label = batch["labels"]
            
            outputs = model(input_values).last_hidden_state
            features.append(outputs.mean(dim=1).cpu().numpy())  # Average over time dimension
            labels.append(label.numpy())
            logger.info(f'Batch {i+1}/{len(data_loader)} processed')
    
    return np.vstack(features), np.hstack(labels)

# Extract features for train and test sets
logger.info("Extracting features for training set")
train_features, train_labels = extract_features(train_loader, feature_extractor)
logger.info("Extracting features for test set")
test_features, test_labels = extract_features(test_loader, feature_extractor)

# Save extracted features and labels to npy files
np.save("train_features.npy", train_features)
np.save("train_labels.npy", train_labels)
np.save("test_features.npy", test_features)
np.save("test_labels.npy", test_labels)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-05-26 22:41:19,391 - INFO - Extracting features for training set
2024-05-26 22:41:23,359 - INFO - Batch 1/661 processed
2024-05-26 22:41:27,331 - INFO - Batch 2/661 processed
2024-05-26 22:41:31,209 - INFO - Batch 3/661 processed
2024-05-26 22:41:35,094 - INFO - Batch 4/661 processed
2024-05-26 22:41:39,049 - INFO - Batch 5/661 processed
2024-05-26 22:41:42,932 - INFO - Batch 6/661 processed
2024-05-26 22:41:46,796 - INFO - Batch 7/661 processed
2024-05-26 22:41:50,729 - INFO - Batch 8/661 processed
2024-05-26 22:41:54,724 - INFO - Batch 9/661 processed
2024-05-26 22:41:58,635 - INFO - Batch 10/661 processed
2024-05-26 22:42:02,515 - INFO - Batch 11/661 processed
2024-05-26 22:42:06,435 - INFO - Batch 12/661 

In [50]:
import pandas as pd
import numpy as np

# Load the datasets
train_df = pd.read_csv("dataset/train_final.csv")
test_df = pd.read_csv("dataset/test_final.csv")

# Combine the datasets to calculate the frequency of each class
combined_df = pd.concat([train_df, test_df])

# Calculate the frequency of each class
class_counts = combined_df["Common Name"].value_counts()

# Filter out classes with fewer than 3 instances
filtered_classes = class_counts[class_counts >= 3].index

# Filter the datasets
filtered_train_df = train_df[train_df["Common Name"].isin(filtered_classes)]
filtered_test_df = test_df[test_df["Common Name"].isin(filtered_classes)]

# Ensure the index is reset
filtered_train_df = filtered_train_df.reset_index(drop=True)
filtered_test_df = filtered_test_df.reset_index(drop=True)

# Load extracted features
train_features = np.load("train_features.npy")
test_features = np.load("test_features.npy")

# Filter the features based on the filtered datasets
filtered_train_features = train_features[filtered_train_df.index]
filtered_test_features = test_features[filtered_test_df.index]

# Save the filtered datasets and features (optional)
filtered_train_df.to_csv("dataset/train_final_filtered.csv", index=False)
filtered_test_df.to_csv("dataset/test_final_filtered.csv", index=False)
np.save("train_features_filtered.npy", filtered_train_features)
np.save("test_features_filtered.npy", filtered_test_features)


In [51]:
from sklearn.preprocessing import LabelEncoder

# Combine the filtered datasets to fit the LabelEncoder on all unique classes
filtered_combined_df = pd.concat([filtered_train_df, filtered_test_df])

# Initialize and fit the LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(filtered_combined_df["Common Name"])

# Encode labels
filtered_train_labels = label_encoder.transform(filtered_train_df["Common Name"])
filtered_test_labels = label_encoder.transform(filtered_test_df["Common Name"])

# Save encoded labels
np.save("train_labels_filtered.npy", filtered_train_labels)
np.save("test_labels_filtered.npy", filtered_test_labels)

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import joblib
import logging
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load extracted features and encoded labels
train_features = np.load("train_features_filtered.npy")
test_features = np.load("test_features_filtered.npy")
train_labels = np.load("train_labels_filtered.npy")
test_labels = np.load("test_labels_filtered.npy")

# Check lengths
print(f"Length of train_features: {len(train_features)}")
print(f"Length of train_labels: {len(train_labels)}")
print(f"Length of test_features: {len(test_features)}")
print(f"Length of test_labels: {len(test_labels)}")

# Check unique labels in train_labels and test_labels
unique_train_labels = np.unique(train_labels)
unique_test_labels = np.unique(test_labels)
num_classes = len(unique_train_labels)

logger.info(f"Unique labels in train data: {unique_train_labels}")
logger.info(f"Unique labels in test data: {unique_test_labels}")

# Instantiate the XGBoost classifier with GPU support
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax', 
    num_class=num_classes,  # Use the number of unique classes in the training labels
    random_state=42, 
    tree_method='gpu_hist',  # Use GPU acceleration
    predictor='gpu_predictor'
)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Instantiate the StratifiedKFold object
skf = StratifiedKFold(n_splits=3)

# Instantiate the GridSearchCV object with Stratified K-Folds
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           scoring='accuracy', cv=skf, verbose=2, n_jobs=-1)

# Fit the grid search model
logger.info("Starting hyperparameter search")
grid_search.fit(train_features, train_labels)

# Get the best model
best_xgb = grid_search.best_estimator_

# Evaluate the best model
logger.info("Evaluating the best model")
train_preds = best_xgb.predict(train_features)
test_preds = best_xgb.predict(test_features)

train_accuracy = accuracy_score(train_labels, train_preds)
test_accuracy = accuracy_score(test_labels, test_preds)

logger.info(f"Train Accuracy: {train_accuracy:.4f}")
logger.info(f"Test Accuracy: {test_accuracy:.4f}")

# Save the best model
joblib.dump(best_xgb, 'best_xgboost_classifier.joblib')
logger.info("Best model saved")

# Decode the predicted and true labels
decoded_test_preds = label_encoder.inverse_transform(test_preds)
decoded_test_labels = label_encoder.inverse_transform(test_labels)

# Print the predicted and true labels
for i in range(len(decoded_test_labels)):
    logger.info(f"True label: {decoded_test_labels[i]}, Predicted label: {decoded_test_preds[i]}")

# Optionally save the results to a CSV file
results_df = pd.DataFrame({
    "File Path": filtered_test_df["File Path"],
    "True Label": decoded_test_labels,
    "Predicted Label": decoded_test_preds
})

results_df.to_csv("test_results.csv", index=False)
logger.info("Results saved to test_results.csv")


2024-05-27 00:09:31,829 - INFO - Unique labels in train data: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 2

Length of train_features: 4751
Length of train_labels: 4751
Length of test_features: 10
Length of test_labels: 10
Fitting 3 folds for each of 324 candidates, totalling 972 fits


In [None]:
# Evaluate the best model
logger.info("Evaluating the best model")
train_preds = best_xgb.predict(train_features)
test_preds = best_xgb.predict(test_features)

train_accuracy = accuracy_score(train_labels, train_preds)
test_accuracy = accuracy_score(test_labels, test_preds)

logger.info(f"Train Accuracy: {train_accuracy:.4f}")
logger.info(f"Test Accuracy: {test_accuracy:.4f}")

# Save the best model
joblib.dump(best_xgb, 'best_xgboost_classifier.joblib')
logger.info("Best model saved")

# Decode the predicted and true labels
decoded_test_preds = label_encoder.inverse_transform(test_preds)
decoded_test_labels = label_encoder.inverse_transform(test_labels)

# Print the predicted and true labels
for i in range(len(decoded_test_labels)):
    logger.info(f"True label: {decoded_test_labels[i]}, Predicted label: {decoded_test_preds[i]}")

# Optionally save the results to a CSV file
results_df = pd.DataFrame({
    "File Path": test_dataset.data["File Path"],
    "True Label": decoded_test_labels,
    "Predicted Label": decoded_test_preds
})

results_df.to_csv("test_results.csv", index=False)
logger.info("Results saved to test_results.csv")