In [1]:
import os
import numpy as np
import librosa
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoProcessor, ASTForAudioClassification
from sklearn.model_selection import train_test_split

# Parameters
# Batch size given to both the training and the test DataLoader in main().
BATCH_SIZE = 16
# Number of passes train_model() makes over the training loader.
EPOCHS = 10
# Learning rate handed to the AdamW optimizer in main().
LEARNING_RATE = 5e-5

In [2]:
class GenreDataset(Dataset):
    """Genre-labelled audio dataset yielding ``(input_values, label)`` pairs.

    ``root_dir`` must contain one sub-directory per genre named in
    ``genre_to_label``. Every file is decoded once while the dataset is being
    built; files that fail to load are reported and left out, so
    ``__getitem__`` only sees paths that were readable at construction time.

    Args:
        root_dir: Directory holding the per-genre sub-directories.
        processor: Feature extractor called as
            ``processor(audio, sampling_rate=..., return_tensors="pt")``; its
            result must provide an ``"input_values"`` entry.
        sample_rate: Rate in Hz that audio is resampled to when loaded.
            Defaults to 16000, the value that used to be hard-coded.

    Attributes:
        data: File paths of the usable clips, grouped by genre and sorted by
            file name within each genre.
        labels: Integer label for each entry of ``data``.
        genre_to_label: Mapping from genre directory name to integer label.
    """

    def __init__(self, root_dir, processor, sample_rate=16000):
        self.data = []
        self.labels = []
        self.genre_to_label = {
            "blues": 0, "classical": 1, "country": 2, "disco": 3,
            "hiphop": 4, "jazz": 5, "metal": 6, "pop": 7, "reggae": 8, "rock": 9
        }
        self.processor = processor
        self.sample_rate = sample_rate

        for genre, label in self.genre_to_label.items():
            genre_dir = os.path.join(root_dir, genre)
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # index -> file mapping (and any index-based split) stable
            # between runs and machines.
            for file in sorted(os.listdir(genre_dir)):
                file_path = os.path.join(genre_dir, file)
                try:
                    # Decode once up front so unreadable files are dropped
                    # here instead of failing in the middle of training.
                    librosa.load(file_path, sr=self.sample_rate)
                except Exception as e:
                    # Best effort: report and skip. repr() keeps the exception
                    # type visible even when its message is empty.
                    print(f"Skipping file {file_path}: {e!r}")
                    continue
                self.data.append(file_path)
                self.labels.append(label)

    def __len__(self):
        """Return the number of clips that loaded successfully."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(input_values, label)``.

        The leading dimension added by ``return_tensors="pt"`` is squeezed
        away so the default DataLoader collation can stack the items.
        """
        file_path = self.data[idx]
        label = self.labels[idx]

        audio, sr = librosa.load(file_path, sr=self.sample_rate)
        inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
        return inputs["input_values"].squeeze(0), label

In [3]:
# Model and Training Loop
def train_model(model, train_loader, optimizer, criterion, device, epochs=None):
    """Train ``model`` on ``train_loader`` and print the mean loss per epoch.

    Args:
        model: Module called as ``model(inputs)`` (or
            ``model(inputs, labels=labels)`` when ``criterion`` is None) whose
            output exposes ``.logits`` (and ``.loss`` in the latter case).
        train_loader: Iterable of ``(inputs, labels)`` tensor batches.
        optimizer: Optimizer already bound to the model's parameters.
        criterion: Loss called as ``criterion(logits, labels)``. When None,
            the loss the model computes from ``labels`` is used instead.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over ``train_loader``. Defaults to the
            module-level ``EPOCHS`` when left as None.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    if epochs is None:
        epochs = EPOCHS

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if criterion is None:
                loss = model(inputs, labels=labels).loss
            else:
                # Honour the caller's loss instead of silently ignoring it.
                loss = criterion(model(inputs).logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # An empty loader has no batches to average; report nan rather than
        # raising ZeroDivisionError.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss:.4f}")

# Evaluation Function
def evaluate_model(model, test_loader, device):
    """Compute and print top-1 accuracy of ``model`` on ``test_loader``.

    Args:
        model: Module called as ``model(inputs)`` whose output exposes
            ``.logits`` with one row per sample and one column per class.
        test_loader: Iterable of ``(inputs, labels)`` tensor batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Fraction of correctly classified samples as a float, or 0.0 when the
        loader yields no samples. The model is left in evaluation mode.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # The predicted class is the index of the largest logit per row.
            logits = model(inputs).logits
            predicted = logits.argmax(dim=1)

            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    # Guard the empty-loader case instead of raising ZeroDivisionError.
    accuracy = correct / total if total else 0.0
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy


In [4]:
# Main Function
def main(
    data_dir="/kaggle/input/gtzan-dataset/Data/genres_original",
    output_dir="./ast_genre_classification",
    seed=42,
):
    """Fine-tune the AST checkpoint on the genre dataset, evaluate and save it.

    Args:
        data_dir: Directory with one sub-directory of audio files per genre.
            Defaults to the Kaggle GTZAN location.
        output_dir: Directory the fine-tuned model and processor are saved to.
        seed: Seed applied to torch's global RNG and to the train/test split
            so that a run can be repeated.
    """
    checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Seed before the model is built: the replacement classifier head is
    # randomly initialised and the training loader shuffles with the global RNG.
    torch.manual_seed(seed)

    # Load processor and model
    processor = AutoProcessor.from_pretrained(checkpoint)
    model = ASTForAudioClassification.from_pretrained(
        checkpoint,
        num_labels=10,
        # The checkpoint's classifier head has a different number of outputs;
        # let it be re-initialised for the 10 genre classes.
        ignore_mismatched_sizes=True
    )
    model.to(device)

    # Load dataset and make an 80/20 train/test split
    dataset = GenreDataset(data_dir, processor)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    # A dedicated, seeded generator keeps the split independent of how much
    # randomness was consumed earlier.
    split_generator = torch.Generator().manual_seed(seed)
    train_dataset, test_dataset = random_split(
        dataset, [train_size, test_size], generator=split_generator
    )
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Optimizer and Loss
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = torch.nn.CrossEntropyLoss()

    # Train and Evaluate
    print("Training model...")
    train_model(model, train_loader, optimizer, criterion, device)

    print("Evaluating model...")
    evaluate_model(model, test_loader, device)

    # Save model and processor side by side so both can be reloaded together
    model.save_pretrained(output_dir)
    processor.save_pretrained(output_dir)

# Entry point: runs the whole pipeline when this cell/script is executed with
# __name__ set to "__main__", and does nothing when the code is imported.
if __name__ == "__main__":
    main()

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  librosa.load(file_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Skipping file /kaggle/input/gtzan-dataset/Data/genres_original/jazz/jazz.00054.wav: 
Training model...
Epoch 1/10, Loss: 0.9272
Epoch 2/10, Loss: 0.3102
Epoch 3/10, Loss: 0.1021
Epoch 4/10, Loss: 0.0334
Epoch 5/10, Loss: 0.0306
Epoch 6/10, Loss: 0.1416
Epoch 7/10, Loss: 0.1442
Epoch 8/10, Loss: 0.0189
Epoch 9/10, Loss: 0.0030
Epoch 10/10, Loss: 0.0017
Evaluating model...
Test Accuracy: 0.8550
