In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import ASTModel, ASTConfig
from transformers import ASTForAudioClassification, ASTFeatureExtractor
import joblib
import os
import warnings
import librosa
warnings.filterwarnings('ignore')
from datasets import load_dataset
#from datasets import load_metric
import evaluate
from transformers import (
    Wav2Vec2Processor, 
    Wav2Vec2ForSequenceClassification, 
    TrainingArguments, 
    Trainer
)
import transformers
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the processor and model
# Both objects come from the same pretrained checkpoint so that the inputs the
# processor produces match what the model expects.
MODEL_NAME = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
# num_labels sizes the classification head.
# NOTE(review): presumably 10 == number of genres in the dataset loaded further
# down — confirm against len(label_mapping).
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=10)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# def preprocess_audio(file_path):
#     audio, sr = librosa.load(file_path, sr=16000)  # Resample to 16kHz
#     inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
#     return inputs.input_values

# Sampling rate (Hz) that every processor call in this notebook declares.
TARGET_SAMPLING_RATE = 16000
# Every clip is padded/truncated to this many seconds of audio.
# NOTE(review): the previous hard-coded cap of 661794 samples looks like a
# ~30 s clip at 22.05 kHz; 30 s is kept here, expressed at the target rate —
# confirm the intended clip length.
MAX_CLIP_SECONDS = 30


def preprocess_audio(example):
    """Turn one dataset row into fixed-length model inputs.

    Intended for use with ``dataset.map``.

    Args:
        example: A row with an ``"audio"`` entry holding the decoded waveform
            under ``"array"`` and its rate under ``"sampling_rate"``, plus a
            ``"genre"`` entry.

    Returns:
        The same row with two added keys: ``"input_values"`` (the processed
        waveform, padded/truncated to ``TARGET_SAMPLING_RATE *
        MAX_CLIP_SECONDS`` samples) and ``"label"`` (a copy of ``"genre"``).
    """
    # Use the waveform already decoded by the dataset.
    waveform = np.asarray(example["audio"]["array"], dtype=np.float32)
    source_rate = example["audio"]["sampling_rate"]

    # The processor is told the audio is TARGET_SAMPLING_RATE, so the waveform
    # must actually be at that rate; otherwise the model sees time-stretched
    # audio without any error being raised.
    if source_rate != TARGET_SAMPLING_RATE:
        waveform = librosa.resample(
            waveform, orig_sr=source_rate, target_sr=TARGET_SAMPLING_RATE
        )

    features = processor(
        waveform,
        sampling_rate=TARGET_SAMPLING_RATE,
        return_tensors="pt",
        padding="max_length",
        max_length=TARGET_SAMPLING_RATE * MAX_CLIP_SECONDS,
        truncation=True,
    )
    # A single clip was passed in, so take the only item of the batch.
    example["input_values"] = features.input_values[0]

    # Keep genre label
    example["label"] = example["genre"]
    return example

def preprocess_audio2(example):
    """Alternative preprocessing that re-reads the clip from its file path.

    The file referenced by ``example["audio"]["path"]`` is loaded with librosa
    at 16 kHz and passed through the processor without padding or truncation.
    The row gains ``"input_values"`` and ``"label"`` (copied from ``"genre"``)
    and is returned.
    """
    clip_path = example["audio"]["path"]
    # librosa resamples to 16 kHz while loading; the returned rate is not needed.
    waveform, _ = librosa.load(clip_path, sr=16000)
    features = processor(waveform, sampling_rate=16000, return_tensors="pt")
    example["input_values"] = features.input_values[0]
    example["label"] = example["genre"]
    return example

In [4]:
# Root folder of the local audio files, relative to the notebook.
audio_data_path = '../data/audio'
# Load one sample clip. No `sr` argument is passed here, so librosa's default
# resampling rate applies (unlike the sr=16000 calls elsewhere in this notebook).
y, sr = librosa.load(f'{audio_data_path}/genres_original/reggae/reggae.00036.wav')

In [5]:
# Classify audio genre
def predict_genre(file_path, label_mapping):
    """Predict the genre of a single audio file.

    Args:
        file_path: Path to an audio file readable by librosa.
        label_mapping: Sequence (or mapping) from class index to genre name.

    Returns:
        The entry of ``label_mapping`` for the highest-scoring class.

    Note:
        ``preprocess_audio`` works on dataset rows (dicts) and returns a row,
        so it cannot be fed a file path; the file is loaded and featurized
        here instead.
    """
    # librosa resamples to 16 kHz on load, matching the rate given to the processor.
    waveform, _ = librosa.load(file_path, sr=16000)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    # Switch off dropout and other training-only behaviour before predicting.
    model.eval()
    with torch.no_grad():
        # Inputs must live on the same device as the model weights.
        logits = model(inputs.input_values.to(model.device)).logits
    predicted_label = torch.argmax(logits, dim=-1).item()
    return label_mapping[predicted_label]

In [24]:
# Fixed seed so that Restart & Run All reproduces the same train/test rows.
SPLIT_SEED = 42

dataset = load_dataset("marsyas/gtzan")
# NOTE(review): test_size=0.7 holds out 70% of the rows for evaluation and
# leaves only ~30% for training — confirm this is intended rather than 0.3.
dataset = dataset["train"].train_test_split(
    test_size=0.7,
    seed=SPLIT_SEED,
    # Keep the genre proportions the same in both splits.
    stratify_by_column="genre",
)
# Class index -> genre name, taken from the dataset's label feature.
label_mapping = dataset["train"].features["genre"].names

In [25]:
# Show the splits with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 299
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 700
    })
})


In [26]:
# Run preprocess_audio over every row of both splits and drop the raw "audio"
# column afterwards.
# NOTE(review): this rebinds `dataset`, so the cell cannot be re-run on its own —
# the second time the "audio" column is already gone. Re-run the loading cell first.
dataset = dataset.map(preprocess_audio, remove_columns=["audio"])

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map: 100%|██████████| 299/299 [00:23<00:00, 12.67 examples/s]
Map: 100%|██████████| 700/700 [01:25<00:00,  8.19 examples/s]


In [27]:
# Accuracy metric object; compute_metrics calls its .compute() method.
metric = evaluate.load("accuracy")

In [28]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation outputs with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(scores, references)``; the predicted class for
            each row is the argmax of ``scores`` along axis 1.

    Returns:
        Whatever ``metric.compute`` returns for those predictions and references.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [29]:
# Pick the compute device: "cuda" when torch reports a usable GPU, "cpu" otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [30]:

# Print the versions of the libraries the training setup depends on.
#   Transformers: should be latest (>= 4.38.0)
#   Accelerate:   should be latest (>= 0.26.0)
#   Torch:        check that it matches your CUDA version (if using GPU)
for package_label, package_version in (
    ("Transformers:", transformers.__version__),
    ("Accelerate:", accelerate.__version__),
    ("Torch:", torch.__version__),
):
    print(package_label, package_version)

Transformers: 4.48.3
Accelerate: 1.3.0
Torch: 2.6.0+cpu


In [32]:
# Training configuration used by the Trainer below.
# It is sized for a machine with limited RAM: one example per step, with
# gradient accumulation standing in for a larger batch.
training_args = TrainingArguments(
    output_dir="./wav2vec2-genre-classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",

    # Adjust batch size for 16GB RAM (use lower if needed)
    per_device_train_batch_size=1,  # Reduce if memory error occurs
    per_device_eval_batch_size=1,

    # Accumulate gradients to simulate larger batch size
    gradient_accumulation_steps=4,

    # Keep only the two most recent checkpoints on disk
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,

    # Optimization
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    # Warm up over the first 10% of the schedule. A fixed warmup_steps=500 was
    # longer than the whole run (about 375 optimizer steps for 299 training
    # examples with batch size 1, accumulation 4 and 5 epochs), so the learning
    # rate never reached its target value.
    warmup_ratio=0.1,

    # Use FP16 for better performance if GPU is available
    fp16=torch.cuda.is_available(),

    # Data loading
    dataloader_num_workers=2,  # Keep the worker count small to avoid overloading the CPU
    dataloader_pin_memory=device == "cuda",  # Pinned memory only helps when copying to a GPU

    # Other settings
    push_to_hub=False,
)

In [33]:
# Define training arguments
# Alternative configuration with a larger batch size (8), a lower learning rate
# and fewer epochs than `training_args`.
# NOTE(review): in the visible notebook the Trainer is built with `training_args`,
# not this object, and both write to the same output_dir and logging_dir —
# confirm whether this cell is still needed.
training_args1 = TrainingArguments(
    output_dir="./wav2vec2-genre-classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
)

In [34]:
# NOTE(review): `accelerate` is already imported in the first cell and its
# version is already printed by the version-check cell above; this cell
# repeats both.
import accelerate
print(accelerate.__version__)  # Should be >= 0.26.0

1.3.0


In [35]:
# Report whether torch can see a CUDA device (True when a GPU is available)
# and, if it can, the name of device 0.
has_cuda = torch.cuda.is_available()
print("CUDA Available:", has_cuda)
gpu_name = torch.cuda.get_device_name(0) if has_cuda else "No GPU detected"
print("GPU Name:", gpu_name)

CUDA Available: False
GPU Name: No GPU detected


In [36]:
# Define Trainer
# Wires together the model, the `training_args` configuration, the preprocessed
# train/test splits and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # The "test" split doubles as the evaluation set.
    eval_dataset=dataset["test"],
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `processing_class` — confirm for the installed version.
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

In [37]:
 # Automatically detect last checkpoint
def find_last_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered checkpoint directory, if any.

    Only sub-directories named exactly ``checkpoint-<digits>`` are considered,
    so stray files or folders such as ``checkpoint-final`` are ignored instead
    of raising ``ValueError``.

    Args:
        checkpoint_dir: Directory that holds the ``checkpoint-*`` folders.

    Returns:
        ``os.path.join(checkpoint_dir, name)`` for the checkpoint with the
        largest step number, or ``None`` when the directory does not exist or
        holds no matching checkpoint.
    """
    # A fresh run has no output directory yet; treat that as "nothing to resume".
    if not os.path.isdir(checkpoint_dir):
        return None

    numbered = []
    for entry in os.listdir(checkpoint_dir):
        prefix, _, step = entry.rpartition("-")
        if prefix != "checkpoint" or not step.isdecimal():
            continue
        if not os.path.isdir(os.path.join(checkpoint_dir, entry)):
            continue
        numbered.append((int(step), entry))

    if not numbered:
        return None
    # Compare by numeric step so that checkpoint-100 beats checkpoint-9.
    _, last_checkpoint = max(numbered)
    return os.path.join(checkpoint_dir, last_checkpoint)


checkpoint_dir = "./wav2vec2-genre-classification"
resume_checkpoint = find_last_checkpoint(checkpoint_dir)

In [38]:
# Train the model
# resume_checkpoint is either the path of the latest checkpoint found above or
# None, in which case no checkpoint is resumed.
trainer.train(resume_from_checkpoint=resume_checkpoint)

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Save the fine-tuned model
# NOTE(review): only the model is written here; the processor is not saved to
# this folder, so loading for inference needs the processor from elsewhere —
# confirm whether it should be saved alongside.
model.save_pretrained("./fine-tuned-wav2vec2-genre")

In [None]:
# Test the fine-tuned model
def predict_genre(file_path):
    """Predict the genre of one audio file with the fine-tuned model.

    Uses the module-level ``processor``, ``model`` and ``label_mapping``.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        The genre name from ``label_mapping`` for the highest-scoring class.

    Note:
        This definition replaces the earlier two-argument ``predict_genre``
        defined higher up in the notebook.
    """
    # librosa resamples to 16 kHz on load, matching the rate given to the processor.
    audio, _ = librosa.load(file_path, sr=16000)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)

    # Training leaves the model in train mode; switch to eval mode so dropout
    # and other training-only behaviour do not make predictions random.
    model.eval()
    with torch.no_grad():
        # Inputs must live on the same device as the model weights.
        logits = model(inputs.input_values.to(model.device)).logits

    predicted_label = torch.argmax(logits, dim=-1).item()
    return label_mapping[predicted_label]

In [None]:
# Example usage
# Alternative location of the same kind of file when running on Kaggle:
#/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/country/country.00002.wav
# Relies on `audio_data_path` from the earlier cell that loads the sample clip.
audio_file = f'{audio_data_path}/genres_original/reggae/reggae.00036.wav'  # Replace with your audio file
predicted_genre = predict_genre(audio_file)
print(f"Predicted Genre: {predicted_genre}")

In [None]:
# Load the model
#model.load_state_dict(torch.load(model_path, map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu")))

In [4]:
import random

# Draw an integer between 0 and 48 (inclusive) and show it zero-padded to two digits
num = random.randint(0, 48)
formatted_num = format(num, "02")
print(formatted_num)

02
