In [None]:
pip install datasets



In [None]:
pip install evaluate



In [None]:
# No install needed here: `drive` is imported from `google.colab`, which the
# Colab runtime already provides. The PyPI package named "drive" is unrelated.



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import ASTModel, ASTConfig
from transformers import ASTForAudioClassification, ASTFeatureExtractor
import joblib
import os
import warnings
import librosa
warnings.filterwarnings('ignore')
from datasets import load_dataset
#from datasets import load_metric
import evaluate
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForSequenceClassification,
    TrainingArguments,
    Trainer
)
import transformers
import accelerate
from google.colab import drive

In [None]:
# Mount Google Drive under /content/drive; the fine-tuned model is saved there
# and the example audio file is read from there later in this notebook.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pretrained checkpoint shared by the processor and the classifier.
MODEL_NAME = "facebook/wav2vec2-base"

# Processor that turns raw waveforms into model inputs.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

# Sequence classifier with a 10-class head on top of the pretrained encoder.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=10,
)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_audio(example, target_sampling_rate=16000, max_duration_s=30.0):
    """Turn one dataset example into fixed-length Wav2Vec2 inputs.

    Args:
        example: Dataset row with an ``"audio"`` dict (``"array"`` and
            ``"sampling_rate"``) and a ``"genre"`` label.
        target_sampling_rate: Rate in Hz the waveform is resampled to and
            that is reported to the processor.
        max_duration_s: Clip length in seconds; every clip is padded or
            truncated to ``target_sampling_rate * max_duration_s`` samples.

    Returns:
        The same example with ``"input_values"`` and ``"label"`` added.
    """
    waveform = example["audio"]["array"]
    source_rate = example["audio"]["sampling_rate"]

    # The processor is told the audio is at `target_sampling_rate`, so the
    # samples must really be at that rate. Without resampling, training would
    # see audio at its native rate while the inference helper in this notebook
    # loads files at 16 kHz, so the two would not match.
    if source_rate != target_sampling_rate:
        waveform = librosa.resample(
            np.asarray(waveform, dtype=np.float32),
            orig_sr=source_rate,
            target_sr=target_sampling_rate,
        )

    # Pad/truncate to a fixed number of samples so every example has the same length.
    example["input_values"] = processor(
        waveform,
        sampling_rate=target_sampling_rate,
        return_tensors="pt",
        padding="max_length",
        max_length=int(target_sampling_rate * max_duration_s),
        truncation=True,
    ).input_values[0]

    # Keep genre label
    example["label"] = example["genre"]
    return example

In [None]:
# Classify audio genre
def predict_genre(file_path, label_mapping):
    """Predict the genre name for an audio file on disk.

    Note: a one-argument ``predict_genre`` defined later in this notebook
    replaces this function once that cell has run.

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.
        label_mapping: Sequence mapping class index to genre name.

    Returns:
        The entry of ``label_mapping`` at the predicted class index.
    """
    # preprocess_audio expects a dataset row (it indexes example["audio"]),
    # not a path, so load and featurize the file directly here.
    waveform, _ = librosa.load(file_path, sr=16000)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    # Run on whichever device the model currently lives on.
    input_values = inputs.input_values.to(model.device)

    model.eval()  # inference must not use training-mode behaviour such as dropout
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_label = torch.argmax(logits, dim=-1).item()
    return label_mapping[predicted_label]

In [None]:
# NOTE: trust_remote_code=True executes the dataset's loading script from the Hub.
dataset = load_dataset("marsyas/gtzan", trust_remote_code=True)

# Fixed seed: training below can resume from a checkpoint, and an unseeded split
# would differ between sessions, letting earlier test clips end up in the training set.
# Stratifying keeps the genre proportions the same in both splits.
dataset = dataset["train"].train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column="genre",
)

# Index -> genre name, used to turn predicted class ids back into labels.
label_mapping = dataset["train"].features["genre"].names

README.md:   0%|          | 0.00/4.42k [00:00<?, ?B/s]

gtzan.py:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

genres.tar.gz:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Show the splits with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 799
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 200
    })
})


In [None]:
# Apply preprocess_audio to every example of both splits and drop the raw "audio" column.
# NOTE(review): this rebinds `dataset`; preprocess_audio reads example["audio"], so
# re-running this cell on the already-mapped dataset will fail. Re-run the load cell first.
dataset = dataset.map(preprocess_audio, remove_columns=["audio"])

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Accuracy metric object; compute_metrics calls metric.compute(...) on it.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair that unpacks into (per-class scores, reference labels).

    Returns:
        Whatever ``metric.compute`` returns for the argmax class ids.
    """
    class_scores, references = eval_pred
    # Highest-scoring class along axis 1 is the predicted label id.
    predicted_ids = np.argmax(class_scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
# Detect device (use GPU if available, otherwise fallback to CPU).
# This string is read by the training arguments (pin-memory flag) and by the
# inference helper defined after training.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Report library versions for reproducibility.
# Transformers should be latest (>= 4.38.0), Accelerate latest (>= 0.26.0);
# check that Torch matches your CUDA version if using a GPU.
for library_name, library in (
    ("Transformers", transformers),
    ("Accelerate", accelerate),
    ("Torch", torch),
):
    print(f"{library_name}:", library.__version__)

Transformers: 4.48.3
Accelerate: 1.3.0
Torch: 2.5.1+cu124


In [None]:
training_args = TrainingArguments(
    output_dir="./wav2vec2-genre-classification",
    # Evaluate and checkpoint once per epoch.
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    run_name="./my_experiment_run",

    # Adjust batch size for 16GB RAM (use lower if needed)
    per_device_train_batch_size=2,  # Reduce if memory error occurs
    per_device_eval_batch_size=2,

    # Accumulate gradients to simulate larger batch size
    gradient_accumulation_steps=4,

    # Keep only the two most recent checkpoints on disk
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,

    # Optimization
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    # Warm up over the first 10% of optimizer steps. The run recorded in this
    # notebook had 500 optimizer steps in total, so a fixed warmup of 500 steps
    # meant the learning rate only reached its peak at the very last step.
    warmup_ratio=0.1,

    # Use FP16 for better performance if GPU is available
    fp16=torch.cuda.is_available(),

    # Data loading
    dataloader_num_workers=2,  # Reduce workers to prevent CPU overloading
    dataloader_pin_memory=(device == "cuda"),

    # Other settings
    push_to_hub=False,
)

In [None]:
# Query CUDA availability once and reuse the answer for both lines.
cuda_available = torch.cuda.is_available()  # Should be True if GPU is available
gpu_name = torch.cuda.get_device_name(0) if cuda_available else "No GPU detected"
print("CUDA Available:", cuda_available)
print("GPU Name:", gpu_name)

CUDA Available: True
GPU Name: Tesla T4


In [None]:
# Build the Trainer from the preprocessed splits, the training arguments and the metric hook.
train_split = dataset["train"]
eval_split = dataset["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

In [None]:
 # Automatically detect last checkpoint
def find_last_checkpoint(directory):
    """Return the path of the highest-numbered ``checkpoint-<step>`` folder.

    Args:
        directory: Folder that may contain ``checkpoint-<step>`` sub-folders.

    Returns:
        ``os.path.join(directory, name)`` for the sub-folder with the largest
        step number, or ``None`` if the directory is missing or holds none.
    """
    # A fresh run may not have created the output directory yet.
    if not os.path.isdir(directory):
        return None

    best_step, best_path = -1, None
    for entry in os.listdir(directory):
        prefix, _, step = entry.rpartition("-")
        candidate = os.path.join(directory, entry)
        # Skip anything that is not exactly "checkpoint-<digits>" so that stray
        # files or folders cannot break the integer conversion.
        if prefix != "checkpoint" or not step.isdecimal() or not os.path.isdir(candidate):
            continue
        if int(step) > best_step:
            best_step, best_path = int(step), candidate
    return best_path


checkpoint_dir = "./wav2vec2-genre-classification"
# None means "no checkpoint available".
resume_checkpoint = find_last_checkpoint(checkpoint_dir)

In [None]:
# Train the model.
# resume_checkpoint is either the path of the newest checkpoint found in the
# output directory or None when no checkpoint was found.
trainer.train(resume_from_checkpoint=resume_checkpoint)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrajiv-kumar-102[0m ([33mrajiv-kumar-102-bits-pilani[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,2.2455,2.172432,0.455
2,1.8979,1.759439,0.525
3,1.6619,1.635876,0.535
4,1.3515,1.380653,0.545
5,1.3029,1.302317,0.625


TrainOutput(global_step=500, training_loss=1.7404359130859375, metrics={'train_runtime': 2815.2097, 'train_samples_per_second': 1.419, 'train_steps_per_second': 0.178, 'total_flos': 1.5002012093922995e+18, 'train_loss': 1.7404359130859375, 'epoch': 5.0})

In [None]:
# Save the fine-tuned model together with its processor so the saved folder
# contains everything needed to preprocess audio and run inference later.
save_dir = "/content/drive/MyDrive/fine-tuned-wav2vec2-genre"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

In [None]:
# Test the fine-tuned model
def predict_genre(file_path):
    """Predict the genre name of an audio file with the fine-tuned model.

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The entry of the module-level ``label_mapping`` at the predicted class index.
    """
    target_rate = 16000
    waveform, _ = librosa.load(file_path, sr=target_rate)
    inputs = processor(waveform, sampling_rate=target_rate, return_tensors="pt", padding=True)
    # Follow the model's actual device rather than a separately tracked global.
    input_values = inputs.input_values.to(model.device)

    model.eval()  # make sure training-mode behaviour such as dropout is off
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_label = torch.argmax(logits, dim=-1).item()
    return label_mapping[predicted_label]

In [None]:
#pip install drive


In [None]:
#/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/country/country.00002.wav

In [None]:
# Example usage

# Plain string: the path has no placeholders, so no f-string is needed.
audio_file = '/content/drive/MyDrive/blues.00005.wav'  # audio file
predicted_genre = predict_genre(audio_file)
print(f"Predicted Genre: {predicted_genre}")

Predicted Genre: blues
