In [1]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import pandas as pd
import os

# Load Wav2Vec2 processor
model_name = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(model_name)

def load_audio(file_path, target_length=160000):
    """
    Loads an audio file and ensures a fixed length by padding or truncating.
    
    Parameters:
        file_path (str): Path to the audio file.
        target_length (int): Fixed length for all audio samples (e.g., 10s at 16kHz = 160000 samples).

    Returns:
        torch.Tensor: Fixed-length waveform tensor.
    """
    waveform, sample_rate = torchaudio.load(file_path)
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)  # Convert to 16kHz

    # Ensure waveform is mono (1 channel)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Pad or truncate to target length
    if waveform.shape[1] < target_length:
        pad_amount = target_length - waveform.shape[1]
        waveform = torch.nn.functional.pad(waveform, (0, pad_amount))
    else:
        waveform = waveform[:, :target_length]  # Truncate

    return waveform.squeeze(0)


# Load dataset labels
df = pd.read_csv(r"dataset\labels.csv")

# Prepare dataset
train_audio = []
train_labels = []

for _, row in df.iterrows():
    file_path = os.path.join("dataset/train", row["File Name"])
    waveform = load_audio(file_path)
    train_audio.append(waveform)
    train_labels.append(row["Label"])

# Convert to tensors
train_audio = torch.stack(train_audio)
train_labels = torch.tensor(train_labels)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments

# ✅ Step 1: Define Custom Audio Dataset
class AudioDataset(Dataset):
    def __init__(self, audio_data, labels):
        self.audio_data = audio_data
        self.labels = labels

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, idx):
        return {
            "input_values": self.audio_data[idx],
            "labels": self.labels[idx]
        }

# ✅ Step 2: Function to Load Wav2Vec2 Model
def load_model(model_name="facebook/wav2vec2-base"):
    """
    Loads the Wav2Vec2 model for binary classification (copyright detection).
    """
    model = Wav2Vec2ForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        problem_type="single_label_classification"
    )
    return model

# ✅ Step 3: Train Model on Full Dataset and Evaluate on Same Data
def train_model(train_audio, train_labels, model_name="facebook/wav2vec2-base", output_dir="./wav2vec2_copyright"):
    """
    Trains Wav2Vec2 on the provided dataset and evaluates on the same dataset.
    """
    # Use full dataset for both training and evaluation
    train_dataset = AudioDataset(train_audio, train_labels)
    eval_dataset = train_dataset  # ✅ Evaluating on the same dataset

    # Load model
    model = load_model(model_name)

    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,
        learning_rate=5e-5,
        num_train_epochs=15,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=500,
        evaluation_strategy="epoch"  # ✅ Evaluates after every epoch
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset  # ✅ Now evaluating on full training data
    )

    # Train model
    trainer.train()

    # Save fine-tuned model
    model.save_pretrained(output_dir)
    print(f"✅ Model saved to {output_dir}")

train_model(train_audio, train_labels)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
                                              
  7%|▋         | 1/15 [00:15<03:05, 13.26s/it]

{'eval_loss': 0.5930120348930359, 'eval_runtime': 2.2282, 'eval_samples_per_second': 2.244, 'eval_steps_per_second': 0.449, 'epoch': 1.0}


                                              
 13%|█▎        | 2/15 [00:29<02:57, 13.67s/it]

{'eval_loss': 0.49818211793899536, 'eval_runtime': 2.2342, 'eval_samples_per_second': 2.238, 'eval_steps_per_second': 0.448, 'epoch': 2.0}


                                              
 20%|██        | 3/15 [00:43<02:44, 13.69s/it]

{'eval_loss': 0.4333652853965759, 'eval_runtime': 2.3194, 'eval_samples_per_second': 2.156, 'eval_steps_per_second': 0.431, 'epoch': 3.0}


                                              
 27%|██▋       | 4/15 [00:56<02:30, 13.66s/it]

{'eval_loss': 0.3959631323814392, 'eval_runtime': 2.2295, 'eval_samples_per_second': 2.243, 'eval_steps_per_second': 0.449, 'epoch': 4.0}


                                              
 33%|███▎      | 5/15 [01:10<02:16, 13.62s/it]

{'eval_loss': 0.3543678820133209, 'eval_runtime': 2.2975, 'eval_samples_per_second': 2.176, 'eval_steps_per_second': 0.435, 'epoch': 5.0}


                                              
 40%|████      | 6/15 [01:24<02:04, 13.80s/it]

{'eval_loss': 0.32235580682754517, 'eval_runtime': 2.2644, 'eval_samples_per_second': 2.208, 'eval_steps_per_second': 0.442, 'epoch': 6.0}


                                              
 47%|████▋     | 7/15 [01:38<01:49, 13.71s/it]

{'eval_loss': 0.2906649112701416, 'eval_runtime': 2.2386, 'eval_samples_per_second': 2.234, 'eval_steps_per_second': 0.447, 'epoch': 7.0}


                                              
 53%|█████▎    | 8/15 [01:51<01:35, 13.63s/it]

{'eval_loss': 0.26951104402542114, 'eval_runtime': 2.2105, 'eval_samples_per_second': 2.262, 'eval_steps_per_second': 0.452, 'epoch': 8.0}


                                              
 60%|██████    | 9/15 [02:04<01:21, 13.58s/it]

{'eval_loss': 0.25194209814071655, 'eval_runtime': 2.2368, 'eval_samples_per_second': 2.235, 'eval_steps_per_second': 0.447, 'epoch': 9.0}


 67%|██████▋   | 10/15 [02:16<01:07, 13.54s/it]

{'loss': 0.4209, 'grad_norm': 1.8948373794555664, 'learning_rate': 1.6666666666666667e-05, 'epoch': 10.0}


                                               
 67%|██████▋   | 10/15 [02:18<01:07, 13.54s/it]

{'eval_loss': 0.2332443743944168, 'eval_runtime': 2.2593, 'eval_samples_per_second': 2.213, 'eval_steps_per_second': 0.443, 'epoch': 10.0}


                                               
 73%|███████▎  | 11/15 [02:31<00:54, 13.54s/it]

{'eval_loss': 0.21643559634685516, 'eval_runtime': 2.2956, 'eval_samples_per_second': 2.178, 'eval_steps_per_second': 0.436, 'epoch': 11.0}


                                               
 80%|████████  | 12/15 [02:45<00:40, 13.55s/it]

{'eval_loss': 0.20469920337200165, 'eval_runtime': 2.287, 'eval_samples_per_second': 2.186, 'eval_steps_per_second': 0.437, 'epoch': 12.0}


                                               
 87%|████████▋ | 13/15 [02:59<00:27, 13.56s/it]

{'eval_loss': 0.1965031772851944, 'eval_runtime': 2.2601, 'eval_samples_per_second': 2.212, 'eval_steps_per_second': 0.442, 'epoch': 13.0}


                                               
 93%|█████████▎| 14/15 [03:12<00:13, 13.59s/it]

{'eval_loss': 0.1913081556558609, 'eval_runtime': 2.2512, 'eval_samples_per_second': 2.221, 'eval_steps_per_second': 0.444, 'epoch': 14.0}


                                               
100%|██████████| 15/15 [03:27<00:00, 13.84s/it]


{'eval_loss': 0.18876613676548004, 'eval_runtime': 2.3921, 'eval_samples_per_second': 2.09, 'eval_steps_per_second': 0.418, 'epoch': 15.0}
{'train_runtime': 207.5921, 'train_samples_per_second': 0.361, 'train_steps_per_second': 0.072, 'train_loss': 0.3560316562652588, 'epoch': 15.0}
✅ Model saved to ./wav2vec2_copyright


In [7]:
def predict_audio(audio_path, model_path="./wav2vec2_copyright"):
    """
    Predicts whether an audio clip is copyrighted.
    """
    # Load fine-tuned model
    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
    
    # Load and process audio
    audio = load_audio(audio_path)
    inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits).item()
    
    return "Copyrighted" if predicted_class == 1 else "Not Copyrighted"

# Test with a new audio clip
test_audio = "dataset/test/test3.mp3"
result = predict_audio(test_audio)
print(f"🔍 Result: {result}")


🔍 Result: Copyrighted
