In [8]:
# Model Setup
import torch
import numpy as np
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Device Check
def _select_device():
    """Return ``(device, message)`` for the best available backend.

    Preference order is Apple MPS, then CUDA, then CPU.
    """
    if torch.backends.mps.is_available():
        return torch.device("mps"), "Using Apple Silicon (MPS)"
    if torch.cuda.is_available():
        return torch.device("cuda"), f"Using GPU: {torch.cuda.get_device_name(0)}"
    return torch.device("cpu"), "Using CPU (training will be slower)"


device, _device_message = _select_device()
print(_device_message)

# Custom Dataset Class
class MusicDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, lazily on access.

    Args:
        texts: Indexable collection of inputs. Each item is converted with
            ``str()`` before being tokenized.
        labels: Indexable collection of labels, indexed in parallel with
            ``texts``. Each label is wrapped in a ``torch.long`` tensor.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``.
            It must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.
            Defaults to 512, the value that used to be hard-coded here.
    """

    def __init__(self, texts, labels, tokenizer, max_length: int = 512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx) -> dict:
        """Tokenize the text at ``idx`` and return it with its label.

        Returns:
            A dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        tokens = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # A single text with return_tensors='pt' is expected to come back with
        # a leading batch axis of size 1; drop it so each item is 1-D.
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Load Data and Encoder
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code while loading. Only load these .npy files
# when they come from a trusted source (e.g. produced by this project).
# .item() unwraps the single object stored in the loaded array; the lookups
# below expect it to be a dict with the four split keys.
split_data = np.load('train_val_split.npy', allow_pickle=True).item()
X_train = split_data['X_train']
X_val = split_data['X_val']
y_train = split_data['y_train']
y_val = split_data['y_val']

label_classes = np.load('label_classes.npy', allow_pickle=True)
num_labels = len(label_classes)

# Explicit raises rather than `assert`, so the checks still run under
# `python -O`. The validation split is checked as well as the training split.
if len(X_train) != len(y_train):
    raise ValueError("Mismatch in training data")
if len(X_val) != len(y_val):
    raise ValueError("Mismatch in validation data")
if num_labels <= 1:
    raise ValueError("Only one class detected")

print(f"Train size: {len(X_train)}, Val size: {len(X_val)}")
print(f"Number of classes: {num_labels}")


# Load Tokenizer & Model

# The tokenizer and the classifier are loaded from the same checkpoint name.
checkpoint_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Build the classifier with one output per label class, then move it to the
# device selected earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name, num_labels=num_labels
).to(device)

# Create Datasets
# Build the training and validation datasets with the shared tokenizer.
train_dataset, val_dataset = (
    MusicDataset(texts, labels, tokenizer)
    for texts, labels in ((X_train, y_train), (X_val, y_val))
)

# Quick Test
# Fetch one training item to confirm that tokenization runs end to end.
sample = train_dataset[0]
print(f"Sample input tensor shape: {sample['input_ids'].shape}")
print(f"Sample original text: {X_train[0][:100]}...")
print(f"Sample label index: {y_train[0]}")

# Save Tokenizer
# Write the tokenizer files to ./tokenizer.
tokenizer.save_pretrained('./tokenizer')
print("Model and datasets are ready for training")

Using Apple Silicon (MPS)
Train size: 160, Val size: 40
Number of classes: 6


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sample input tensor shape: torch.Size([512])
Sample original text: Album: ALPHA | Artist: Charlotte Day Wilson | Genre: R&B | Description: Contemporary R&B | Context: ...
Sample label index: 3
Model and datasets are ready for training
