<a href="https://colab.research.google.com/github/Viditk07-Bits/AudioAnalytics_S2-24_AIMLCZG527/blob/main/AA_Assignment2_latest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Music Information Retrieval System

## Assignment Objective
This assignment implements a comprehensive Music Information Retrieval (MIR) system using Large Language Models (LLMs) and deep learning techniques. It includes music recommendation, genre classification, and semantic search applications, combining audio analysis with natural language processing.

## Dataset Setup
This notebook uses the Free Music Archive (FMA) dataset: the `fma_small` audio files and the `fma_metadata` tables, together with synthetically generated user ratings and placeholder tags.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cosine
import os
from pathlib import Path
import librosa
import hashlib
import subprocess
from google.colab import drive
from librosa.feature.rhythm import tempo as librosa_tempo

# Mount Google Drive (optional, for persistent storage)
drive.mount('/content/drive', force_remount=True)

# Constants
# Root directory of the reorganised dataset; setup_fma_dataset() unzips the
# archives here and writes every derived file below it.
DATA_PATH = "/content/fma/data"
# Flat directory that setup_fma_dataset() moves all MP3 files into.
AUDIO_PATH = os.path.join(DATA_PATH, "audio_files/")
# Simplified CSV tables written by setup_fma_dataset() and read by load_fma_data().
METADATA_PATH = os.path.join(DATA_PATH, "metadata/tracks.csv")
ARTISTS_PATH = os.path.join(DATA_PATH, "metadata/artists.csv")
GENRES_PATH = os.path.join(DATA_PATH, "metadata/genres.csv")
# Directory scanned for per-track "<track_id>.txt" lyric files (created empty by setup).
LYRICS_PATH = os.path.join(DATA_PATH, "lyrics/")
# Synthetic ratings and placeholder tags generated by setup_fma_dataset().
USER_DATA_PATH = os.path.join(DATA_PATH, "user_data/ratings.csv")
TAGS_PATH = os.path.join(DATA_PATH, "descriptions/tags.csv")
# NOTE(review): this path is relative, while setup_fma_dataset() calls os.chdir();
# plots saved after that call resolve against a different working directory than
# the one created by os.makedirs() below -- confirm whether an absolute path is intended.
OUTPUT_DIR = "outputs/"
# Working directory in which the FMA repository is cloned and the archives are downloaded.
TEMP_DIR = "/content/fma"
# NUM_EPOCHS and BATCH_SIZE are not referenced in the visible part of the notebook;
# presumably they are consumed by later training cells.
NUM_EPOCHS = 10
BATCH_SIZE = 32
# Caps both the rows kept in tracks.csv and the number of audio files processed.
MAX_TRACKS = 1000  # Limit for faster testing
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

def _sha1_checksum(file_path):
    """Return the hexadecimal SHA-1 digest of the file at ``file_path``."""
    sha1 = hashlib.sha1()
    with open(file_path, 'rb') as f:
        # Stream the file in small chunks so large archives never sit in memory.
        while chunk := f.read(8192):
            sha1.update(chunk)
    return sha1.hexdigest()


def _ensure_archive(zip_name, url, expected_sha1):
    """Make sure ``zip_name`` exists in the current directory with the expected SHA-1.

    An existing file that already matches the checksum is reused; a file that
    does not match is removed and downloaded again with ``wget``.

    Raises:
        subprocess.CalledProcessError: if ``wget`` exits with a non-zero status.
        AssertionError: if the downloaded file does not match ``expected_sha1``.
    """
    if os.path.exists(zip_name):
        if _sha1_checksum(zip_name) == expected_sha1:
            print(f"Reusing verified {zip_name}.")
            return
        os.remove(zip_name)
        print(f"Removed existing {zip_name}")

    print(f"Downloading {zip_name}...")
    subprocess.run(["wget", "-O", zip_name, url], check=True)
    if _sha1_checksum(zip_name) != expected_sha1:
        # Raised explicitly (rather than via ``assert``) so the integrity check
        # still runs under ``python -O``; the exception type is unchanged.
        raise AssertionError(f"{zip_name} checksum failed!")


def setup_fma_dataset():
    """Download and set up FMA dataset, moving files to expected structure.

    Steps: clone the FMA repository, fetch and verify ``fma_small.zip`` and
    ``fma_metadata.zip``, unzip them into ``DATA_PATH``, move every MP3 into
    ``AUDIO_PATH`` and derive the simplified CSV tables (tracks, artists,
    genres, synthetic ratings, placeholder tags).

    An archive is only downloaded when its extracted folder is missing, and a
    previously downloaded archive is reused when its checksum matches.

    Side effects:
        Changes the process working directory to ``TEMP_DIR/fma`` and leaves it
        there; writes files below ``DATA_PATH``.

    Raises:
        subprocess.CalledProcessError: if git, wget or unzip fails.
        AssertionError: if a downloaded archive fails its SHA-1 check.
    """
    os.makedirs(TEMP_DIR, exist_ok=True)
    os.chdir(TEMP_DIR)

    # Step 1: Clone FMA GitHub repository
    if not os.path.exists(os.path.join(TEMP_DIR, "fma")):
        print("Cloning FMA repository...")
        subprocess.run(["git", "clone", "https://github.com/mdeff/fma.git"], check=True)
    else:
        print("FMA repository already exists.")
    os.chdir(os.path.join(TEMP_DIR, "fma"))

    # Steps 2-4: Download, verify and unzip each archive, but only when its
    # extracted folder is not already present under DATA_PATH.
    archives = [
        (
            "fma_small",
            "fma_small.zip",
            "https://os.unil.cloud.switch.ch/fma/fma_small.zip",
            "ade154f733639d52e35e32f5593efe5be76c6d70",
        ),
        (
            "fma_metadata",
            "fma_metadata.zip",
            "https://os.unil.cloud.switch.ch/fma/fma_metadata.zip",
            "f0df49ffe5f2a6008d7dc83c6915b31835dfe733",
        ),
    ]
    os.makedirs(DATA_PATH, exist_ok=True)
    for folder, zip_name, url, expected_sha1 in archives:
        if os.path.exists(os.path.join(DATA_PATH, folder)):
            print(f"{folder} already unzipped.")
            continue
        _ensure_archive(zip_name, url, expected_sha1)
        print(f"✅ SHA1 checksum verified for {zip_name}.")
        print(f"Unzipping {zip_name}...")
        subprocess.run(["unzip", "-q", zip_name, "-d", DATA_PATH], check=True)

    # Step 5: Move .mp3 files to audio_files/
    os.makedirs(AUDIO_PATH, exist_ok=True)
    print("Moving MP3 files...")
    # Snapshot the listing first: AUDIO_PATH lives inside DATA_PATH, so files are
    # being moved within the very tree that is being walked.
    for mp3_file in list(Path(DATA_PATH).rglob("*.mp3")):
        target = os.path.join(AUDIO_PATH, mp3_file.name)
        if not os.path.exists(target):
            os.rename(mp3_file, target)
    print("MP3 files moved.")

    # Step 6: Process metadata
    print("Processing metadata...")
    # tracks.csv has a two-level column header; the first column is the track id.
    tracks = pd.read_csv(os.path.join(DATA_PATH, "fma_metadata", "tracks.csv"), index_col=0, header=[0, 1])
    genres = pd.read_csv(os.path.join(DATA_PATH, "fma_metadata", "genres.csv"))

    # Create artists.csv (one row per track; the track id doubles as the artist id)
    df_artists = tracks['artist'][['name']].reset_index().rename(columns={'track_id': 'artist_id', 'name': 'artist_name'})
    df_artists['artist_id'] = df_artists['artist_id'].astype(str).str.zfill(6)
    os.makedirs(os.path.dirname(ARTISTS_PATH), exist_ok=True)
    df_artists.to_csv(ARTISTS_PATH, index=False)

    # Create genres.csv
    df_genres = genres[['genre_id', 'title']].rename(columns={'title': 'genre_name'})
    os.makedirs(os.path.dirname(GENRES_PATH), exist_ok=True)
    df_genres.to_csv(GENRES_PATH, index=False)

    # Adapt tracks.csv
    df_tracks = tracks['track'][['title', 'genre_top']].reset_index()
    df_tracks['track_id'] = df_tracks['track_id'].astype(str).str.zfill(6)
    df_tracks['artist_id'] = df_tracks['track_id']  # track_id is used as a proxy artist_id, matching artists.csv above
    # Translate the top-level genre name into its numeric id; unknown names become NaN
    # and are removed by the dropna() below.
    df_tracks['genre_id'] = df_tracks['genre_top'].map(df_genres.set_index('genre_name')['genre_id'])
    df_tracks = df_tracks[['track_id', 'title', 'artist_id', 'genre_id']].dropna()
    df_tracks = df_tracks.head(MAX_TRACKS)  # Limit tracks for faster processing
    os.makedirs(os.path.dirname(METADATA_PATH), exist_ok=True)
    df_tracks.to_csv(METADATA_PATH, index=False)

    # Create synthetic ratings.csv: a single user with a uniform random rating per track
    os.makedirs(os.path.dirname(USER_DATA_PATH), exist_ok=True)
    ratings = pd.DataFrame({
        'user_id': ['user_001'] * len(df_tracks),
        'track_id': df_tracks['track_id'],
        'rating': np.random.uniform(0.1, 1.0, len(df_tracks))
    })
    ratings.to_csv(USER_DATA_PATH, index=False)

    # Create empty lyrics/ and a placeholder tags file (every track tagged 'music')
    os.makedirs(LYRICS_PATH, exist_ok=True)
    os.makedirs(os.path.dirname(TAGS_PATH), exist_ok=True)
    pd.DataFrame({'track_id': df_tracks['track_id'], 'tag': ['music'] * len(df_tracks)}).to_csv(TAGS_PATH, index=False)

    print("🎵 Metadata and audio files are ready.")
    print(f"Tracks shape: {df_tracks.shape}")
    print(f"Genres shape: {df_genres.shape}")
    print(f"Ratings shape: {ratings.shape}")

def extract_audio_features(audio_path):
    """Extract audio features (MFCCs, chroma, spectral features, tempo) from an MP3 file using Librosa.

    Each frame-level feature is averaged over time and the parts are
    concatenated in the order MFCC (20), chroma (12), spectral centroid (1),
    spectral contrast (7), tempo (1).

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A 1-D NumPy array of length 41. If anything goes wrong while loading or
        analysing the file, the error is printed and an all-zero vector of the
        same length is returned instead.
    """
    # 20 MFCCs + 12 chroma + 1 centroid + 7 contrast + 1 tempo
    feature_dim = 41
    try:
        y, sr = librosa.load(audio_path, sr=22050)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        # The tempo estimate may come back as a scalar or as a (nested) array;
        # reduce it to one plain float so it concatenates as a single value.
        tempo = float(np.atleast_1d(librosa_tempo(y=y, sr=sr)).ravel()[0])

        return np.concatenate([
            np.mean(mfccs, axis=1),
            np.mean(chroma, axis=1),
            np.mean(spectral_centroid, axis=1),
            np.mean(spectral_contrast, axis=1),
            [tempo]  # Ensure this is a 1D list
        ])

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        # The fallback must have the same length as a successful vector,
        # otherwise rows of different widths end up in the feature table.
        return np.zeros(feature_dim)

def load_fma_data(audio_path, metadata_path, artists_path, genres_path, lyrics_path=None, tags_path=None):
    """Load and preprocess FMA dataset from MP3 files, metadata, and lyrics.

    Args:
        audio_path: Directory containing ``<track_id>.mp3`` files.
        metadata_path: CSV with columns track_id, title, artist_id, genre_id.
        artists_path: CSV with columns artist_id, artist_name (optional on disk).
        genres_path: CSV with columns genre_id, genre_name (optional on disk).
        lyrics_path: Optional directory of ``<track_id>.txt`` lyric files.
        tags_path: Optional CSV with columns track_id, tag.

    Returns:
        A tuple ``(df_metadata, df_features, lyrics_dict)``:
        metadata restricted to tracks that have audio features, one feature row
        per processed track, and a mapping from track id to its lyrics and/or
        tags text. If ``audio_path`` does not exist, two empty DataFrames and an
        empty dict are returned.
    """
    if not os.path.exists(audio_path):
        print(f"Error: Audio directory {audio_path} does not exist.")
        return pd.DataFrame(), pd.DataFrame(), {}

    # Load metadata
    try:
        df_tracks = pd.read_csv(metadata_path)
        df_metadata = df_tracks[['track_id', 'title', 'artist_id', 'genre_id']].dropna()
        if os.path.exists(artists_path):
            df_artists = pd.read_csv(artists_path)[['artist_id', 'artist_name']]
            df_metadata = pd.merge(df_metadata, df_artists, on='artist_id', how='left')
        else:
            df_metadata['artist_name'] = 'Unknown Artist'

        if os.path.exists(genres_path):
            df_genres = pd.read_csv(genres_path)[['genre_id', 'genre_name']]
            df_metadata = pd.merge(df_metadata, df_genres, on='genre_id', how='left')
        else:
            df_metadata['genre_name'] = 'unknown'

        df_metadata = df_metadata[['track_id', 'artist_name', 'title', 'genre_name']].dropna()
        df_metadata.columns = ['track_id', 'artist_name', 'title', 'genre']
        # Ids are zero-padded to six characters so they match the MP3 file stems.
        df_metadata['track_id'] = df_metadata['track_id'].astype(str).str.zfill(6)
        print(f"Initial metadata shape: {df_metadata.shape}")
    except Exception as e:
        print(f"Error loading metadata: {e}. Creating synthetic metadata.")
        df_metadata = pd.DataFrame(columns=['track_id', 'artist_name', 'title', 'genre'])

    # Extract audio features
    features = []
    audio_files = list(Path(audio_path).glob("*.mp3"))
    print(f"Found {len(audio_files)} audio files.")
    # A set gives constant-time membership tests for the per-file lookup below.
    valid_track_ids = set(df_metadata['track_id'])
    audio_files_to_process = []
    for audio_file in audio_files:
        if audio_file.stem in valid_track_ids:
            audio_files_to_process.append(audio_file)
            if len(audio_files_to_process) >= MAX_TRACKS:
                break

    print(f"Processing features for {len(audio_files_to_process)} relevant audio files.")
    for audio_file in audio_files_to_process:
        track_id = audio_file.stem
        audio_features = extract_audio_features(audio_file)
        if audio_features is not None and audio_features.shape[0] > 0:
            features.append([track_id] + audio_features.tolist())
        else:
            print(f"Skipping {track_id} due to feature extraction error.")

    feature_columns = ['track_id'] + [f'mfcc_{i+1}' for i in range(20)] + [f'chroma_{i+1}' for i in range(12)] + \
                     ['spectral_centroid'] + [f'spectral_contrast_{i+1}' for i in range(7)] + ['tempo']
    df_features = pd.DataFrame(features, columns=feature_columns).dropna()
    print(f"Extracted features for {len(df_features)} tracks")

    # Filter metadata to match available audio features
    df_metadata = pd.merge(df_metadata, df_features[['track_id']], on='track_id', how='inner')
    print(f"Filtered metadata shape: {df_metadata.shape}")

    # Load lyrics and tags (only for tracks that survived the filtering above)
    kept_track_ids = set(df_metadata['track_id'])
    lyrics_dict = {}
    if lyrics_path and os.path.exists(lyrics_path):
        for lyric_file in Path(lyrics_path).glob("*.txt"):
            track_id = lyric_file.stem
            if track_id in kept_track_ids:
                with open(lyric_file, 'r', encoding='utf-8') as f:
                    lyrics_dict[track_id] = f.read().strip()

    if tags_path and os.path.exists(tags_path):
        try:
            df_tags = pd.read_csv(tags_path)
            for raw_id, raw_tag in zip(df_tags['track_id'], df_tags['tag']):
                track_id = str(raw_id).zfill(6)
                if track_id in kept_track_ids:
                    tag = str(raw_tag)
                    # Tags are appended to existing lyrics, or stand in for them.
                    if track_id in lyrics_dict:
                        lyrics_dict[track_id] += " " + tag
                    else:
                        lyrics_dict[track_id] = tag
        except Exception as e:
            print(f"Error loading tags: {e}")

    return df_metadata, df_features, lyrics_dict

def generate_text_embeddings(lyrics_dict):
    """Encode every track's text with the all-MiniLM-L6-v2 sentence encoder.

    Args:
        lyrics_dict: Mapping from track id to its lyrics/tags text.

    Returns:
        Dict mapping each track id to the NumPy embedding of its text. The
        encoder runs on ``DEVICE``; results are copied back to the CPU.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)

    def _embed(text):
        # Texts are encoded one at a time, each as a tensor on DEVICE.
        vector = encoder.encode(text, convert_to_tensor=True, device=DEVICE)
        return vector.cpu().numpy()

    return {track_id: _embed(text) for track_id, text in lyrics_dict.items()}

# Original GenreClassifier and train/evaluate functions
class GenreClassifier(nn.Module):
    """Two-branch genre classifier over pre-computed audio and text features.

    Each modality passes through its own linear layer and ReLU; the two hidden
    vectors are concatenated and mapped to class logits.
    """

    def __init__(self, audio_dim, text_dim, num_classes, hidden_dim=128):
        super().__init__()
        self.audio_layer = nn.Linear(audio_dim, hidden_dim)
        self.text_layer = nn.Linear(text_dim, hidden_dim)
        fused_dim = 2 * hidden_dim  # audio and text hidden vectors side by side
        self.fc = nn.Linear(fused_dim, num_classes)

    def forward(self, audio_features, text_features):
        """Return unnormalised class scores, one row per input row."""
        branches = (
            nn.functional.relu(self.audio_layer(audio_features)),
            nn.functional.relu(self.text_layer(text_features)),
        )
        return self.fc(torch.cat(branches, dim=1))

def train_classifier(model, train_loader, criterion, optimizer):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Module called as ``model(audio_features, text_features)``.
        train_loader: Iterable of ``(audio_features, text_features, labels)``
            batches that also supports ``len()``.
        criterion: Loss function applied to ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch in train_loader:
        # Move the whole batch to the configured device before the forward pass.
        audio, text, targets = (tensor.to(DEVICE) for tensor in batch)
        optimizer.zero_grad()
        batch_loss = criterion(model(audio, text), targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(train_loader)

def evaluate_classifier(model, test_loader, genres):
    """Evaluate a classifier and save its confusion matrix as a PNG.

    Args:
        model: Module called as ``model(audio_features, text_features)``.
        test_loader: Iterable of ``(audio_features, text_features, labels)`` batches.
        genres: Class names used as tick labels on the heatmap. Assumed to be
            ordered so that position ``i`` names class index ``i`` -- TODO confirm
            against the label encoding used by the caller.

    Returns:
        ``(precision, recall, f1)``, weighted by class support.

    Side effects:
        Writes ``confusion_matrix.png`` into ``OUTPUT_DIR``.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for audio_features, text_features, labels in test_loader:
            audio_features, text_features, labels = (
                audio_features.to(DEVICE),
                text_features.to(DEVICE),
                labels.to(DEVICE)
            )
            outputs = model(audio_features, text_features)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    # Pin the matrix to one row/column per entry of ``genres``. Without this the
    # matrix only covers classes that happen to occur in this evaluation split,
    # and the tick labels below would no longer line up with its rows/columns.
    class_ids = list(range(len(genres)))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids or None)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=genres, yticklabels=genres)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.savefig(os.path.join(OUTPUT_DIR, 'confusion_matrix.png'))
    plt.close()
    return precision, recall, f1

def train_recommender(model, train_loader, criterion, optimizer):
    """Train the recommender for one epoch; return the average batch loss.

    Args:
        model: Module called as ``model(audio_features, text_features)``.
        train_loader: Iterable of ``(audio_features, text_features, ratings)``
            batches that also supports ``len()``.
        criterion: Loss comparing the model output with the ratings.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Accumulated loss divided by the number of batches in ``train_loader``.
    """
    model.train()
    epoch_loss = 0
    for audio_batch, text_batch, rating_batch in train_loader:
        audio_batch = audio_batch.to(DEVICE)
        text_batch = text_batch.to(DEVICE)
        rating_batch = rating_batch.to(DEVICE)

        optimizer.zero_grad()
        predictions = model(audio_batch, text_batch)
        step_loss = criterion(predictions, rating_batch)
        step_loss.backward()
        optimizer.step()

        epoch_loss += step_loss.item()
    num_batches = len(train_loader)
    return epoch_loss / num_batches

def evaluate_recommender(model, test_loader):
    """Compute the mean precision of the model's top-k picks (k up to 10).

    For every batch the k highest-scoring output columns of each row are
    selected, and a pick counts as relevant when the rating at the same
    position is greater than 0.5.

    Args:
        model: Module called as ``model(audio_features, text_features)``.
        test_loader: Iterable of ``(audio_features, text_features, ratings)``
            batches; ``ratings`` is indexed with the same column positions as
            the model output.

    Returns:
        Mean of the per-batch precisions, or ``0.0`` when no batch was scored.
    """
    model.eval()
    batch_precisions = []
    with torch.no_grad():
        for batch in test_loader:
            audio, text, true_ratings = (tensor.to(DEVICE) for tensor in batch)
            scores = model(audio, text)
            k = min(10, scores.size(1))
            if k == 0:
                print("Warning: No tracks available for top-k selection")
                continue
            _, top_idx = torch.topk(scores, k=k, dim=1)
            # Look up the true rating of each selected column and threshold it.
            hits = (true_ratings.gather(1, top_idx) > 0.5).float()
            batch_precisions.append(hits.mean().item())
    if not batch_precisions:
        return 0.0
    return np.mean(batch_precisions)

Mounted at /content/drive


In [2]:
# 2.1.1 Audio Feature Integration with LLMs
class AudioFeatureExtractor:
    """Turns an audio file into a fixed-length vector of time-averaged features.

    Layout of the vector: MFCC (20), spectral centroid (1), spectral contrast
    (7), chroma (12), tempo (1).
    """

    # 20 MFCCs + 1 centroid + 7 contrast + 12 chroma + 1 tempo
    FEATURE_DIM = 41

    def __init__(self, sample_rate=22050):
        # Sample rate every file is resampled to when it is loaded.
        self.sample_rate = sample_rate

    def extract_features(self, audio_path):
        """Return the feature vector for ``audio_path``.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            A 1-D NumPy array of length ``FEATURE_DIM``. On any error the
            message is printed and an all-zero vector of that length is
            returned.
        """
        try:
            y, sr = librosa.load(audio_path, sr=self.sample_rate)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
            spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            # beat_track may hand back the tempo as an array rather than a
            # scalar; wrapping an array in a list would make the concatenation
            # below fail, so reduce it to one plain float first.
            tempo = float(np.atleast_1d(tempo).ravel()[0])
            return np.concatenate([
                np.mean(mfcc, axis=1),
                np.mean(spectral_centroid, axis=1),
                np.mean(spectral_contrast, axis=1),
                np.mean(chroma, axis=1),
                [tempo]
            ])
        except Exception as e:
            print(f"Error extracting features from {audio_path}: {e}")
            # Same length as a successful vector so downstream shapes stay consistent.
            return np.zeros(self.FEATURE_DIM)

class AudioEmbedding(nn.Module):
    """Small MLP that maps an audio feature vector to a hidden embedding.

    Two linear layers, each followed by a ReLU, so the output is non-negative
    and has ``hidden_dim`` entries per input row.
    """

    def __init__(self, input_dim, hidden_dim=128):
        super().__init__()
        stages = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        ]
        self.network = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to ``x`` and return the embedding."""
        embedded = self.network(x)
        return embedded

class CrossModalAttention(nn.Module):
    """Cross-attention in which the audio embedding queries the text embedding.

    Both inputs are projected to ``attn_dim`` and treated as length-one
    sequences for a 4-head multi-head attention layer.
    """

    def __init__(self, audio_dim, text_dim, attn_dim=128):
        super().__init__()
        self.audio_proj = nn.Linear(audio_dim, attn_dim)
        self.text_proj = nn.Linear(text_dim, attn_dim)
        self.attention = nn.MultiheadAttention(embed_dim=attn_dim, num_heads=4, batch_first=True)

    def forward(self, audio_emb, text_emb):
        """Return the attended audio representation, one row per input row."""
        # Insert a sequence axis of length 1: (B, attn_dim) -> (B, 1, attn_dim).
        query = torch.unsqueeze(self.audio_proj(audio_emb), 1)
        memory = torch.unsqueeze(self.text_proj(text_emb), 1)
        attended = self.attention(query, memory, memory)[0]
        return torch.squeeze(attended, 1)

def analyze_feature_contribution(model, features, labels):
    """Estimate how much each feature group matters by zeroing it out.

    For every group the corresponding columns of ``features`` are set to zero,
    the model is re-run, and the change in error relative to the unmodified
    input is recorded.

    Args:
        model: Callable taking the feature tensor and returning per-class scores.
        features: 2-D tensor whose columns follow the layout MFCC (0-19),
            spectral centroid (20), spectral contrast (21-27), chroma (28-39),
            tempo (40).
        labels: Tensor of integer class indices, one per row of ``features``.

    Returns:
        Dict mapping group name to ``error_without_group - baseline_error``.

    NOTE(review): the error is the mean squared difference between predicted
    and true class *indices*, which depends on how classes happen to be
    numbered -- confirm whether a misclassification rate was intended.
    """
    group_columns = {
        'mfcc': slice(0, 20),
        'spectral_centroid': slice(20, 21),
        'spectral_contrast': slice(21, 28),
        'chroma': slice(28, 40),
        'tempo': slice(40, 41)
    }

    def _index_error(inputs):
        # Mean squared gap between the arg-max class index and the true index.
        scores = model(inputs).cpu().numpy()
        predicted = np.argmax(scores, axis=1)
        return np.mean((predicted - labels.cpu().numpy()) ** 2)

    contributions = {}
    with torch.no_grad():
        baseline = _index_error(features)
        for group, columns in group_columns.items():
            ablated = features.clone()
            ablated[:, columns] = 0
            contributions[group] = _index_error(ablated) - baseline

    return contributions

In [3]:
# 2.1.2 Text-Based Genre Classification
class TextGenreClassifier(nn.Module):
    """Genre classifier built on sentence embeddings of lyrics.

    A SentenceTransformer produces the text embedding and a single linear
    layer maps it to genre logits.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', num_classes=10):
        super(TextGenreClassifier, self).__init__()
        self.model = SentenceTransformer(model_name, device=DEVICE)
        # Size the head from the encoder actually loaded, so a non-default
        # ``model_name`` works too; 384 (MiniLM-L6-v2) is the fallback when the
        # encoder does not report a dimension.
        embed_dim = self.model.get_sentence_embedding_dimension() or 384
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, lyrics):
        """Encode ``lyrics`` (a list of strings) and return genre logits."""
        embeddings = self.model.encode(lyrics, convert_to_tensor=True, device=DEVICE)
        return self.classifier(embeddings)

    def fine_tune(self, lyrics_data, labels, epochs=3):
        """Train the linear head on ``lyrics_data`` for ``epochs`` full-batch steps.

        Args:
            lyrics_data: List of lyric strings.
            labels: Tensor of class indices aligned with ``lyrics_data``.
            epochs: Number of optimisation steps over the whole data set.
        """
        self.train()
        optimizer = torch.optim.Adam(self.classifier.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()
        # Only the classifier's parameters are handed to the optimizer, so the
        # encoder output is the same every epoch: encode once, reuse it.
        embeddings = self.model.encode(lyrics_data, convert_to_tensor=True, device=DEVICE)
        for epoch in range(epochs):
            optimizer.zero_grad()
            outputs = self.classifier(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            print(f"Fine-tuning Epoch {epoch+1}, Loss: {loss.item():.4f}")

    def zero_shot_classify(self, lyrics, genre_list):
        """Return the entry of ``genre_list`` whose embedding is closest to ``lyrics``.

        Closeness is cosine similarity between the lyrics embedding and the
        embedding of each genre name; no training is involved.
        """
        lyrics_emb = self.model.encode([lyrics], convert_to_tensor=True, device=DEVICE)[0]
        genre_embs = self.model.encode(genre_list, convert_to_tensor=True, device=DEVICE)
        lyrics_vec = lyrics_emb.cpu().numpy()
        similarities = [1 - cosine(lyrics_vec, genre_emb.cpu().numpy()) for genre_emb in genre_embs]
        return genre_list[np.argmax(similarities)]

    def analyze_linguistic_patterns(self, lyrics_data, genres):
        """Return the mean lyrics embedding per genre.

        Args:
            lyrics_data: List of lyric strings.
            genres: Genre label for each entry of ``lyrics_data``.

        Returns:
            Dict mapping each distinct genre to the average embedding of its lyrics.
        """
        embeddings = self.model.encode(lyrics_data, convert_to_tensor=True, device=DEVICE).cpu().numpy()
        patterns = {}
        for genre in set(genres):
            genre_indices = [i for i, g in enumerate(genres) if g == genre]
            genre_embs = embeddings[genre_indices]
            patterns[genre] = np.mean(genre_embs, axis=0)
        return patterns

In [4]:
# 2.1.3 Hybrid Multi-Modal Classification
class HybridGenreClassifier(nn.Module):
    """Genre classifier that fuses audio features, lyrics and metadata text.

    Audio features are embedded by an MLP and attend over the lyrics embedding;
    the attended vector, the lyrics embedding and a metadata embedding are
    concatenated and classified by one linear layer.
    """

    def __init__(self, audio_dim, text_dim, num_genres):
        super().__init__()
        audio_hidden = 128
        self.audio_emb = AudioEmbedding(audio_dim, audio_hidden)
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
        self.attention = CrossModalAttention(audio_dim=audio_hidden, text_dim=text_dim, attn_dim=128)
        # Input width: 128 from attention + text_dim (lyrics) + text_dim (metadata).
        self.classifier = nn.Linear(128 + 2 * text_dim, num_genres)

    def forward(self, audio_features, lyrics, metadata):
        """Return genre logits.

        Args:
            audio_features: Tensor of audio feature vectors, one row per track.
            lyrics: List of lyric strings, one per track.
            metadata: Sequence of items whose first two elements are joined
                with a space and encoded as text.
        """
        encode = self.text_model.encode
        audio_emb = self.audio_emb(audio_features)
        text_emb = encode(lyrics, convert_to_tensor=True, device=DEVICE)
        metadata_text = [f"{m[0]} {m[1]}" for m in metadata]
        metadata_emb = encode(metadata_text, convert_to_tensor=True, device=DEVICE)
        fused = self.attention(audio_emb, text_emb)
        return self.classifier(torch.cat((fused, text_emb, metadata_emb), dim=-1))

    def get_confidence_scores(self, outputs):
        """Convert logits into class probabilities with a softmax over the last axis."""
        probabilities = torch.softmax(outputs, dim=-1)
        return probabilities

def compare_with_audio_only(hybrid_model, audio_only_model, test_data, lyrics, metadata, labels):
    """Score the hybrid model and an audio-only model on the same test data.

    Args:
        hybrid_model: Called as ``hybrid_model(test_data, lyrics, metadata)``.
        audio_only_model: Called as ``audio_only_model(test_data)``.
        test_data: Audio feature tensor shared by both models.
        lyrics: Lyrics passed to the hybrid model.
        metadata: Metadata passed to the hybrid model.
        labels: Tensor of true class indices.

    Returns:
        ``(hybrid_metrics, audio_metrics)``; each is the tuple returned by
        ``precision_recall_fscore_support`` with weighted averaging.
    """
    # Pure inference: skip autograd bookkeeping for both forward passes.
    with torch.no_grad():
        hybrid_preds = hybrid_model(test_data, lyrics, metadata)
        audio_preds = audio_only_model(test_data)
    y_true = labels.cpu().numpy()
    hybrid_metrics = precision_recall_fscore_support(y_true, torch.argmax(hybrid_preds, dim=1).cpu().numpy(), average='weighted')
    audio_metrics = precision_recall_fscore_support(y_true, torch.argmax(audio_preds, dim=1).cpu().numpy(), average='weighted')
    return hybrid_metrics, audio_metrics

In [5]:
# 2.2 Transformer-Based Audio Classification
class AudioSpectrogramTransformer(nn.Module):
    """Transformer encoder over non-overlapping spectrogram patches.

    A strided convolution turns the spectrogram into a sequence of patch
    embeddings, a transformer encoder processes the sequence, and the first
    token of the output is classified. There is no class token or positional
    embedding in this implementation.
    """

    def __init__(self, patch_size=16, in_channels=1, embed_dim=768, num_heads=12, num_layers=12, num_classes=10):
        super(AudioSpectrogramTransformer, self).__init__()
        self.patch_embedding = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
        # forward() feeds the encoder (batch, patches, embed) tensors, so the
        # layers must be batch-first; with the sequence-first default, attention
        # would run across the samples of a batch instead of across patches.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True),
            num_layers=num_layers
        )
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, spectrogram):
        """Return class logits for a batch of spectrograms.

        Args:
            spectrogram: Tensor of shape (batch, in_channels, height, width).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        patches = self.patch_embedding(spectrogram)       # (B, E, H', W')
        patches = patches.flatten(2).transpose(1, 2)       # (B, H'*W', E)
        transformer_output = self.transformer(patches)
        # Classify from the first patch token of every sample.
        return self.classifier(transformer_output[:, 0])

def visualize_attention_patterns(model, spectrogram):
    """Plot the attention weights of the model's last encoder layer.

    The patch embeddings are passed through every encoder layer except the
    last, so the plotted weights are computed from the input the last layer
    actually receives. The model is switched to evaluation mode for the
    computation (so dropout does not perturb the weights) and its previous
    mode is restored afterwards.

    Args:
        model: Object exposing ``patch_embedding`` and ``transformer.layers``
            as ``AudioSpectrogramTransformer`` does.
        spectrogram: Input tensor accepted by ``model.patch_embedding``.

    Side effects:
        Writes ``attention_patterns.png`` into ``OUTPUT_DIR``.

    NOTE(review): feeding the previous layer's output straight into
    ``self_attn`` matches post-norm encoder layers; if the layers are built
    with ``norm_first=True`` the first norm should be applied too -- confirm.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            tokens = model.patch_embedding(spectrogram).flatten(2).transpose(1, 2)
            layers = list(model.transformer.layers)
            for layer in layers[:-1]:
                tokens = layer(tokens)
            attention = layers[-1].self_attn(tokens, tokens, tokens)[1]
    finally:
        model.train(was_training)

    plt.figure(figsize=(10, 8))
    # Only the first attention map of the batch is drawn.
    sns.heatmap(attention[0].cpu().numpy(), cmap='viridis')
    plt.title('Attention Patterns')
    plt.savefig(os.path.join(OUTPUT_DIR, 'attention_patterns.png'))
    plt.close()

class CNNBaseline(nn.Module):
    """Two-block convolutional baseline for spectrogram classification.

    The linear head is sized for a 30x30 feature map, which is what a 128x128
    input produces. An adaptive average pool in front of the head resizes any
    other feature map to 30x30, so inputs of other sizes are accepted as well;
    for 128x128 inputs the pool leaves the feature map unchanged.
    """

    def __init__(self, num_classes=10):
        super(CNNBaseline, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 64, 3),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        # Parameter-free, so the state dict keeps the same keys as before.
        self.pool = nn.AdaptiveAvgPool2d((30, 30))
        self.fc = nn.Linear(128 * 30 * 30, num_classes)  # Adjusted for 128x128 input

    def forward(self, x):
        """Return class logits for a batch of single-channel spectrograms.

        Args:
            x: Tensor of shape (batch, 1, height, width).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        x = self.conv(x)
        x = self.pool(x)
        x = x.flatten(1)
        return self.fc(x)

In [6]:
# 3.1 Semantic Music Search
class MusicSearchSystem:
    """Free-text music search that scores tracks on lyrics, audio and metadata."""

    def __init__(self):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
        self.audio_extractor = AudioFeatureExtractor()
        # Query embeddings are produced on DEVICE, so the head has to live there
        # too; left on the CPU it fails as soon as DEVICE is a GPU.
        self.intent_classifier = nn.Linear(384, 5).to(DEVICE)  # 5 intent classes

    def process_query(self, query):
        """Return ``(intent, expanded_query)`` for a query string.

        ``intent`` is a softmax distribution over the 5 intent classes and
        ``expanded_query`` is the embedding returned by ``expand_query``.
        """
        query_emb = self.text_model.encode([query], convert_to_tensor=True, device=DEVICE)[0]
        intent = torch.softmax(self.intent_classifier(query_emb), dim=-1)
        expanded_query = self.expand_query(query)
        return intent, expanded_query

    def expand_query(self, query):
        """Return the mean embedding of two paraphrases of ``query``."""
        synonyms = self.text_model.encode([f'similar to {query}', f'like {query}'], convert_to_tensor=True, device=DEVICE)
        return synonyms.mean(dim=0)

    def multi_modal_search(self, query, audio_features, lyrics, metadata, df_metadata):
        """Return the ten best-matching rows of ``df_metadata`` for ``query``.

        Each track's score is ``0.4 * text + 0.4 * audio + 0.2 * metadata``,
        where every term is a cosine similarity to the query embedding.

        Args:
            query: Search string.
            audio_features: Per-track audio vectors, aligned with ``lyrics``.
            lyrics: List of lyric strings, one per track.
            metadata: Per-track items whose first two elements are encoded as text.
            df_metadata: DataFrame whose row order matches ``lyrics``.

        NOTE(review): the audio term compares the query's text embedding with
        the raw audio vector, which only works when both have the same length
        -- confirm how audio vectors are meant to be mapped into text space.
        """
        query_emb = self.text_model.encode([query], convert_to_tensor=True, device=DEVICE)[0].cpu().numpy()
        # Convert to NumPy once instead of once per track inside the loop.
        audio_embs = torch.tensor(audio_features, dtype=torch.float32, device=DEVICE).cpu().numpy()
        lyrics_embs = self.text_model.encode(lyrics, convert_to_tensor=True, device=DEVICE).cpu().numpy()
        scores = []
        for i in range(len(lyrics)):
            text_score = 1 - cosine(query_emb, lyrics_embs[i])
            audio_score = 1 - cosine(query_emb, audio_embs[i])
            metadata_score = self.metadata_similarity(query_emb, metadata[i])
            scores.append(0.4 * text_score + 0.4 * audio_score + 0.2 * metadata_score)
        top_indices = np.argsort(scores)[::-1][:10]
        return df_metadata.iloc[top_indices][['track_id', 'artist_name', 'title', 'genre']]

    def metadata_similarity(self, query_emb, metadata):
        """Return the cosine similarity between ``query_emb`` and the metadata text."""
        metadata_emb = self.text_model.encode([f"{metadata[0]} {metadata[1]}"], convert_to_tensor=True, device=DEVICE)[0].cpu().numpy()
        return 1 - cosine(query_emb, metadata_emb)

# 3.1.3 Content-Based Music Discovery
class MusicDiscovery:
    """Content-based tagging and playlist generation from audio and lyrics."""

    def __init__(self):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
        self.audio_model = AudioSpectrogramTransformer()
        self.audio_extractor = AudioFeatureExtractor()
        # The heads run on tensors created on DEVICE, so they are placed there too.
        self.mood_classifier = nn.Linear(384, 5).to(DEVICE)  # 5 mood classes
        # The audio heads take their input width from the first feature vector
        # they see, so they always agree with what the extractor produces.
        self.energy_predictor = nn.LazyLinear(1).to(DEVICE)
        self.danceability_predictor = nn.LazyLinear(1).to(DEVICE)

    def generate_tags(self, audio_path, lyrics):
        """Return mood, energy and danceability scores for one track.

        Args:
            audio_path: Path of the track's audio file.
            lyrics: Lyrics text of the track.

        Returns:
            Dict with NumPy arrays under 'mood' (5 softmax scores), 'energy' and
            'danceability' (one sigmoid score each).
        """
        audio_features = self.audio_extractor.extract_features(audio_path)
        audio_features = torch.tensor(audio_features, dtype=torch.float32, device=DEVICE)
        lyrics_emb = self.text_model.encode([lyrics], convert_to_tensor=True, device=DEVICE)[0]
        # Inference only: without no_grad the outputs would require grad and
        # the .numpy() conversions below would raise.
        with torch.no_grad():
            mood_scores = torch.softmax(self.mood_classifier(lyrics_emb), dim=-1)
            energy = torch.sigmoid(self.energy_predictor(audio_features))
            danceability = torch.sigmoid(self.danceability_predictor(audio_features))
        return {'mood': mood_scores.cpu().numpy(), 'energy': energy.cpu().numpy(), 'danceability': danceability.cpu().numpy()}

    def generate_playlist(self, seed_song, music_collection, df_metadata):
        """Return the ten rows of ``df_metadata`` most similar to ``seed_song``.

        Args:
            seed_song: Dict with 'audio_path' and 'lyrics' keys.
            music_collection: Sequence of such dicts, in the row order of ``df_metadata``.
            df_metadata: DataFrame describing the collection.
        """
        seed_emb = self.get_song_embedding(seed_song)
        similarities = [1 - cosine(seed_emb, self.get_song_embedding(song)) for song in music_collection]
        top_indices = np.argsort(similarities)[::-1][:10]
        return df_metadata.iloc[top_indices][['track_id', 'artist_name', 'title', 'genre']]

    def get_song_embedding(self, song):
        """Return the song's audio feature vector concatenated with its lyrics embedding."""
        audio_features = self.audio_extractor.extract_features(song['audio_path'])
        lyrics_emb = self.text_model.encode([song['lyrics']], convert_to_tensor=True, device=DEVICE)[0].cpu().numpy()
        return np.concatenate([audio_features, lyrics_emb])


In [7]:
# 4.1 Personalized Recommendation Engines
class MusicRecommender(nn.Module):
    """Scores every track for a given (audio, text) feature pair.

    Audio and text features each pass through a linear layer and ReLU, are
    concatenated, and are mapped to one sigmoid score per track.
    """

    def __init__(self, audio_dim, text_dim, num_tracks):
        super(MusicRecommender, self).__init__()
        self.audio_layer = nn.Linear(audio_dim, 128)
        self.text_layer = nn.Linear(text_dim, 128)
        self.user_model = nn.Linear(128, 128)
        self.item_model = nn.Linear(128, 128)
        self.fc = nn.Linear(128 * 2, num_tracks)

    def build_user_profile(self, listening_history, ratings):
        """Return a 128-dim user vector from rated listening history.

        The texts in ``listening_history`` are embedded, averaged with
        ``ratings`` as weights, projected by ``text_layer`` and passed through
        ``user_model``.

        Args:
            listening_history: List of strings describing what the user listened to.
            ratings: One weight per history entry.

        NOTE(review): projecting through ``text_layer`` assumes ``text_dim``
        equals the sentence encoder's embedding size -- confirm.
        """
        # The encoder is created on first use: __init__ never defined
        # ``text_model``, so accessing it unconditionally would raise.
        text_model = getattr(self, 'text_model', None)
        if text_model is None:
            text_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
            self.text_model = text_model
        history_emb = text_model.encode(listening_history, convert_to_tensor=True, device=DEVICE).cpu().numpy()
        weighted_emb = np.average(history_emb, weights=ratings, axis=0)
        profile = torch.tensor(weighted_emb, dtype=torch.float32, device=DEVICE)
        # user_model expects 128 inputs, so bring the text embedding down to
        # 128 the same way forward() treats text features.
        return self.user_model(torch.relu(self.text_layer(profile)))

    def forward(self, audio_features, text_features):
        """Return per-track scores in (0, 1), one row per input row."""
        audio_out = torch.relu(self.audio_layer(audio_features))
        text_out = torch.relu(self.text_layer(text_features))
        combined = torch.cat([audio_out, text_out], dim=1)
        return torch.sigmoid(self.fc(combined))

    def generate_explanation(self, user_id, item_id, df_metadata):
        """Return a one-sentence explanation naming the recommended track's genre.

        Args:
            user_id: Identifier inserted into the sentence.
            item_id: ``track_id`` of the recommended track; must be present in
                ``df_metadata`` (otherwise an IndexError is raised).
            df_metadata: DataFrame with 'track_id' and 'genre' columns.
        """
        # The sentence depends only on the track's genre, so no user profile is
        # built here (it was computed but never used, and required a 'rating'
        # column that the metadata table does not carry).
        item_idx = df_metadata[df_metadata['track_id'] == item_id].index[0]
        item_genre = df_metadata.loc[item_idx, 'genre']
        return f"Recommended for user {user_id} because of interest in {item_genre} and similar audio characteristics."

    def get_item_features(self, item_id):
        """Placeholder: return an all-zero 128-dim item vector on ``DEVICE``."""
        # Placeholder, to be replaced with actual feature extraction
        return torch.zeros(128, device=DEVICE)

class HybridRecommender(nn.Module):
    """Context-aware recommender combining audio and text features."""

    def __init__(self, audio_dim, text_dim, num_tracks):
        super(HybridRecommender, self).__init__()
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
        self.audio_layer = nn.Linear(audio_dim, 128)
        self.text_layer = nn.Linear(text_dim, 128)
        self.fc = nn.Linear(128 * 2, num_tracks)

    def forward(self, audio_features, text_features):
        """Return per-track scores in (0, 1), one row per input row."""
        audio_out = torch.relu(self.audio_layer(audio_features))
        text_out = torch.relu(self.text_layer(text_features))
        combined = torch.cat([audio_out, text_out], dim=1)
        return torch.sigmoid(self.fc(combined))

    def recommend(self, user_id, context, audio_features, text_features, df_metadata):
        """Return metadata rows for the top-scoring tracks given a context string.

        The context embedding is added to ``text_features`` before scoring; the
        ten highest-scoring track indices of every input row are looked up in
        ``df_metadata`` by position.

        Args:
            user_id: Not used by the current implementation.
            context: Free-text description of the listening context.
            audio_features: Audio feature tensor.
            text_features: Text feature tensor; must be addable to the context
                embedding.
            df_metadata: DataFrame whose row positions correspond to track indices.
        """
        context_emb = self.text_model.encode([context], convert_to_tensor=True, device=DEVICE)
        scores = self.forward(audio_features, text_features + context_emb)
        top_indices = torch.argsort(scores, dim=1, descending=True)[:, :10].cpu().numpy().flatten()
        return df_metadata.iloc[top_indices][['track_id', 'artist_name', 'title', 'genre']]

    def optimize_diversity(self, recommendations, df_metadata):
        """Thin out a genre-heavy recommendation list.

        If fewer than half as many distinct genres as rows are present, one
        row per genre (the first recommended for that genre) is returned from
        ``df_metadata``; otherwise ``recommendations`` is returned unchanged.

        Args:
            recommendations: DataFrame with a 'genre' column whose index labels
                refer to rows of ``df_metadata``.
            df_metadata: Full metadata table.
        """
        # Nothing to diversify, and the ratio below would divide by zero.
        if len(recommendations) == 0:
            return recommendations
        genre_counts = recommendations['genre'].value_counts()
        diversity_score = len(genre_counts) / len(recommendations)
        if diversity_score < 0.5:
            diverse_indices = []
            for genre in df_metadata['genre'].unique():
                genre_recs = recommendations[recommendations['genre'] == genre]
                if not genre_recs.empty:
                    diverse_indices.append(genre_recs.index[0])
            # These are index *labels*, so they must be resolved with .loc;
            # .iloc would treat them as row positions.
            return df_metadata.loc[diverse_indices]
        return recommendations

In [8]:
# 5.1 Comprehensive Evaluation Framework
class EvaluationFramework:
    """Metric helpers for classification, retrieval and recommendation."""

    def __init__(self):
        self.metrics = {}

    def evaluate_classification(self, y_true, y_pred, genres):
        """Compute weighted precision/recall/F1 and save a confusion matrix.

        Args:
            y_true: Ground-truth class labels.
            y_pred: Predicted class labels.
            genres: Tick labels for the confusion-matrix heatmap axes.

        Returns:
            Dict with keys ``precision``, ``recall`` and ``f1``.

        Side effects:
            Writes ``classification_confusion_matrix.png`` into ``OUTPUT_DIR``.
        """
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        cm = confusion_matrix(y_true, y_pred)
        # savefig does not create missing directories.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=genres, yticklabels=genres)
        plt.title('Confusion Matrix')
        plt.savefig(os.path.join(OUTPUT_DIR, 'classification_confusion_matrix.png'))
        plt.close()
        return {'precision': precision, 'recall': recall, 'f1': f1}

    def evaluate_retrieval(self, relevant_items, retrieved_items, k_values=(5, 10, 20)):
        """Compute P@k and R@k for each requested k, plus MAP and NDCG.

        Metrics are keyed by the *requested* k (``P@5``, ``R@5``, ...). When
        fewer than k items were retrieved, precision is computed over the
        items actually available, but the key keeps the requested k so that
        different cut-offs never overwrite each other.

        Args:
            relevant_items: Sequence of ground-truth relevant item ids.
            retrieved_items: Ranked sequence of retrieved item ids.
            k_values: Cut-offs to evaluate.

        Returns:
            Dict mapping metric name to value.
        """
        relevant_set = set(relevant_items)
        num_relevant = len(relevant_items)
        metrics = {}
        for k in k_values:
            cutoff = min(k, len(retrieved_items))
            hits = len(set(retrieved_items[:cutoff]) & relevant_set)
            metrics[f'P@{k}'] = hits / cutoff if cutoff > 0 else 0
            metrics[f'R@{k}'] = hits / num_relevant if num_relevant else 0
        metrics['MAP'] = self.calculate_map(relevant_items, retrieved_items)
        metrics['NDCG'] = self.calculate_ndcg(relevant_items, retrieved_items)
        return metrics

    def evaluate_recommendation(self, recommendations, user_interactions):
        """Compute CTR, genre diversity and novelty for a recommendation list.

        Args:
            recommendations: DataFrame with ``track_id`` and ``genre`` columns.
            user_interactions: Collection of track ids the user interacted with.

        Returns:
            Dict with ``CTR`` (share of recommended tracks the user interacted
            with), ``Diversity`` (distinct genres / recommendations) and
            ``Novelty`` (``1 - CTR``). All three are 0 for an empty list.
        """
        total = len(recommendations)
        if total == 0:
            return {'CTR': 0, 'Diversity': 0, 'Novelty': 0}
        # Build the lookup once instead of scanning a list per recommendation.
        interacted = set(user_interactions)
        hits = sum(1 for rec in recommendations['track_id'] if rec in interacted)
        ctr = hits / total
        diversity = len(recommendations['genre'].unique()) / total
        novelty = 1 - ctr
        return {'CTR': ctr, 'Diversity': diversity, 'Novelty': novelty}

    def calculate_map(self, relevant, retrieved):
        """Return the average precision of ``retrieved`` against ``relevant``.

        Precision is accumulated at the rank of each relevant item's first
        occurrence and divided by ``len(relevant)``. Repeated occurrences of
        an already-counted item are ignored so the score cannot exceed 1.
        Returns 0 when ``relevant`` is empty.
        """
        relevant_set = set(relevant)
        if not relevant_set:
            return 0
        counted = set()
        hits = 0
        precision_sum = 0
        for rank, item in enumerate(retrieved, start=1):
            if item in relevant_set and item not in counted:
                counted.add(item)
                hits += 1
                precision_sum += hits / rank
        return precision_sum / len(relevant)

    def calculate_ndcg(self, relevant, retrieved):
        """Return binary-relevance NDCG of ``retrieved`` against ``relevant``.

        Each relevant item contributes ``1 / log2(rank + 1)`` (rank starting
        at 1) at its first occurrence only; the ideal DCG assumes all
        ``len(relevant)`` items occupy the top ranks. Returns 0 when
        ``relevant`` is empty.
        """
        relevant_set = set(relevant)
        idcg = sum(1 / np.log2(i + 2) for i in range(len(relevant)))
        counted = set()
        dcg = 0
        for i, item in enumerate(retrieved):
            if item in relevant_set and item not in counted:
                counted.add(item)
                dcg += 1 / np.log2(i + 2)
        return dcg / idcg if idcg > 0 else 0

In [None]:
# Main Execution
def main():
    """Run the whole MIR pipeline end to end and print each stage's results.

    Stages, in order: dataset setup and loading, feature/label preparation,
    audio feature contribution analysis, text and hybrid genre
    classification, spectrogram-transformer classification, semantic search,
    content-based discovery, recommendation, and final retrieval and
    recommendation metrics. Returns ``None``; returns early when no metadata
    or features could be loaded.
    """
    # Setup dataset
    if not os.path.exists(DATA_PATH) or not os.path.exists(METADATA_PATH):
        print("Dataset not found. Setting up FMA dataset...")
        setup_fma_dataset()

    # Figures below are written with plt.savefig, which does not create
    # missing directories.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Load data
    df_metadata, df_features, lyrics_dict = load_fma_data(AUDIO_PATH, METADATA_PATH, ARTISTS_PATH, GENRES_PATH, LYRICS_PATH, TAGS_PATH)
    if df_metadata.empty or df_features.empty:
        print("Error: No valid data loaded.")
        return

    # Generate text embeddings
    text_embeddings = generate_text_embeddings(lyrics_dict) if lyrics_dict else {tid: np.zeros(384) for tid in df_metadata['track_id']}
    audio_features = df_features[[col for col in df_features.columns if col != 'track_id']].values
    audio_dim = audio_features.shape[1]
    text_features = np.array([text_embeddings.get(tid, np.zeros(384)) for tid in df_metadata['track_id']])
    genres = df_metadata['genre'].unique()
    genre_to_idx = {g: i for i, g in enumerate(genres)}
    labels = df_metadata['genre'].map(genre_to_idx).values

    # Load ratings
    num_tracks = len(df_metadata)
    track_id_to_idx = {tid: idx for idx, tid in enumerate(df_metadata['track_id'])}
    ratings_matrix = np.zeros((num_tracks, num_tracks))
    # Start from an empty frame so the later stages that read user_data still
    # work when no ratings file is present.
    user_data = pd.DataFrame({'track_id': pd.Series(dtype=str), 'rating': pd.Series(dtype=float)})
    if os.path.exists(USER_DATA_PATH):
        user_data = pd.read_csv(USER_DATA_PATH)
        user_data['track_id'] = user_data['track_id'].astype(str).str.zfill(6)
        user_data = user_data[user_data['track_id'].isin(df_metadata['track_id'])]
        for _, row in user_data.iterrows():
            tid = row['track_id']
            if tid in track_id_to_idx:
                # Ratings are stored on the diagonal, one entry per track.
                ratings_matrix[track_id_to_idx[tid], track_id_to_idx[tid]] = row['rating']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        np.hstack([audio_features, text_features]), labels, test_size=0.2, random_state=42
    )
    X_train_rec, X_test_rec, y_train_rec, y_test_rec = train_test_split(
        np.hstack([audio_features, text_features]), ratings_matrix, test_size=0.2, random_state=42
    )

    # Classification data loaders
    train_dataset_cls = torch.utils.data.TensorDataset(
        torch.tensor(X_train[:, :audio_dim], dtype=torch.float32),
        torch.tensor(X_train[:, audio_dim:], dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.long)
    )
    train_loader_cls = torch.utils.data.DataLoader(train_dataset_cls, batch_size=BATCH_SIZE, shuffle=True)
    test_dataset_cls = torch.utils.data.TensorDataset(
        torch.tensor(X_test[:, :audio_dim], dtype=torch.float32),
        torch.tensor(X_test[:, audio_dim:], dtype=torch.float32),
        torch.tensor(y_test, dtype=torch.long)
    )
    test_loader_cls = torch.utils.data.DataLoader(test_dataset_cls, batch_size=BATCH_SIZE)

    # Recommendation data loaders
    train_dataset_rec = torch.utils.data.TensorDataset(
        torch.tensor(X_train_rec[:, :audio_dim], dtype=torch.float32),
        torch.tensor(X_train_rec[:, audio_dim:], dtype=torch.float32),
        torch.tensor(y_train_rec, dtype=torch.float32)
    )
    train_loader_rec = torch.utils.data.DataLoader(train_dataset_rec, batch_size=BATCH_SIZE, shuffle=True)
    test_dataset_rec = torch.utils.data.TensorDataset(
        torch.tensor(X_test_rec[:, :audio_dim], dtype=torch.float32),
        torch.tensor(X_test_rec[:, audio_dim:], dtype=torch.float32),
        torch.tensor(y_test_rec, dtype=torch.float32)
    )
    test_loader_rec = torch.utils.data.DataLoader(test_dataset_rec, batch_size=BATCH_SIZE)

    # 2.1.1: Audio Feature Contribution Analysis
    audio_model = AudioEmbedding(input_dim=audio_dim).to(DEVICE)
    features_tensor = torch.tensor(audio_features, dtype=torch.float32, device=DEVICE)
    labels_tensor = torch.tensor(labels, dtype=torch.long, device=DEVICE)
    contributions = analyze_feature_contribution(audio_model, features_tensor, labels_tensor)
    print("\nFeature Contributions:")
    print(contributions)

    # 2.1.2: Text-Based Genre Classification
    text_classifier = TextGenreClassifier(num_classes=len(genres)).to(DEVICE)
    lyrics_list = list(lyrics_dict.values()) if lyrics_dict else ['music'] * len(df_metadata)
    # NOTE(review): lyrics_list is sliced from the front and is not matched to
    # the shuffled training rows behind y_train -- confirm this is acceptable.
    text_classifier.fine_tune(lyrics_list[:len(X_train)], torch.tensor(y_train, dtype=torch.long, device=DEVICE))
    zero_shot_result = text_classifier.zero_shot_classify("upbeat rock song", genres.tolist())
    print("\nZero-Shot Classification Result:", zero_shot_result)
    patterns = text_classifier.analyze_linguistic_patterns(lyrics_list, df_metadata['genre'].tolist())
    print("\nLinguistic Patterns:", {k: np.mean(v) for k, v in patterns.items()})

    # 2.1.3: Hybrid Genre Classification
    classifier = HybridGenreClassifier(audio_dim=audio_dim, text_dim=384, num_genres=len(genres)).to(DEVICE)
    criterion_cls = nn.CrossEntropyLoss()
    optimizer_cls = torch.optim.Adam(classifier.parameters(), lr=0.001)
    for epoch in range(NUM_EPOCHS):
        classifier.train()
        total_loss = 0
        # Batch variables get their own names: reusing `audio_features` and
        # `labels` here would clobber the full arrays needed by later stages.
        for batch_audio, _, batch_labels in train_loader_cls:  # Ignore text_features from loader
            batch_audio, batch_labels = batch_audio.to(DEVICE), batch_labels.to(DEVICE)
            # NOTE(review): lyrics/metadata are taken from the head of the
            # lists, not from the rows in this batch (simplified alignment).
            lyrics_batch = lyrics_list[:len(batch_audio)]
            metadata_batch = df_metadata[['artist_name', 'title']].iloc[:len(batch_audio)].values
            optimizer_cls.zero_grad()
            outputs = classifier(batch_audio, lyrics_batch, metadata_batch)
            loss = criterion_cls(outputs, batch_labels)
            loss.backward()
            optimizer_cls.step()
            total_loss += loss.item()
        print(f"Hybrid Classification Epoch {epoch+1}, Loss: {total_loss/len(train_loader_cls):.4f}")

    classifier.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch_audio, _, batch_labels in test_loader_cls:
            batch_audio, batch_labels = batch_audio.to(DEVICE), batch_labels.to(DEVICE)
            lyrics_batch = lyrics_list[:len(batch_audio)]
            metadata_batch = df_metadata[['artist_name', 'title']].iloc[:len(batch_audio)].values
            outputs = classifier(batch_audio, lyrics_batch, metadata_batch)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(batch_labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    precision_cls, recall_cls, f1_cls, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    print(f"Hybrid Classification Precision: {precision_cls:.4f}, Recall: {recall_cls:.4f}, F1: {f1_cls:.4f}")
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=genres, yticklabels=genres)
    plt.title('Hybrid Classification Confusion Matrix')
    plt.savefig(os.path.join(OUTPUT_DIR, 'hybrid_confusion_matrix.png'))
    plt.close()

    # Instantiate the GenreClassifier
    audio_only_model = GenreClassifier(
        audio_dim=audio_dim,
        text_dim=384,  # Still required, but will be filled with zeros
        num_classes=len(genres)
    ).to(DEVICE)

    # Prepare dummy lyrics (list of strings)
    dummy_lyrics = ["music"] * len(X_test)

    # Compare hybrid vs audio-only
    hybrid_metrics, audio_metrics = compare_with_audio_only(
        classifier,
        audio_only_model,
        torch.tensor(X_test[:, :audio_dim], dtype=torch.float32, device=DEVICE),
        dummy_lyrics,
        df_metadata[['artist_name', 'title']].iloc[:len(X_test)].values,
        torch.tensor(y_test, dtype=torch.long, device=DEVICE)
    )

    print(f"\nHybrid vs Audio-Only Metrics: Hybrid={hybrid_metrics}, Audio-Only={audio_metrics}")

    # 2.2: Transformer-Based Audio Classification
    def generate_spectrograms(audio_path, track_ids):
        """Return one 128x128 dB mel spectrogram per track id.

        Spectrograms are truncated or zero-padded to 128 frames; a track whose
        ``.mp3`` file is missing gets an all-zero array.
        """
        spectrograms = []
        for track_id in track_ids:
            audio_file = os.path.join(audio_path, f"{track_id}.mp3")
            if os.path.exists(audio_file):
                y, sr = librosa.load(audio_file, sr=22050)
                spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
                spec_db = librosa.power_to_db(spec, ref=np.max)
                if spec_db.shape[1] > 128:
                    spec_db = spec_db[:, :128]
                else:
                    spec_db = np.pad(spec_db, ((0, 0), (0, 128 - spec_db.shape[1])), mode='constant')
                spectrograms.append(spec_db)
            else:
                spectrograms.append(np.zeros((128, 128)))
        return np.array(spectrograms)

    spectrograms = generate_spectrograms(AUDIO_PATH, df_metadata['track_id'].tolist())
    spectrograms = spectrograms[:, np.newaxis, :, :]  # Add channel dimension
    X_train_spec, X_test_spec, y_train_spec, y_test_spec = train_test_split(
        spectrograms, labels, test_size=0.2, random_state=42
    )
    train_dataset_spec = torch.utils.data.TensorDataset(
        torch.tensor(X_train_spec, dtype=torch.float32), torch.tensor(y_train_spec, dtype=torch.long)
    )
    train_loader_spec = torch.utils.data.DataLoader(train_dataset_spec, batch_size=BATCH_SIZE, shuffle=True)
    test_dataset_spec = torch.utils.data.TensorDataset(
        torch.tensor(X_test_spec, dtype=torch.float32), torch.tensor(y_test_spec, dtype=torch.long)
    )
    test_loader_spec = torch.utils.data.DataLoader(test_dataset_spec, batch_size=BATCH_SIZE)

    ast = AudioSpectrogramTransformer(num_classes=len(genres)).to(DEVICE)
    criterion_ast = nn.CrossEntropyLoss()
    optimizer_ast = torch.optim.Adam(ast.parameters(), lr=0.001)
    for epoch in range(NUM_EPOCHS):
        ast.train()
        total_loss = 0
        for specs, spec_labels in train_loader_spec:
            specs, spec_labels = specs.to(DEVICE), spec_labels.to(DEVICE)
            optimizer_ast.zero_grad()
            outputs = ast(specs)
            loss = criterion_ast(outputs, spec_labels)
            loss.backward()
            optimizer_ast.step()
            total_loss += loss.item()
        print(f"AST Epoch {epoch+1}, Loss: {total_loss/len(train_loader_spec):.4f}")

    ast.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for specs, spec_labels in test_loader_spec:
            specs, spec_labels = specs.to(DEVICE), spec_labels.to(DEVICE)
            outputs = ast(specs)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(spec_labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    ast_metrics = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    print(f"AST Precision: {ast_metrics[0]:.4f}, Recall: {ast_metrics[1]:.4f}, F1: {ast_metrics[2]:.4f}")
    visualize_attention_patterns(ast, torch.tensor(X_test_spec[:1], dtype=torch.float32, device=DEVICE))

    # 3.1: Semantic Music Search
    search_system = MusicSearchSystem()
    query = "upbeat rock songs"
    # Use lyrics_list so the 'music' fallback also applies when no lyrics were loaded.
    results = search_system.multi_modal_search(query, audio_features, lyrics_list, df_metadata[['artist_name', 'title']].values, df_metadata)
    print("\nSearch Results:")
    print(results)

    # 3.1.3: Content-Based Discovery
    discovery = MusicDiscovery()
    sample_audio = os.path.join(AUDIO_PATH, df_metadata['track_id'].iloc[0] + '.mp3')
    sample_lyrics = lyrics_dict.get(df_metadata['track_id'].iloc[0], 'music')
    tags = discovery.generate_tags(sample_audio, sample_lyrics)
    print("\nGenerated Tags:")
    print(tags)
    playlist = discovery.generate_playlist({'audio_path': sample_audio, 'lyrics': sample_lyrics}, df_metadata.to_dict('records'), df_metadata)
    print("\nGenerated Playlist:")
    print(playlist)

    # 4.1: Recommendation
    recommender = HybridRecommender(audio_dim=audio_dim, text_dim=384, num_tracks=num_tracks).to(DEVICE)
    criterion_rec = nn.BCELoss()
    optimizer_rec = torch.optim.Adam(recommender.parameters(), lr=0.001)
    for epoch in range(NUM_EPOCHS):
        loss_rec = train_recommender(recommender, train_loader_rec, criterion_rec, optimizer_rec)
        print(f"Recommendation Epoch {epoch+1}, Loss: {loss_rec:.4f}")

    precision_rec = evaluate_recommender(recommender, test_loader_rec)
    print(f"Recommendation Precision@10: {precision_rec:.4f}")
    recommendations = recommender.recommend('user_001', 'upbeat', torch.tensor(audio_features, dtype=torch.float32, device=DEVICE),
                                          torch.tensor(text_features, dtype=torch.float32, device=DEVICE), df_metadata)
    recommendations = recommender.optimize_diversity(recommendations, df_metadata)
    print("\nRecommendations:")
    print(recommendations)

    # Generate explanation for a sample recommendation
    music_recommender = MusicRecommender(audio_dim=audio_dim, text_dim=384, num_tracks=num_tracks).to(DEVICE)
    sample_item_id = recommendations['track_id'].iloc[0]
    explanation = music_recommender.generate_explanation('user_001', sample_item_id, pd.concat([df_metadata, user_data[['rating']]], axis=1))
    print("\nRecommendation Explanation:")
    print(explanation)

    # 5.1: Evaluation
    evaluator = EvaluationFramework()
    retrieval_metrics = evaluator.evaluate_retrieval(df_metadata[df_metadata['genre'] == 'Rock']['track_id'].tolist()[:10], results['track_id'].tolist())
    print("\nRetrieval Metrics:")
    print(retrieval_metrics)
    rec_metrics = evaluator.evaluate_recommendation(recommendations, user_data[user_data['rating'] > 0.5]['track_id'].tolist())
    print("\nRecommendation Metrics:")
    print(rec_metrics)

# Run the full pipeline only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

Dataset not found. Setting up FMA dataset...
Cloning FMA repository...
Downloading fma_small.zip...
