<a href="https://colab.research.google.com/github/Viditk07-Bits/AudioAnalytics_S2-24_AIMLCZG527/blob/main/AA_Assignment2_Final_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Music Information Retrieval System

## Assignment Objective
This assignment implements a comprehensive Music Information Retrieval (MIR) system using Large Language Models (LLMs) and deep learning techniques. It includes music recommendation, genre classification, and semantic search applications, combining audio analysis with natural language processing.

## Dataset Setup
Using the Free Music Archive (FMA) dataset with audio files, metadata, and synthetic user data.

In [8]:
import subprocess
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor, Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizer, GPT2LMHeadModel, GPT2Tokenizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, average_precision_score, ndcg_score
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import sentence_bleu
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cosine
import os
from pathlib import Path
import librosa
import logging
import random
from collections import Counter
from google.colab import drive
from datasets import Dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from multiprocessing import Pool
import faiss
from imblearn.over_sampling import SMOTE
import time
import streamlit as st
import unittest
import configparser
import warnings
import logging.handlers
import timeout_decorator

# Install required packages
required_packages = ['faiss-cpu', 'transformers', 'sentence-transformers', 'tqdm', 'imblearn', 'librosa', 'nltk', 'matplotlib', 'seaborn', 'datasets', 'streamlit', 'scikit-learn', 'numpy', 'torch', 'timeout-decorator']
for pkg in required_packages:
    try:
        __import__(pkg.replace('-', '_'))
    except ImportError:
        print(f"Installing {pkg}...")
        try:
            subprocess.run(['pip', 'install', pkg, '--no-cache-dir'], check=True, capture_output=True, text=True)
            print(f"Successfully installed {pkg}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to install {pkg}: {e.stderr}")
            raise

# Download NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Setup logging with rotation
log_dir = "/content/logs"
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.handlers.RotatingFileHandler(os.path.join(log_dir, 'music_recommender.log'), maxBytes=1000000, backupCount=5),
        logging.StreamHandler()
    ]
)

# Constants
DATA_PATH = "/content/drive/MyDrive/fma_metadata"
METADATA_PATH = os.path.join(DATA_PATH, "tracks.csv")
ARTISTS_PATH = os.path.join(DATA_PATH, "artists.csv")
GENRES_PATH = os.path.join(DATA_PATH, "genres.csv")
LYRICS_PATH = "/content/fma/lyrics"
USER_DATA_PATH = "/content/fma/user_data/ratings.csv"
TAGS_PATH = "/content/fma/descriptions/tags.csv"
OUTPUT_DIR = "/content/outputs"
TEMP_DIR = "/content/fma"
NUM_EPOCHS_REC = 5
NUM_EPOCHS_CLS = 10
BATCH_SIZE = 16
MAX_TRACKS = 100
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TIMEOUT_SECONDS = 3600
TEXT_EMBEDDING_DIM = 384
FMA_BASE_DIR = "/content/drive/MyDrive/fma_small/"
FMA_AUDIO_DIRS = [str(p) for p in Path(FMA_BASE_DIR).glob("*") if p.is_dir()]
if not FMA_AUDIO_DIRS:
    logging.warning("No subdirectories found in %s", FMA_BASE_DIR)
    print(f"Warning: No subdirectories found in {FMA_BASE_DIR}")

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Genre-specific keywords
GENRE_KEYWORDS = {
    'Rock': ['electric guitar wails', 'rebellious spirit soars', 'grunge heart pounds', 'classic riffs ignite', 'indie soul rebels', 'punk fire explodes', 'rock anthem roars'],
    'Pop': ['infectious hooks dance', 'neon lights pulse', 'melodic dreams soar', 'upbeat rhythm shines', 'love story sparkles', 'dancefloor beats throb', 'pop fever rises'],
    'Jazz': ['saxophone weaves magic', 'improvised notes flow', 'bluesy soul swings', 'smooth grooves linger', 'jazz night whispers', 'rhythmic scat hums', 'cool vibes drift'],
    'Classical': ['orchestral swells rise', 'violin sings softly', 'piano echoes grace', 'symphonic waves crash', 'baroque harmony soars', 'elegant strings weave', 'timeless beauty unfolds'],
    'Hip-Hop': ['heavy beats drop hard', 'sharp rhymes cut deep', 'street stories unfold', 'flow rides the rhythm', 'urban pulse vibrates', 'mic drops with swagger', 'hip-hop reigns supreme'],
    'Electronic': ['synth pulses glow', 'techno beats surge', 'ambient waves drift', 'EDM sparks the night', 'futuristic sounds hum', 'electro vibes ignite', 'digital dreams pulse'],
    'Folk': ['acoustic chords strum', 'heartfelt tales weave', 'rustic paths wander', 'folk roots run deep', 'gentle melodies soothe', 'campfire stories sing', 'tradition lives on'],
    'Blues': ['guitar wails with soul', 'heartache spills over', 'raw blues cry out', 'delta notes resonate', 'mournful chords linger', 'blues spirit endures', 'emotional strings weep'],
    'Country': ['banjo twangs with pride', 'heartland stories sing', 'cowboy boots stomp', 'rural roads ramble', 'love songs ride free', 'country heart beats strong', 'honky-tonk nights shine'],
    'Reggae': ['rasta riddims sway', 'island vibes chill', 'roots reggae grooves', 'one love unites all', 'skank beat lifts high', 'irie spirit flows', 'dreadlocks dance free'],
    'International': ['world rhythms blend', 'exotic melodies soar', 'cultural beats pulse', 'global sounds unite', 'traditional chants echo', 'fusion vibes transcend', 'earth’s heartbeat sings'],
    'Instrumental': ['ambient chords float', 'strings weave dreams', 'piano paints silence', 'orchestral tides rise', 'melody speaks alone', 'instrumental soul soars', 'soundscapes breathe life'],
    'Experimental': ['avant-garde sounds twist', 'abstract beats morph', 'sonic boundaries break', 'unorthodox rhythms pulse', 'experimental vibes soar', 'sound art redefines', 'future notes unfold']
}

@timeout_decorator.timeout(30, timeout_exception=TimeoutError)
def generate_lyrics(args):
    """Generate creative lyrics using GPT-2 for a given track and genre."""
    track_id, genre, lyrics_path, tokenizer, model = args
    try:
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        keywords = random.sample(GENRE_KEYWORDS.get(genre, ['music vibes']), min(3, len(GENRE_KEYWORDS.get(genre, ['music vibes']))))
        prompt = f"Create {genre} lyrics with vivid imagery and emotional depth, inspired by: {', '.join(keywords)}. Avoid repeating the prompt words."
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(DEVICE)
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=150,
            num_return_sequences=1,
            no_repeat_ngram_size=3,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.9,
            pad_token_id=tokenizer.pad_token_id
        )
        lyrics = tokenizer.decode(outputs[0], skip_special_tokens=True)
        lyrics = lyrics.replace(prompt, "").strip()
    except TimeoutError:
        logging.warning("Lyrics generation timed out for track %s", track_id)
        lyrics = f"{genre} lyrics placeholder"
    except Exception as e:
        logging.warning("Lyrics generation failed for track %s: %s", track_id, str(e))
        lyrics = f"{genre} lyrics placeholder"
    os.makedirs(lyrics_path, exist_ok=True)
    with open(os.path.join(lyrics_path, f"{track_id}.txt"), 'w', encoding='utf-8') as f:
        f.write(lyrics)
    return track_id

def evaluate_lyrics_quality(lyrics_dict, reference_lyrics=None):
    """Evaluate generated lyrics quality using BLEU score."""
    if not reference_lyrics:
        logging.warning("No reference lyrics provided for BLEU evaluation; returning 0.0")
        return 0.0
    bleu_scores = []
    for track_id, generated in lyrics_dict.items():
        reference = reference_lyrics.get(track_id, '')
        if reference:
            try:
                score = sentence_bleu([reference.split()], generated.split(), weights=(0.5, 0.5))
                bleu_scores.append(score)
            except Exception as e:
                logging.warning("BLEU score calculation failed for track %s: %s", track_id, str(e))
    return np.mean(bleu_scores) if bleu_scores else 0.0

def extract_audio_features(audio_path):
    """Extract audio features using Librosa or generate placeholder features."""
    try:
        if not audio_path or not os.path.exists(audio_path) or os.path.getsize(audio_path) < 100:
            return np.random.randn(26) * 0.1
        y, sr = librosa.load(audio_path, sr=22050)
        if len(y) == 0:
            return np.random.randn(26) * 0.1
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        tempo = librosa.feature.tempo(y=y, sr=sr)
        tempo = float(tempo[0]) if isinstance(tempo, np.ndarray) else float(tempo)
        features = np.concatenate([
            np.mean(mfccs, axis=1),
            np.mean(chroma, axis=1),
            np.mean(spectral_centroid, axis=1),
            [tempo]
        ])
        return features
    except Exception as e:
        logging.warning("Error processing %s: %s", audio_path, str(e))
        return np.random.randn(26) * 0.1

def load_fma_data(audio_dirs, metadata_path, artists_path, genres_path, lyrics_path, tags_path):
    """Load FMA dataset with audio files from specified directories."""
    logging.info("Loading FMA data...")
    print("Loading FMA data...")
    mp3_files = set()
    for audio_dir in audio_dirs:
        if os.path.exists(audio_dir):
            files = [f for f in os.listdir(audio_dir) if f.endswith('.mp3')]
            mp3_files.update(f.replace('.mp3', '') for f in files)
    if not mp3_files:
        logging.warning("No MP3 files found in audio directories: %s", audio_dirs)
        print(f"Warning: No MP3 files found in {audio_dirs}")

    if not os.path.exists(metadata_path):
        logging.error("Metadata file not found at %s", metadata_path)
        raise FileNotFoundError(f"Metadata file not found at {metadata_path}")

    # Load metadata with multi-level headers, skipping malformed first row
    try:
        df_metadata = pd.read_csv(metadata_path, header=[0, 1], skiprows=1, low_memory=False)
    except Exception as e:
        logging.error("Failed to load metadata: %s", str(e))
        print(f"Failed to load metadata: {str(e)}")
        raise

    # Flatten multi-level columns
    df_metadata.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df_metadata.columns]

    print("\nAvailable columns in tracks.csv:")
    print(df_metadata.columns.tolist())
    print("\nSample data (first 2 rows):")
    print(df_metadata.head(2).to_string())

    # Define possible column names for required fields
    possible_cols = {
        'track_id': ['Unnamed: 0_level_0_track_id', 'Unnamed: 0_level_0_Unnamed: 0_level_1', 'track_id', 'track.id', 'id', 'trackid', 'track'],
        'title': ['title_Unnamed: 52_level_1', 'track_title', 'title', 'track.name', 'name', 'song_title'],
        'artist_id': ['id_Unnamed: 21_level_1', 'artist_id', 'artist.id', 'artist', 'artistid'],
        'genre_top': ['genre_top_Unnamed: 40_level_1', 'track_genre_top', 'genre_top', 'track_genres', 'genres', 'genre', 'track_genre']
    }

    # Find matching columns
    selected_cols = {}
    for key, candidates in possible_cols.items():
        for candidate in candidates:
            matches = [col for col in df_metadata.columns if candidate.lower() in col.lower()]
            if matches:
                selected_cols[key] = matches[0]
                break
        if key not in selected_cols:
            logging.warning("No matching column for %s. Candidates: %s", key, candidates)
            print(f"Warning: No matching column for {key}. Candidates: {candidates}")
            if key == 'genre_top':
                df_metadata['genre'] = 'Unknown'  # Fallback for missing genre
                selected_cols[key] = 'genre'
            elif key == 'title':
                df_metadata['title'] = df_metadata[selected_cols.get('track_id', 'Unnamed: 0_level_0_track_id')].apply(lambda x: f"Track_{x}")
                selected_cols[key] = 'title'
            elif key == 'artist_id':
                df_metadata['artist_id'] = df_metadata[selected_cols.get('track_id', 'Unnamed: 0_level_0_track_id')].apply(lambda x: f"Artist_{x}")
                selected_cols[key] = 'artist_id'
            else:
                logging.error("No matching column found for %s", key)
                print(f"Error: No matching column for {key}. Available columns: {df_metadata.columns.tolist()}")
                raise ValueError(f"No matching column found for {key}")

    # Log selected columns
    logging.info("Selected columns: %s", selected_cols)
    print(f"Selected columns: {selected_cols}")

    # Validate track_id column
    track_id_col = selected_cols['track_id']
    if df_metadata[track_id_col].isnull().all() or df_metadata[track_id_col].eq('track_id').any():
        logging.error("Invalid track_id column: %s contains all nulls or header value", track_id_col)
        print(f"Error: Invalid track_id column: {track_id_col}")
        raise ValueError(f"Invalid track_id column: {track_id_col}")

    # Validate artist_id column
    artist_id_col = selected_cols['artist_id']
    if df_metadata[artist_id_col].isnull().all():
        logging.warning("Artist_id column %s contains all nulls. Assigning fallback values.", artist_id_col)
        print(f"Warning: Artist_id column {artist_id_col} contains all nulls. Assigning fallback values.")
        df_metadata[artist_id_col] = df_metadata[track_id_col].apply(lambda x: f"Artist_{x}")
        selected_cols['artist_id'] = artist_id_col

    # Select and rename columns
    df_metadata = df_metadata[list(selected_cols.values())].dropna(subset=[selected_cols['track_id']])
    df_metadata = df_metadata.rename(columns={
        selected_cols['track_id']: 'track_id',
        selected_cols['title']: 'title',
        selected_cols['artist_id']: 'artist_id',
        selected_cols['genre_top']: 'genre'
    })

    # Standardize track_id and artist_id
    df_metadata['track_id'] = df_metadata['track_id'].astype(str).str.zfill(6)
    df_metadata['artist_id'] = df_metadata['artist_id'].astype(str).str.zfill(6)

    # Filter tracks with available audio files
    df_metadata = df_metadata[df_metadata['track_id'].isin(mp3_files)]
    if df_metadata.empty:
        logging.warning("No tracks found with audio files. Using all metadata tracks up to MAX_TRACKS.")
        print("Warning: No tracks found with audio files. Using all metadata tracks up to MAX_TRACKS.")
        df_metadata = pd.read_csv(metadata_path, header=[0, 1], skiprows=1, low_memory=False)
        df_metadata.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df_metadata.columns]
        df_metadata = df_metadata[list(selected_cols.values())].dropna(subset=[selected_cols['track_id']])
        df_metadata = df_metadata.rename(columns={
            selected_cols['track_id']: 'track_id',
            selected_cols['title']: 'title',
            selected_cols['artist_id']: 'artist_id',
            selected_cols['genre_top']: 'genre'
        })
        df_metadata['track_id'] = df_metadata['track_id'].astype(str).str.zfill(6)
        df_metadata['artist_id'] = df_metadata['artist_id'].astype(str).str.zfill(6)
        df_metadata = df_metadata.head(MAX_TRACKS)

    if len(df_metadata) > MAX_TRACKS:
        logging.info("Limiting to %d tracks", MAX_TRACKS)
        df_metadata = df_metadata.head(MAX_TRACKS)

    # Load artists
    if os.path.exists(artists_path):
        df_artists = pd.read_csv(artists_path, dtype={'artist_id': str})
        df_artists['artist_id'] = df_artists['artist_id'].str.zfill(6)
    else:
        df_artists = pd.DataFrame({
            'artist_id': df_metadata['artist_id'].unique(),
            'artist_name': [f"Artist_{i}" for i in range(1, len(df_metadata['artist_id'].unique()) + 1)]
        })
    df_metadata = pd.merge(df_metadata, df_artists, on='artist_id', how='left')

    # Load genres
    if os.path.exists(genres_path):
        try:
            df_genres = pd.read_csv(genres_path, dtype={'genre_id': str})
            # Check if 'genre_name' exists in genres.csv
            if 'genre_name' not in df_genres.columns:
                logging.warning("genres.csv does not contain 'genre_name' column. Creating fallback genres DataFrame.")
                print("Warning: genres.csv does not contain 'genre_name' column. Creating fallback genres DataFrame.")
                unique_genres = df_metadata['genre'].dropna().unique()
                df_genres = pd.DataFrame({
                    'genre_id': range(1, len(unique_genres) + 1),
                    'genre_name': unique_genres
                })
            else:
                logging.info("genres.csv loaded successfully with columns: %s", df_genres.columns.tolist())
                print(f"genres.csv loaded successfully with columns: {df_genres.columns.tolist()}")
        except Exception as e:
            logging.error("Failed to load genres.csv: %s", str(e))
            print(f"Error loading genres.csv: {str(e)}")
            unique_genres = df_metadata['genre'].dropna().unique()
            df_genres = pd.DataFrame({
                'genre_id': range(1, len(unique_genres) + 1),
                'genre_name': unique_genres
            })
    else:
        logging.warning("genres.csv not found at %s. Creating fallback genres DataFrame.", genres_path)
        print(f"genres.csv not found at {genres_path}. Creating fallback genres DataFrame.")
        unique_genres = df_metadata['genre'].dropna().unique()
        df_genres = pd.DataFrame({
            'genre_id': range(1, len(unique_genres) + 1),
            'genre_name': unique_genres
        })

    # Log genre data for debugging
    logging.info("Genres DataFrame head:\n%s", df_genres.head().to_string())
    logging.info("Unique genres in metadata: %s", df_metadata['genre'].dropna().unique().tolist())
    print("Genres DataFrame head:\n", df_genres.head().to_string())
    print("Unique genres in metadata:", df_metadata['genre'].dropna().unique().tolist())

    # Merge metadata with genres
    try:
        df_metadata = pd.merge(df_metadata, df_genres, left_on='genre', right_on='genre_name', how='left')
        logging.info("Merged metadata with genres successfully.")
        print("Merged metadata with genres successfully.")
    except Exception as e:
        logging.error("Failed to merge genres with metadata: %s", str(e))
        print(f"Error merging genres with metadata: {str(e)}")
        # Fallback: Assign genre_id based on genre column
        genre_to_id = {g: i + 1 for i, g in enumerate(df_genres['genre_name'])}
        df_metadata['genre_id'] = df_metadata['genre'].map(genre_to_id).fillna(0).astype(int)

    # Handle missing genre_id
    df_metadata['genre_id'] = df_metadata['genre_id'].fillna(0).astype(int)

    # Select final columns
    df_metadata = df_metadata[['track_id', 'artist_name', 'title', 'genre', 'genre_id']].dropna(subset=['genre'])

    # Extract audio features
    features = []
    for track_id in df_metadata['track_id']:
        audio_file = next((os.path.join(audio_dir, f"{track_id}.mp3") for audio_dir in audio_dirs if os.path.exists(os.path.join(audio_dir, f"{track_id}.mp3"))), None)
        features.append([track_id] + extract_audio_features(audio_file).tolist())
    feature_columns = ['track_id'] + [f'mfcc_{i+1}' for i in range(12)] + [f'chroma_{i+1}' for i in range(12)] + ['spectral_centroid', 'tempo']
    df_features = pd.DataFrame(features, columns=feature_columns)

    # Load or generate lyrics
    lyrics_dict = {}
    if os.path.exists(lyrics_path):
        for lyric_file in Path(lyrics_path).glob("*.txt"):
            track_id = lyric_file.stem
            if track_id in set(df_metadata['track_id']):
                with open(lyric_file, 'r', encoding='utf-8') as f:
                    lyrics_dict[track_id] = f.read().strip() or 'music'
    else:
        try:
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            model = GPT2LMHeadModel.from_pretrained('gpt2').to(DEVICE)
            lyrics_args = [(track_id, genre, lyrics_path, tokenizer, model) for track_id, genre in zip(df_metadata['track_id'], df_metadata['genre'])]
            with Pool(processes=2) as pool:
                list(tqdm(pool.imap(generate_lyrics, lyrics_args), total=len(lyrics_args), desc="Generating synthetic lyrics"))
        except Exception as e:
            logging.error("Lyrics generation failed: %s", str(e))
            for track_id, genre in zip(df_metadata['track_id'], df_metadata['genre']):
                os.makedirs(lyrics_path, exist_ok=True)
                with open(os.path.join(lyrics_path, f"{track_id}.txt"), 'w', encoding='utf-8') as f:
                    f.write(f"{genre} lyrics placeholder")
        for track_id in df_metadata['track_id']:
            lyric_file = os.path.join(lyrics_path, f"{track_id}.txt")
            if os.path.exists(lyric_file):
                with open(lyric_file, 'r', encoding='utf-8') as f:
                    lyrics_dict[track_id] = f.read().strip() or 'music'

    # Incorporate tags
    if os.path.exists(tags_path):
        df_tags = pd.read_csv(tags_path)
        df_tags['track_id'] = df_tags['track_id'].astype(str).str.zfill(6)
        for _, row in df_tags.iterrows():
            track_id = row['track_id']
            if track_id in set(df_metadata['track_id']):
                lyrics_dict[track_id] = lyrics_dict.get(track_id, '') + " " + str(row['tag'])

    logging.info("Data loaded: Metadata %s, Features %s, Lyrics %d", df_metadata.shape, df_features.shape, len(lyrics_dict))
    print(f"Data loaded: Metadata {df_metadata.shape}, Features {df_features.shape}, Lyrics {len(lyrics_dict)}")
    return df_metadata, df_features, lyrics_dict

def generate_text_embeddings(lyrics_dict):
    """Generate semantic text embeddings using SentenceTransformer."""
    logging.info("Generating text embeddings...")
    model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
    embeddings = {}
    for track_id, text in tqdm(lyrics_dict.items(), desc="Generating text embeddings"):
        embeddings[track_id] = model.encode(text, convert_to_tensor=True, device=DEVICE).cpu().numpy()
    return embeddings

def analyze_linguistic_patterns(df_metadata, lyrics_dict):
    """Analyze linguistic patterns and topics in lyrics."""
    print("\nLinguistic Analysis:")
    stop_words = set(stopwords.words('english'))
    genre_words = {g: [] for g in df_metadata['genre'].unique()}
    for track_id, text in lyrics_dict.items():
        genre = df_metadata[df_metadata['track_id'] == track_id]['genre'].iloc[0]
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
        genre_words[genre].extend(tokens)
    for genre, words in genre_words.items():
        if words:
            top_words = Counter(words).most_common(5)
            print(f"{genre} top words: {top_words}")
        else:
            print(f"{genre}: No words found after preprocessing")
    try:
        vectorizer = CountVectorizer(stop_words='english', min_df=2)
        X = vectorizer.fit_transform(lyrics_dict.values())
        if X.shape[0] > 0 and X.shape[1] > 0:
            lda = LatentDirichletAllocation(n_components=min(5, X.shape[1]), random_state=42)
            lda.fit(X)
            feature_names = vectorizer.get_feature_names_out()
            for i, topic in enumerate(lda.components_):
                top_words = [feature_names[j] for j in topic.argsort()[-5:]]
                print(f"Topic {i}: {top_words}")
        else:
            print("Insufficient data for topic modeling")
    except Exception as e:
        logging.warning("Topic modeling failed: %s", str(e))
        print("Topic modeling failed due to insufficient or invalid data")

class AttentionFusion(nn.Module):
    """Attention-based fusion for audio and text features."""
    def __init__(self, audio_dim, text_dim, hidden_dim):
        super().__init__()
        self.audio_query = nn.Linear(audio_dim, hidden_dim)
        self.text_key = nn.Linear(text_dim, hidden_dim)
        self.text_value = nn.Linear(text_dim, hidden_dim)
        self.scale = 1.0 / (hidden_dim ** 0.5)

    def forward(self, audio_features, text_features):
        query = self.audio_query(audio_features)
        key = self.text_key(text_features)
        value = self.text_value(text_features)
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
        attention_weights = torch.softmax(attention_scores, dim=-1)
        fused = torch.matmul(attention_weights, value)
        return fused

class AudioTextBERTClassifier(nn.Module):
    """DistilBERT with attention-based audio feature integration."""
    def __init__(self, bert_model, audio_dim, num_classes, hidden_dim=128):
        super().__init__()
        self.bert = bert_model
        self.audio_layer = nn.Linear(audio_dim, hidden_dim)
        self.attention = AttentionFusion(audio_dim, bert_model.config.hidden_size, hidden_dim)
        self.combined_layer = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask, audio_features):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_out = bert_outputs.pooler_output
        audio_out = torch.relu(self.audio_layer(audio_features))
        fused = self.attention(audio_out, text_out)
        combined = torch.cat([fused, audio_out], dim=1)
        combined = torch.relu(self.combined_layer(combined))
        combined = self.dropout(combined)
        return self.fc(combined)

class HybridRecommender(nn.Module):
    """Content-based recommender with DPP for diversity."""
    def __init__(self, audio_dim, text_dim, hidden_dim=128):
        super().__init__()
        self.audio_layer = nn.Linear(audio_dim, hidden_dim)
        self.text_layer = nn.Linear(text_dim, hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, audio_features, text_features):
        audio_out = torch.relu(self.audio_layer(audio_features))
        text_out = torch.relu(self.text_layer(text_features))
        combined = torch.cat([audio_out, text_out], dim=1)
        combined = self.dropout(combined)
        return torch.sigmoid(self.fc(combined))

def dpp_diversity(outputs, k, genres, lambda_tradeoff=0.1):
    """Apply Determinantal Point Process for diverse recommendations."""
    scores = outputs.cpu().numpy().flatten()
    n = len(scores)
    L = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            genre_sim = 1.0 if genres[i] == genres[j] else 0.5
            L[i, j] = genre_sim * np.exp(-((i - j) ** 2) / 2)
    L = L + lambda_tradeoff * np.diag(scores)
    eigenvalues, eigenvectors = np.linalg.eigh(L)
    selected_indices = []
    for _ in range(min(k, n)):
        max_det = -np.inf
        best_idx = None
        for i in range(n):
            if i not in selected_indices:
                temp_indices = selected_indices + [i]
                det = np.prod([eigenvalues[j] for j in temp_indices])
                if det > max_det:
                    max_det = det
                    best_idx = i
        if best_idx is not None:
            selected_indices.append(best_idx)
    return selected_indices[:k]

def train_recommender(model, train_loader, criterion, optimizer):
    """Train the recommender model."""
    model.train()
    total_loss = 0
    for audio_features, text_features, ratings in train_loader:
        audio_features, text_features, ratings = audio_features.to(DEVICE), text_features.to(DEVICE), ratings.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(audio_features, text_features)
        loss = criterion(outputs, ratings.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate_recommender(model, test_loader, df_metadata, test_idx_resampled, original_indices_resampled, genres, k=10):
    """Evaluate recommender with DPP for diversity."""
    model.eval()
    precisions, recalls, maps, ndcgs, diversities = [], [], [], [], []
    global train_indices
    train_track_ids = set(df_metadata.iloc[train_indices]['track_id'])
    with torch.no_grad():
        for batch_idx, (audio_features, text_features, ratings) in enumerate(test_loader):
            audio_features, text_features, ratings = audio_features.to(DEVICE), text_features.to(DEVICE), ratings.to(DEVICE)
            outputs = model(audio_features, text_features)
            batch_start = batch_idx * test_loader.batch_size
            batch_end = batch_start + len(audio_features)
            batch_test_idx_resampled = test_idx_resampled[batch_start:batch_end]
            batch_original_indices = original_indices_resampled[batch_test_idx_resampled]
            batch_genres = df_metadata.iloc[batch_original_indices]['genre'].values
            top_k_indices = dpp_diversity(outputs, k, batch_genres)
            top_k_original_indices = batch_original_indices[top_k_indices]
            top_k_ids = df_metadata.iloc[top_k_original_indices]['track_id']
            top_k_genres = df_metadata.iloc[top_k_original_indices]['genre']
            binary_ratings = (ratings > 0.6).float()
            relevant = binary_ratings[top_k_indices]
            precision = relevant.mean().item()
            recall = relevant.sum().item() / binary_ratings.sum().item() if binary_ratings.sum() > 0 else 0.0
            precisions.append(precision)
            recalls.append(recall)
            map_score = average_precision_score(binary_ratings.cpu().numpy(), outputs.cpu().numpy()) if binary_ratings.sum() > 0 else 0.0
            ndcg = ndcg_score(binary_ratings.cpu().numpy().reshape(1, -1), outputs.cpu().numpy().reshape(1, -1), k=k) if binary_ratings.sum() > 0 else 0.0
            maps.append(map_score)
            ndcgs.append(ndcg)
            diversity = len(set(top_k_genres)) / len(genres) if len(genres) > 0 else 1.0
            diversities.append(diversity)
            novelty = 1 - len(set(top_k_ids).intersection(train_track_ids)) / len(top_k_ids) if len(top_k_ids) > 0 else 1.0
    return {
        'precision@k': np.mean(precisions) if precisions else 0.0,
        'recall@k': np.mean(recalls) if recalls else 0.0,
        'map': np.mean(maps) if maps else 0.0,
        'ndcg@k': np.mean(ndcgs) if ndcgs else 0.0,
        'diversity': np.mean(diversities) if diversities else 1.0,
        'novelty': novelty
    }

def baseline_classifiers(X_train, y_train, X_test, y_test, genres):
    """Train and evaluate baseline classifiers (SVM, Random Forest)."""
    svm = SVC(kernel='rbf', random_state=42)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    svm.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    svm_pred = svm.predict(X_test)
    rf_pred = rf.predict(X_test)
    svm_metrics = precision_recall_fscore_support(y_test, svm_pred, average='weighted', zero_division=0)
    rf_metrics = precision_recall_fscore_support(y_test, rf_pred, average='weighted', zero_division=0)
    return {
        'SVM': {'precision': svm_metrics[0], 'recall': svm_metrics[1], 'f1': svm_metrics[2]},
        'RandomForest': {'precision': rf_metrics[0], 'recall': rf_metrics[1], 'f1': rf_metrics[2]}
    }

def collaborative_filtering(user_data, df_metadata, k=10):
    """Collaborative filtering using SVD with user profiles."""
    valid_track_ids = set(df_metadata['track_id'])
    user_data = user_data[user_data['track_id'].isin(valid_track_ids)]
    if user_data.empty:
        return {'precision@k': 0.0}
    train_data, test_data = train_test_split(user_data, test_size=0.2, random_state=42)
    user_item_matrix = train_data.pivot(index='user_id', columns='track_id', values='rating').fillna(0)
    svd = TruncatedSVD(n_components=20, random_state=42)
    user_features = svd.fit_transform(user_item_matrix)
    item_features = svd.components_
    predictions = np.dot(user_features, item_features)
    predicted_ratings = pd.DataFrame(predictions, index=user_item_matrix.index, columns=user_item_matrix.columns)
    precisions = []
    for user_id in test_data['user_id'].unique():
        user_ratings = test_data[test_data['user_id'] == user_id]
        true_ratings = user_ratings.set_index('track_id')['rating']
        pred_ratings = predicted_ratings.loc[user_id]
        valid_top_k = pred_ratings.sort_values(ascending=False).index[:k]
        relevant = (true_ratings[true_ratings.index.isin(valid_top_k)] > 3.0).astype(int)
        precision = relevant.mean() if len(relevant) > 0 else 0.0
        precisions.append(precision)
    return {'precision@k': np.mean(precisions) if precisions else 0.0}

class GenreClassifier(nn.Module):
    """Custom genre classifier with audio and text features."""
    def __init__(self, audio_dim, text_dim, num_classes, hidden_dim=128):
        super().__init__()
        self.audio_layer = nn.Linear(audio_dim, hidden_dim)
        self.text_layer = nn.Linear(text_dim, hidden_dim)
        self.attention = AttentionFusion(hidden_dim, hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, audio_features, text_features):
        audio_out = torch.relu(self.audio_layer(audio_features))
        text_out = torch.relu(self.text_layer(text_features))
        fused = self.attention(audio_out, text_out)
        combined = self.dropout(fused)
        return self.fc(combined)

def train_classifier(model, train_loader, criterion, optimizer):
    """Train the classifier model."""
    model.train()
    total_loss = 0
    for audio_features, text_features, labels in train_loader:
        audio_features, text_features, labels = audio_features.to(DEVICE), text_features.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(audio_features, text_features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate_classifier(model, test_loader, genres):
    """Evaluate classifier with detailed error analysis."""
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for audio_features, text_features, labels in test_loader:
            audio_features, text_features, labels = audio_features.to(DEVICE), text_features.to(DEVICE), labels.to(DEVICE)
            outputs = model(audio_features, text_features)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    metrics = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=genres, yticklabels=genres)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.savefig(os.path.join(OUTPUT_DIR, 'confusion_matrix.png'))
    plt.close()
    errors = [(i, genres[true], genres[pred]) for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true != pred]
    error_counts = Counter((true, pred) for _, true, pred in errors)
    print("\nMisclassification Patterns:")
    for (true, pred), count in error_counts.most_common():
        print(f"True: {true}, Predicted: {pred}, Count: {count}")
    return {'precision': metrics[0], 'recall': metrics[1], 'f1': metrics[2]}

def bert_classifier(df_metadata, text_embeddings, audio_features, genres):
    """DistilBERT-based genre classification with audio features."""
    print("\nTraining DistilBERT Classifier...")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    bert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(genres))
    model = AudioTextBERTClassifier(bert_model, audio_dim=audio_features.shape[1], num_classes=len(genres)).to(DEVICE)

    # Filter valid rows and ensure genres are in genres list
    valid_rows = df_metadata[['track_id', 'title', 'genre']].dropna(subset=['genre'])
    genres_list = genres.tolist()
    valid_rows = valid_rows[valid_rows['genre'].isin(genres_list)]

    if valid_rows.empty:
        logging.error("No valid rows for DistilBERT classifier after filtering genres")
        print("Error: No valid rows for DistilBERT classifier after filtering genres")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    valid_track_ids = valid_rows['track_id'].values
    logging.info("Number of valid tracks for DistilBERT: %d", len(valid_track_ids))
    print(f"Number of valid tracks for DistilBERT: {len(valid_track_ids)}")

    # Create genre-to-index mapping
    genre_to_idx = {genre: idx for idx, genre in enumerate(genres_list)}
    logging.info("Genre to index mapping: %s", genre_to_idx)
    print(f"Genre to index mapping: {genre_to_idx}")

    # Prepare audio features
    audio_features_df = pd.DataFrame(audio_features, index=df_metadata['track_id'])
    audio_features = audio_features_df.loc[valid_track_ids].values

    # Prepare texts and labels
    texts = [f"{row['title']} {row['genre']}" for _, row in valid_rows.iterrows()]
    labels = []
    for _, row in valid_rows.iterrows():
        try:
            label = genre_to_idx[row['genre']]
            labels.append(label)
        except KeyError:
            logging.warning("Genre %s not found in genre_to_idx for track %s", row['genre'], row['track_id'])
            continue

    if not labels:
        logging.error("No valid labels generated for DistilBERT classifier")
        print("Error: No valid labels generated for DistilBERT classifier")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    # Tokenize texts
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Create dataset
    dataset_list = [
        {
            'input_ids': inputs['input_ids'][i].numpy(),
            'attention_mask': inputs['attention_mask'][i].numpy(),
            'audio_features': audio_features[i].astype(np.float32),
            'labels': int(labels[i])
        }
        for i in range(len(valid_rows))
    ]

    try:
        dataset = Dataset.from_list(dataset_list)
        dataset = dataset.train_test_split(test_size=0.2, seed=42)
    except Exception as e:
        logging.error("Failed to create dataset for DistilBERT: %s", str(e))
        print(f"Failed to create dataset for DistilBERT: {str(e)}")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    def collate_fn(batch):
        return {
            'input_ids': torch.tensor([item['input_ids'] for item in batch], dtype=torch.long).to(DEVICE),
            'attention_mask': torch.tensor([item['attention_mask'] for item in batch], dtype=torch.long).to(DEVICE),
            'audio_features': torch.tensor([item['audio_features'] for item in batch], dtype=torch.float32).to(DEVICE),
            'labels': torch.tensor([item['labels'] for item in batch], dtype=torch.long).to(DEVICE)
        }

    training_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_DIR, "distilbert-finetuned"),
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=collate_fn,
        compute_metrics=lambda pred: {
            'precision': precision_recall_fscore_support(pred.label_ids, pred.predictions.argmax(-1), average='weighted', zero_division=0)[0],
            'recall': precision_recall_fscore_support(pred.label_ids, pred.predictions.argmax(-1), average='weighted', zero_division=0)[1],
            'f1': precision_recall_fscore_support(pred.label_ids, pred.predictions.argmax(-1), average='weighted', zero_division=0)[2]
        }
    )

    try:
        trainer.train()
        eval_results = trainer.evaluate()
        return {
            'precision': eval_results.get('eval_precision', 0.0),
            'recall': eval_results.get('eval_recall', 0.0),
            'f1': eval_results.get('eval_f1', 0.0)
        }
    except Exception as e:
        logging.error("DistilBERT training failed: %s", str(e))
        print(f"DistilBERT training failed: {str(e)}")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

def music_search(query, df, text_embeddings, audio_features_df, model, scaler, k=10):
    """Content-based music search with cross-modal alignment."""
    query_embedding = model.encode(query, convert_to_tensor=True, device=DEVICE).cpu().numpy()
    query_embedding = query_embedding / np.linalg.norm(query_embedding)
    text_embedding_matrix = np.array([text_embeddings.get(tid, np.zeros(TEXT_EMBEDDING_DIM)) for tid in df['track_id']])
    text_index = faiss.IndexFlatIP(TEXT_EMBEDDING_DIM)
    text_index.add(text_embedding_matrix.astype(np.float32))
    text_scores, text_indices = text_index.search(query_embedding.reshape(1, -1).astype(np.float32), k=20)
    audio_features = audio_features_df[[col for col in audio_features_df.columns if col != 'track_id']].values
    audio_features = scaler.transform(np.nan_to_num(audio_features))
    combined_scores = {}
    for idx, score in zip(text_indices[0], text_scores[0]):
        track_id = df['track_id'].iloc[idx]
        genre = df[df['track_id'] == track_id]['genre'].iloc[0]
        genre_weight = 1.5 if genre.lower() in query.lower() else 1.0
        combined_scores[track_id] = 0.7 * score * genre_weight
        audio_idx = audio_features_df.index[audio_features_df['track_id'] == track_id].tolist()[0]
        genre_tracks = df[df['genre'] == genre]['track_id']
        genre_audio_features = audio_features[audio_features_df['track_id'].isin(genre_tracks)]
        if len(genre_audio_features) > 0:
            avg_genre_audio = np.mean(genre_audio_features, axis=0)
            audio_similarity = 1 - cosine(audio_features[audio_idx], avg_genre_audio)
            combined_scores[track_id] += 0.3 * audio_similarity * genre_weight
    top_tracks = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k]
    result_ids = [track_id for track_id, _ in top_tracks]
    results = df[df['track_id'].isin(result_ids)][['track_id', 'artist_name', 'title', 'genre']]
    relevant_tracks = df[df['genre'].str.lower().isin(query.lower().split())]['track_id'].values
    y_true = [1 if track_id in relevant_tracks else 0 for track_id in result_ids]
    y_scores = [combined_scores[track_id] for track_id in result_ids]
    precision = sum(y_true) / len(y_true) if len(y_true) > 0 else 0.0
    recall = sum(y_true) / len(relevant_tracks) if len(relevant_tracks) > 0 else 0.0
    map_score = average_precision_score(y_true, y_scores) if sum(y_true) > 0 else 0.0
    ndcg = ndcg_score(np.array(y_true).reshape(1, -1), np.array(y_scores).reshape(1, -1), k=k) if sum(y_true) > 0 else 0.0
    diversity = len(set(results['genre'])) / len(df['genre'].unique()) if len(df['genre'].unique()) > 0 else 1.0
    novelty = len(set(result_ids) - set(df['track_id'].iloc[train_indices])) / len(result_ids) if len(result_ids) > 0 else 1.0
    return results, {'precision@k': precision, 'recall@k': recall, 'map': map_score, 'ndcg@k': ndcg, 'diversity': diversity, 'novelty': novelty}

def streamlit_ui(df_metadata, text_embeddings, audio_features_df, model, scaler):
    """Interactive Streamlit UI for music search and recommendation."""
    st.title("Music Recommender System")
    query = st.text_input("Enter a music query (e.g., 'upbeat rock songs'):", "upbeat rock songs")
    k = st.slider("Number of results:", 1, 20, 10)
    if st.button("Search"):
        results, metrics = music_search(query, df_metadata, text_embeddings, audio_features_df, model, scaler, k)
        st.write("### Search Results")
        st.dataframe(results)
        st.write("### Retrieval Metrics")
        st.json(metrics)
    user_id = st.text_input("Enter user ID for recommendations:", "user_1")
    if st.button("Recommend"):
        user_data = pd.read_csv(USER_DATA_PATH)
        user_ratings = user_data[user_data['user_id'] == user_id]
        if not user_ratings.empty:
            track_ids = user_ratings['track_id'].values
            audio_features = audio_features_df[audio_features_df['track_id'].isin(track_ids)][[col for col in audio_features_df.columns if col != 'track_id']].values
            text_features = np.array([text_embeddings.get(tid, np.zeros(TEXT_EMBEDDING_DIM)) for tid in track_ids])
            recommender = HybridRecommender(audio_dim=audio_features.shape[1], text_dim=TEXT_EMBEDDING_DIM).to(DEVICE)
            recommender.eval()
            with torch.no_grad():
                outputs = recommender(torch.tensor(audio_features, dtype=torch.float32).to(DEVICE), torch.tensor(text_features, dtype=torch.float32).to(DEVICE))
                top_k = torch.topk(outputs, k=k).indices.flatten()
                recommended_ids = track_ids[top_k.cpu()]
                st.write("### Recommended Tracks")
                st.dataframe(df_metadata[df_metadata['track_id'].isin(recommended_ids)][['track_id', 'artist_name', 'title', 'genre']])

def analyze_feature_contribution(model, test_loader, feature_columns):
    """Analyze feature contributions to model predictions."""
    model.eval()
    contributions = {col: [] for col in feature_columns}
    with torch.no_grad():
        for audio_features, text_features, _ in test_loader:
            audio_features, text_features = audio_features.to(DEVICE), text_features.to(DEVICE)
            baseline_output = model(audio_features, text_features)
            for i, col in enumerate(feature_columns):
                modified_features = audio_features.clone()
                modified_features[:, i] = 0
                modified_output = model(modified_features, text_features)
                diff = torch.mean(torch.abs(baseline_output - modified_output)).item()
                contributions[col].append(diff)
    for col in feature_columns:
        print(f"{col}: Mean contribution = {np.mean(contributions[col]):.4f}")
    return contributions

class TestMusicRecommender(unittest.TestCase):
    """Unit tests for music recommender system."""
    def setUp(self):
        self.df = pd.DataFrame({
            'track_id': ['000001', '000002'],
            'artist_name': ['Artist_1', 'Artist_2'],
            'title': ['Track_1', 'Track_2'],
            'genre': ['Rock', 'Pop'],
            'genre_id': [1, 2]
        })
        self.lyrics_dict = {'000001': 'rock anthem roars', '000002': 'pop fever rises'}
        self.audio_features = np.random.randn(2, 26)
        self.text_embeddings = {'000001': np.random.randn(384), '000002': np.random.randn(384)}

    def test_generate_lyrics(self):
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        track_id = generate_lyrics(('000001', 'Rock', LYRICS_PATH, tokenizer, model))
        self.assertEqual(track_id, '000001')
        self.assertTrue(os.path.exists(os.path.join(LYRICS_PATH, '000001.txt')))

    def test_music_search(self):
        model = SentenceTransformer('all-MiniLM-L6-v2')
        scaler = StandardScaler().fit(self.audio_features)
        df_features = pd.DataFrame(self.audio_features, columns=['track_id'] + [f'feat_{i}' for i in range(26)])
        df_features['track_id'] = self.df['track_id']
        results, metrics = music_search("rock", self.df, self.text_embeddings, df_features, model, scaler)
        self.assertEqual(len(results), 2)
        self.assertIn('precision@k', metrics)

def main():
    """Main function to orchestrate music recommender tasks."""
    logging.info("Starting main execution...")
    try:
        df_metadata, df_features, lyrics_dict = load_fma_data(FMA_AUDIO_DIRS, METADATA_PATH, ARTISTS_PATH, GENRES_PATH, LYRICS_PATH, TAGS_PATH)
    except Exception as e:
        logging.error("Data loading failed: %s", str(e))
        print(f"Data loading failed: {str(e)}")
        return
    if df_metadata.empty or df_features.empty or not lyrics_dict:
        logging.error("Data loading resulted in empty datasets")
        print("Data loading resulted in empty datasets")
        return
    text_embeddings = generate_text_embeddings(lyrics_dict)
    reference_lyrics = {
        tid: f"Sample {row['title']} lyrics in {row['genre']}"
        for tid, row in df_metadata[['track_id', 'title', 'genre']].iterrows()
    }
    bleu_score = evaluate_lyrics_quality(lyrics_dict, reference_lyrics)
    print(f"Average BLEU Score for Lyrics: {bleu_score:.4f}")
    analyze_linguistic_patterns(df_metadata, lyrics_dict)
    valid_track_ids = df_features['track_id'].tolist()
    df_metadata = df_metadata[df_metadata['track_id'].isin(valid_track_ids)]
    text_features = np.array([text_embeddings.get(tid, np.zeros(TEXT_EMBEDDING_DIM)) for tid in df_metadata['track_id']])
    scaler = StandardScaler()
    audio_features = scaler.fit_transform(df_features[[col for col in df_features.columns if col != 'track_id']].values)
    audio_features = np.nan_to_num(audio_features)

    # Load or generate synthetic user ratings
    if os.path.exists(USER_DATA_PATH):
        try:
            user_data = pd.read_csv(USER_DATA_PATH)
            user_data['track_id'] = user_data['track_id'].astype(str).str.zfill(6)
            logging.info("User ratings loaded from %s", USER_DATA_PATH)
            print(f"User ratings loaded from {USER_DATA_PATH}")
        except Exception as e:
            logging.warning("Failed to load ratings.csv: %s. Generating synthetic ratings.", str(e))
            print(f"Failed to load ratings.csv: {str(e)}. Generating synthetic ratings.")
            user_ids = [f"user_{i+1}" for i in range(10)]  # 10 synthetic users
            user_data = pd.DataFrame({
                'user_id': [random.choice(user_ids) for _ in range(len(df_metadata))],
                'track_id': df_metadata['track_id'],
                'rating': np.random.randint(1, 6, size=len(df_metadata))
            })
            os.makedirs(os.path.dirname(USER_DATA_PATH), exist_ok=True)
            user_data.to_csv(USER_DATA_PATH, index=False)
            logging.info("Synthetic ratings saved to %s", USER_DATA_PATH)
            print(f"Synthetic ratings saved to {USER_DATA_PATH}")
    else:
        logging.warning("ratings.csv not found at %s. Generating synthetic ratings.", USER_DATA_PATH)
        print(f"ratings.csv not found at {USER_DATA_PATH}. Generating synthetic ratings.")
        user_ids = [f"user_{i+1}" for i in range(10)]  # 10 synthetic users
        user_data = pd.DataFrame({
            'user_id': [random.choice(user_ids) for _ in range(len(df_metadata))],
            'track_id': df_metadata['track_id'],
            'rating': np.random.randint(1, 6, size=len(df_metadata))
        })
        os.makedirs(os.path.dirname(USER_DATA_PATH), exist_ok=True)
        user_data.to_csv(USER_DATA_PATH, index=False)
        logging.info("Synthetic ratings saved to %s", USER_DATA_PATH)
        print(f"Synthetic ratings saved to {USER_DATA_PATH}")

    user_data = user_data[user_data['track_id'].isin(df_metadata['track_id'])]
    ratings_agg = user_data.groupby('track_id')['rating'].mean().reindex(df_metadata['track_id']).fillna(3.0).values
    ratings = np.clip(ratings_agg / 5.0, 0.0, 1.0)
    X = np.hstack([audio_features, text_features])
    original_indices = np.arange(len(X))
    binary_ratings = (ratings > 0.5).astype(int)
    # Check genre distribution and adjust k_neighbors for SMOTE
    genre_counts = df_metadata['genre'].value_counts()
    print("\nGenre Distribution:")
    for genre, count in genre_counts.items():
        print(f"{genre}: {count} tracks")
    min_samples = min(genre_counts) if not genre_counts.empty else 1
    k_neighbors = min(5, max(1, min_samples - 1))
    logging.info("Using k_neighbors=%d for SMOTE based on minimum class size %d", k_neighbors, min_samples)
    try:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_resampled, ratings_resampled = smote.fit_resample(X, binary_ratings)
        original_indices_resampled = np.concatenate([original_indices, np.random.choice(original_indices, size=len(X_resampled) - len(X), replace=True)])
    except ValueError as e:
        logging.warning("SMOTE failed: %s. Falling back to original data.", str(e))
        print(f"SMOTE failed: {str(e)}. Using original data.")
        X_resampled, ratings_resampled = X, binary_ratings
        original_indices_resampled = original_indices
    shuffle_indices = np.random.permutation(len(X_resampled))
    X_resampled = X_resampled[shuffle_indices]
    ratings_resampled = ratings_resampled[shuffle_indices]
    original_indices_resampled = original_indices_resampled[shuffle_indices]
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rec_metrics_all = []
    global train_indices
    for fold, (train_idx_resampled, test_idx_resampled) in enumerate(kf.split(X_resampled)):
        print(f"\nFold {fold+1}")
        train_indices = original_indices_resampled[train_idx_resampled]
        X_train_rec, X_test_rec = X_resampled[train_idx_resampled], X_resampled[test_idx_resampled]
        y_train_rec, y_test_rec = ratings_resampled[train_idx_resampled], ratings_resampled[test_idx_resampled]
        train_dataset_rec = torch.utils.data.TensorDataset(
            torch.tensor(X_train_rec[:, :audio_features.shape[1]], dtype=torch.float32),
            torch.tensor(X_train_rec[:, audio_features.shape[1]:], dtype=torch.float32),
            torch.tensor(y_train_rec, dtype=torch.float32)
        )
        train_loader_rec = torch.utils.data.DataLoader(train_dataset_rec, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
        test_dataset_rec = torch.utils.data.TensorDataset(
            torch.tensor(X_test_rec[:, :audio_features.shape[1]], dtype=torch.float32),
            torch.tensor(X_test_rec[:, audio_features.shape[1]:], dtype=torch.float32),
            torch.tensor(y_test_rec, dtype=torch.float32)
        )
        test_loader_rec = torch.utils.data.DataLoader(test_dataset_rec, batch_size=BATCH_SIZE, num_workers=2)
        recommender = HybridRecommender(audio_dim=audio_features.shape[1], text_dim=TEXT_EMBEDDING_DIM).to(DEVICE)
        criterion_rec = nn.BCELoss()
        optimizer_rec = torch.optim.Adam(recommender.parameters(), lr=0.001, weight_decay=1e-4)
        for epoch in range(NUM_EPOCHS_REC):
            loss = train_recommender(recommender, train_loader_rec, criterion_rec, optimizer_rec)
            print(f"Recommendation Epoch {epoch+1}, Loss: {loss:.4f}")
        rec_metrics = evaluate_recommender(recommender, test_loader_rec, df_metadata, test_idx_resampled, original_indices_resampled, df_metadata['genre'].unique())
        rec_metrics_all.append(rec_metrics)
        print(f"Recommendation Metrics: {rec_metrics}")
    avg_rec_metrics = {k: np.mean([m[k] for m in rec_metrics_all]) for k in rec_metrics_all[0]}
    print(f"Average Recommendation Metrics: {avg_rec_metrics}")
    cf_metrics = collaborative_filtering(user_data, df_metadata)
    print(f"Collaborative Filtering Metrics: {cf_metrics}")
    genres = df_metadata['genre'].unique()
    genre_to_idx = {g: i for i, g in enumerate(genres)}
    labels = df_metadata['genre'].map(genre_to_idx).values
    cls_metrics_all = []
    baseline_metrics = []
    for fold, (train_idx_resampled, test_idx_resampled) in enumerate(kf.split(X_resampled)):
        print(f"\nFold {fold+1}")
        X_train_cls, X_test_cls = X_resampled[train_idx_resampled], X_resampled[test_idx_resampled]
        y_train_cls = labels[original_indices_resampled[train_idx_resampled]]
        y_test_cls = labels[original_indices_resampled[test_idx_resampled]]
        train_dataset_cls = torch.utils.data.TensorDataset(
            torch.tensor(X_train_cls[:, :audio_features.shape[1]], dtype=torch.float32),
            torch.tensor(X_train_cls[:, audio_features.shape[1]:], dtype=torch.float32),
            torch.tensor(y_train_cls, dtype=torch.long)
        )
        train_loader_cls = torch.utils.data.DataLoader(train_dataset_cls, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
        test_dataset_cls = torch.utils.data.TensorDataset(
            torch.tensor(X_test_cls[:, :audio_features.shape[1]], dtype=torch.float32),
            torch.tensor(X_test_cls[:, audio_features.shape[1]:], dtype=torch.float32),
            torch.tensor(y_test_cls, dtype=torch.long)
        )
        test_loader_cls = torch.utils.data.DataLoader(test_dataset_cls, batch_size=BATCH_SIZE, num_workers=2)
        classifier = GenreClassifier(
            audio_dim=audio_features.shape[1],
            text_dim=TEXT_EMBEDDING_DIM,
            num_classes=len(genres)
        ).to(DEVICE)
        criterion_cls = nn.CrossEntropyLoss()
        optimizer_cls = torch.optim.Adam(classifier.parameters(), lr=0.001, weight_decay=1e-4)
        for epoch in range(NUM_EPOCHS_CLS):
            loss = train_classifier(classifier, train_loader_cls, criterion_cls, optimizer_cls)
            print(f"Classification Epoch {epoch+1}, Loss: {loss:.4f}")
        cls_metrics = evaluate_classifier(classifier, test_loader_cls, genres)
        cls_metrics_all.append(cls_metrics)
        print(f"Classification Metrics: {cls_metrics}")
        baseline_metrics.append(baseline_classifiers(X_train_cls, y_train_cls, X_test_cls, y_test_cls, genres))
    avg_cls_metrics = {k: np.mean([m[k] for m in cls_metrics_all]) for k in cls_metrics_all[0]}
    print(f"Average Classification Metrics: {avg_cls_metrics}")
    avg_baseline_metrics = {
        model: {k: np.mean([m[model][k] for m in baseline_metrics]) for k in baseline_metrics[0][model]}
        for model in baseline_metrics[0]
    }
    print(f"Baseline Metrics: {avg_baseline_metrics}")
    bert_metrics = bert_classifier(df_metadata, text_embeddings, audio_features, genres)
    print(f"DistilBERT Classifier Metrics: {bert_metrics}")
    search_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
    queries = ["upbeat rock songs", "smooth jazz vibes", "energetic pop tracks", "classical orchestral"]
    for query in queries:
        results, search_metrics = music_search(query, df_metadata, text_embeddings, df_features, search_model, scaler)
        print(f"\nSearch Metrics for '{query}': {search_metrics}")
    feature_columns = [col for col in df_features.columns if col != 'track_id']
    analyze_feature_contribution(classifier, test_loader_cls, feature_columns)
    if 'streamlit' in globals():
        streamlit_ui(df_metadata, text_embeddings, df_features, search_model, scaler)
    unittest.main(argv=[''], exit=False)

if __name__ == "__main__":
    main()

Installing faiss-cpu...
Successfully installed faiss-cpu
Installing scikit-learn...
Successfully installed scikit-learn
Mounted at /content/drive
Loading FMA data...

Available columns in tracks.csv:
['Unnamed: 0_level_0_track_id', 'comments_Unnamed: 1_level_1', 'date_created_Unnamed: 2_level_1', 'date_released_Unnamed: 3_level_1', 'engineer_Unnamed: 4_level_1', 'favorites_Unnamed: 5_level_1', 'id_Unnamed: 6_level_1', 'information_Unnamed: 7_level_1', 'listens_Unnamed: 8_level_1', 'producer_Unnamed: 9_level_1', 'tags_Unnamed: 10_level_1', 'title_Unnamed: 11_level_1', 'tracks_Unnamed: 12_level_1', 'type_Unnamed: 13_level_1', 'active_year_begin_Unnamed: 14_level_1', 'active_year_end_Unnamed: 15_level_1', 'associated_labels_Unnamed: 16_level_1', 'bio_Unnamed: 17_level_1', 'comments_Unnamed: 18_level_1', 'date_created_Unnamed: 19_level_1', 'favorites_Unnamed: 20_level_1', 'id_Unnamed: 21_level_1', 'latitude_Unnamed: 22_level_1', 'location_Unnamed: 23_level_1', 'longitude_Unnamed: 24_level_



Genres DataFrame head:
    genre_id    genre_name
0         1       Hip-Hop
1         2           Pop
2         3          Folk
3         4  Experimental
4         5          Rock
Unique genres in metadata: ['Hip-Hop', 'Pop', 'Folk', 'Experimental', 'Rock', 'International', 'Electronic']
Merged metadata with genres successfully.
Data loaded: Metadata (100, 5), Features (100, 27), Lyrics 100


  return forward_call(*args, **kwargs)
Generating text embeddings: 100%|██████████| 100/100 [00:05<00:00, 17.16it/s]


Average BLEU Score for Lyrics: 0.0000

Linguistic Analysis:
Hip-Hop top words: [('lil', 6), ('wayne', 6), ('lyrics', 5), ('kanye', 5), ('west', 4)]
Pop top words: [('song', 16), ('pop', 13), ('lyrics', 11), ('songs', 11), ('use', 10)]
Folk top words: [('song', 25), ('lyrics', 23), ('folk', 23), ('songs', 23), ('music', 22)]
Experimental top words: [('lyrics', 15), ('song', 14), ('music', 13), ('sound', 11), ('use', 10)]
Rock top words: [('rock', 23), ('lyrics', 18), ('song', 17), ('band', 11), ('use', 11)]
International top words: [('use', 15), ('lyrics', 12), ('song', 12), ('like', 9), ('share', 6)]
Electronic top words: [('future', 2), ('aware', 1), ('lyrics', 1), ('emotions', 1), ('lot', 1)]
Topic 0: ['word', 'use', 'don', 'create', 'song']
Topic 1: ['music', 'use', 'songs', 'lyrics', 'song']
Topic 2: ['ll', 'music', 'song', 'use', 'lyrics']
Topic 3: ['new', 'released', 'band', 'sound', 'album']
Topic 4: ['songs', 'folk', 'use', 'lyrics', 'music']
User ratings loaded from /content/f

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of valid tracks for DistilBERT: 100
Genre to index mapping: {'Hip-Hop': 0, 'Pop': 1, 'Folk': 2, 'Experimental': 3, 'Rock': 4, 'International': 5, 'Electronic': 6}


ERROR:root:DistilBERT training failed: 'labels'


DistilBERT training failed: 'labels'
DistilBERT Classifier Metrics: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



Search Metrics for 'upbeat rock songs': {'precision@k': 0.7, 'recall@k': 0.4117647058823529, 'map': np.float64(0.9571428571428571), 'ndcg@k': np.float64(0.9878316351280036), 'diversity': 0.5714285714285714, 'novelty': 0.0}

Search Metrics for 'smooth jazz vibes': {'precision@k': 0.0, 'recall@k': 0.0, 'map': 0.0, 'ndcg@k': 0.0, 'diversity': 0.7142857142857143, 'novelty': 0.1}

Search Metrics for 'energetic pop tracks': {'precision@k': 0.5, 'recall@k': 0.5, 'map': np.float64(1.0), 'ndcg@k': np.float64(1.0), 'diversity': 0.5714285714285714, 'novelty': 0.1}

Search Metrics for 'classical orchestral': {'precision@k': 0.0, 'recall@k': 0.0, 'map': 0.0, 'ndcg@k': 0.0, 'diversity': 0.5714285714285714, 'novelty': 0.0}
mfcc_1: Mean contribution = 0.0066
mfcc_2: Mean contribution = 0.0097
mfcc_3: Mean contribution = 0.0057
mfcc_4: Mean contribution = 0.0021
mfcc_5: Mean contribution = 0.0026
mfcc_6: Mean contribution = 0.0075
mfcc_7: Mean contribution = 0.0018
mfcc_8: Mean contribution = 0.0116
m

.E
ERROR: test_music_search (__main__.TestMusicRecommender.test_music_search)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipython-input-2034136229.py", line 930, in test_music_search
    df_features = pd.DataFrame(self.audio_features, columns=['track_id'] + [f'feat_{i}' for i in range(26)])
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/core/frame.py", line 827, in __init__
    mgr = ndarray_to_mgr(
          ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/core/internals/construction.py", line 336, in ndarray_to_mgr
    _check_values_indices_shape_match(values, index, columns)
  File "/usr/local/lib/python3.11/dist-packages/pandas/core/internals/construction.py", line 420, in _check_values_indices_shape_match
    raise ValueError(f"Shape of passed values is {passed}, indices