In [1]:
import pandas as pd
import numpy as np
import shutil
import os

# Read the track metadata; the CSV has a two-row header, so columns come back
# as (group, field) pairs and the first column becomes the index
metadata_path = 'fma_metadata/tracks.csv'
tracks = pd.read_csv(metadata_path, header=[0, 1], index_col=0)

# Show the available (group, field) column pairs
print(tracks.columns)

# Column holding the genre used for filtering
genre_column = ('track', 'genre_top')

# Genres to keep
desired_genres = ['Pop', 'Rock', 'Hip-Hop', 'Electronic']

# Label rows with a missing genre as 'Unknown' (this rewrites the column of
# `tracks` itself), then keep only the rows whose genre is in the wanted list
tracks[genre_column] = tracks[genre_column].fillna('Unknown')
genre_mask = tracks[genre_column].isin(desired_genres)
filtered_tracks = tracks[genre_mask]

# Index values (the track IDs) of the genre-filtered rows
all_metadata_ids = filtered_tracks.index.values

# Root directory of the audio dataset; audio files are looked for one level
# down, inside each of its sub-directories
dataset_dir = 'fma_small/'

# Collect the numeric track ID of every .mp3 file found on disk
available_ids = set()
for subdir in os.listdir(dataset_dir):
    subdir_path = os.path.join(dataset_dir, subdir)
    if not os.path.isdir(subdir_path):
        continue
    for file_name in os.listdir(subdir_path):
        stem, extension = os.path.splitext(file_name)
        if extension != '.mp3':
            continue
        try:
            available_ids.add(int(stem))
        except ValueError:
            # The name is not a plain number (e.g. a hidden '._000002.mp3'
            # sidecar file), so it cannot be a track; skip it rather than
            # abort the whole scan.
            continue

# Print the number of available IDs
print(f"Number of available track IDs: {len(available_ids)}")

# Keep only the metadata track IDs that have an audio file on disk
valid_ids = [track_id for track_id in all_metadata_ids if track_id in available_ids]

# Print the number of valid IDs
print(f"Number of valid track IDs in dataset: {len(valid_ids)}")

# Draw up to 580 distinct track IDs from the valid ones (fewer if not enough
# are available); the fixed seed makes the draw repeatable
np.random.seed(42)
sample_size = min(580, len(valid_ids))
sampled_ids = np.random.choice(valid_ids, sample_size, replace=False)
print(f"Sampled Track IDs: {sampled_ids}")

# Metadata rows belonging to the sampled tracks
filtered_tracks = tracks.loc[sampled_ids]

# Discover the audio sub-directories from what is actually on disk instead of
# assuming a fixed 000-155 range, so every folder that may hold a track is
# searched. Sorting keeps the search order deterministic.
source_root = 'fma_small/'
source_dirs = sorted(
    f'{source_root}{name}/'
    for name in os.listdir(source_root)
    if os.path.isdir(os.path.join(source_root, name))
)
destination_dir = 'fma_small_filtered/'

# Create destination directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# Index every .mp3 once (file name -> full path) so each sampled track is a
# single dictionary lookup rather than one existence check per directory.
# setdefault keeps the first directory (in sorted order) that holds a name.
mp3_paths = {}
for source_dir in source_dirs:
    for file_name in os.listdir(source_dir):
        if file_name.endswith('.mp3'):
            mp3_paths.setdefault(file_name, os.path.join(source_dir, file_name))

# Copy selected audio files to the new directory
for track_id in sampled_ids:
    # Audio files are named with the track ID zero-padded to six digits
    track_path = mp3_paths.get(f'{track_id:06d}.mp3')
    if track_path is None:
        print(f"File not found for track ID: {track_id:06d}")
        continue
    shutil.copy(track_path, destination_dir)

print("File copying completed.")


MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          