# Reduction of dataset to be around n (500) occurrence if possible.

In [1]:
import pandas as pd

## Single tag occurrence vs all

In [2]:
data = pd.read_csv('../../datasets/jamendo/metadata/audio_dataset_14_genres.csv')

# Create a new column that counts the number of genres in each row
data['genre_count'] = data['TAGS'].apply(lambda x: len(x.split(';')))

# Count the occurrences based on the number of genres (1 for single, more than 1 for multiple)
genre_count_summary = data['genre_count'].value_counts().sort_index()

# Separate the counts into single-genre and multiple-genre categories for clarity
single_genre_count = genre_count_summary.get(1, 0)  # Tracks with exactly one genre
multiple_genre_count = genre_count_summary[genre_count_summary.index > 1].sum()  # Tracks with multiple genres

# Prepare results
result = {
    "Single Genre Count": single_genre_count,
    "Multiple Genre Count": multiple_genre_count
}

result

{'Single Genre Count': 30169, 'Multiple Genre Count': 16839}

In [3]:
# For a breakdown by genre, distinguishing single-genre vs multiple-genre occurrences

# First, split the genres and explode to have one genre per row for accurate counting
data_expanded = data.copy()
data_expanded['TAGS'] = data_expanded['TAGS'].str.split(';')
data_expanded = data_expanded.explode('TAGS')

# Add a column to indicate if the original row was a single or multiple genre
data_expanded['genre_type'] = data_expanded['genre_count'].apply(lambda x: 'Single' if x == 1 else 'Multiple')

# Now count occurrences by genre and genre type
genre_breakdown = data_expanded.groupby(['TAGS', 'genre_type']).size().unstack(fill_value=0)

# Display the result for user
genre_breakdown

genre_type,Multiple,Single
TAGS,Unnamed: 1_level_1,Unnamed: 2_level_1
Ambient,4314,3256
Blues,798,347
Elektronika,8848,9062
Funk,1095,300
Hip-Hop,1143,1514
House,1524,645
Jazz,1590,1207
Klasyczna,1843,5345
Latin,357,347
Metal,798,637


In [4]:
# Expand the TAGS column by separating multi-genre entries
data_genre_expanded = data.copy()
data_genre_expanded['TAGS'] = data_genre_expanded['TAGS'].str.split(';')
data_genre_expanded = data_genre_expanded.explode('TAGS')

# Now, calculate the average duration per genre
average_duration_per_genre = data_genre_expanded.groupby('TAGS')['DURATION'].mean()

average_duration_per_genre

TAGS
Ambient        293.050462
Blues          221.648559
Elektronika    269.691971
Funk           228.744158
Hip-Hop        198.318705
House          273.232319
Jazz           255.151662
Klasyczna      215.848511
Latin          205.225568
Metal          248.211220
Pop            222.198187
Reggae         232.878956
Rock           229.571112
Techno         291.133915
Name: DURATION, dtype: float64

## Dataset reduction - Save reduced metadata file

In [5]:
import pandas as pd
import random

# Read the CSV file
df = pd.read_csv('../../datasets/jamendo/metadata/audio_dataset_14_genres.csv')

# Split the 'TAGS' column into a list of genres per row
df['genre_list'] = df['TAGS'].str.split(';')

# Count the number of genres per row
df['genre_count'] = df['genre_list'].apply(len)

# Create a set of all genres
genres = set(df['genre_list'].explode())

# Initialize an empty set for selected song indices
selected_indices = set()

# Initialize a dictionary to store the counts per genre
genre_counts = {genre: {'single': 0, 'multi': 0} for genre in genres}

# Set desired counts per genre
desired_per_genre_single = 250
desired_per_genre_multi = 250

for genre in genres:
    # Get songs that have not been selected yet and contain the genre
    genre_songs = df[~df.index.isin(selected_indices) & df['genre_list'].apply(lambda x: genre in x)]
    
    # Separate single-tag and multi-tag songs
    single_tag_songs = genre_songs[genre_songs['genre_count'] == 1]
    multi_tag_songs = genre_songs[genre_songs['genre_count'] > 1]
    
    # Sample up to 500 single-tag songs
    num_single_available = len(single_tag_songs)
    num_single_to_sample = min(num_single_available, desired_per_genre_single)
    if num_single_to_sample > 0:
        sampled_single = single_tag_songs.sample(n=num_single_to_sample, random_state=42)
        sampled_single_indices = sampled_single.index
        selected_indices.update(sampled_single_indices)
        genre_counts[genre]['single'] = num_single_to_sample
    else:
        sampled_single_indices = []
    
    # Sample up to 500 multi-tag songs
    num_multi_available = len(multi_tag_songs)
    num_multi_to_sample = min(num_multi_available, desired_per_genre_multi)
    if num_multi_to_sample > 0:
        sampled_multi = multi_tag_songs.sample(n=num_multi_to_sample, random_state=42)
        sampled_multi_indices = sampled_multi.index
        selected_indices.update(sampled_multi_indices)
        genre_counts[genre]['multi'] = num_multi_to_sample
    else:
        sampled_multi_indices = []
    
    # Debugging output per genre
    total_samples = num_single_to_sample + num_multi_to_sample
    print(f"Genre: {genre}")
    print(f"  Single-tag samples: {num_single_to_sample}/{desired_per_genre_single}")
    print(f"  Multi-tag samples: {num_multi_to_sample}/{desired_per_genre_multi}")
    print(f"  Total samples for genre: {total_samples}")
    print("-" * 40)

# Create the final sampled dataset
df_sampled = df.loc[list(selected_indices)]

# Save the reduced and stratified dataset
df_sampled.to_csv('../../datasets/jamendo/metadata/reduced_audio_dataset_14_genres.csv', index=False)

print(f"Sampled dataset created with {len(df_sampled)} unique songs.")

Genre: House
  Single-tag samples: 250/250
  Multi-tag samples: 250/250
  Total samples for genre: 500
----------------------------------------
Genre: Rock
  Single-tag samples: 250/250
  Multi-tag samples: 250/250
  Total samples for genre: 500
----------------------------------------
Genre: Elektronika
  Single-tag samples: 250/250
  Multi-tag samples: 250/250
  Total samples for genre: 500
----------------------------------------
Genre: Pop
  Single-tag samples: 250/250
  Multi-tag samples: 250/250
  Total samples for genre: 500
----------------------------------------
Genre: Hip-Hop
  Single-tag samples: 250/250
  Multi-tag samples: 250/250
  Total samples for genre: 500
----------------------------------------
Genre: Techno
  Single-tag samples: 0/250
  Multi-tag samples: 250/250
  Total samples for genre: 250
----------------------------------------
Genre: Reggae
  Single-tag samples: 250/250
  Multi-tag samples: 250/250
  Total samples for genre: 500
----------------------------

In [6]:
data = pd.read_csv('../../datasets/jamendo/metadata/reduced_audio_dataset_14_genres.csv')

# Create a new column that counts the number of genres in each row
data['genre_count'] = data['TAGS'].apply(lambda x: len(x.split(';')))

# Count the occurrences based on the number of genres (1 for single, more than 1 for multiple)
genre_count_summary = data['genre_count'].value_counts().sort_index()

# Separate the counts into single-genre and multiple-genre categories for clarity
single_genre_count = genre_count_summary.get(1, 0)  # Tracks with exactly one genre
multiple_genre_count = genre_count_summary[genre_count_summary.index > 1].sum()  # Tracks with multiple genres

# Prepare results
result = {
    "Single Genre Count": single_genre_count,
    "Multiple Genre Count": multiple_genre_count
}

result

{'Single Genre Count': 3250, 'Multiple Genre Count': 3500}

In [7]:
# Expand the TAGS column by separating multi-genre entries
data_genre_expanded = data.copy()
data_genre_expanded['TAGS'] = data_genre_expanded['TAGS'].str.split(';')
data_genre_expanded = data_genre_expanded.explode('TAGS')

# Now, calculate the average duration per genre
average_duration_per_genre = data_genre_expanded.groupby('TAGS')['DURATION'].mean()

average_duration_per_genre

TAGS
Ambient        288.773966
Blues          220.666609
Elektronika    265.684827
Funk           226.634473
Hip-Hop        194.379613
House          271.742072
Jazz           243.958142
Klasyczna      255.262242
Latin          209.716390
Metal          248.561871
Pop            223.896646
Reggae         229.313321
Rock           236.689291
Techno         293.077184
Name: DURATION, dtype: float64

## Copying remained songs into new directory

In [8]:
import pandas as pd
import os
import shutil

# Read the metadata CSV file
data = pd.read_csv('../../datasets/jamendo/metadata/reduced_audio_dataset_14_genres.csv')

# Original directory containing the folders '00' to '99'
original_directory = '../../datasets/jamendo/original_audio/'

# New directory to copy the files into
new_directory = '../../datasets/jamendo/reduced_audio_500/'

# Create the new directory if it doesn't exist
if not os.path.exists(new_directory):
    os.makedirs(new_directory)

# Iterate over each row in the DataFrame
for index, row in data.iterrows():
    # Get the PATH column, which has the relative path to the .mp3 file
    relative_path = row['PATH']  # e.g., '14/214.mp3'

    # Construct the full source path
    source_path = os.path.join(original_directory, relative_path)

    # Construct the destination path in the new directory
    dest_path = os.path.join(new_directory, relative_path)

    # Ensure the destination subdirectory exists
    dest_subdir = os.path.dirname(dest_path)
    if not os.path.exists(dest_subdir):
        os.makedirs(dest_subdir)

    # Copy the file if it exists
    if os.path.exists(source_path):
        shutil.copy2(source_path, dest_path)
    else:
        print(f"File not found: {source_path}")