# Reducing the dataset to roughly n (500) occurrences per genre, where possible

In [1]:
import pandas as pd

## Single-tag vs. multi-tag track counts

In [2]:
# Load the metadata table that is analysed in this section.
data = pd.read_csv('../../datasets/jamendo/metadata/audio_dataset_14_genres.csv')

# Number of ';'-separated genre tags attached to each track.
data['genre_count'] = data['TAGS'].map(lambda tags: len(tags.split(';')))

# How many tracks carry 1, 2, 3, ... tags, ordered by the number of tags.
genre_count_summary = data['genre_count'].value_counts().sort_index()

# Tracks with exactly one tag (0 if there are none) versus all tracks with more than one.
has_several_tags = genre_count_summary.index > 1
single_genre_count = genre_count_summary.get(1, 0)
multiple_genre_count = genre_count_summary[has_several_tags].sum()

# Summary displayed as the cell output.
result = {
    "Single Genre Count": single_genre_count,
    "Multiple Genre Count": multiple_genre_count,
}

result

{'Single Genre Count': np.int64(30169),
 'Multiple Genre Count': np.int64(16839)}

In [5]:
# Per-genre breakdown: how often each genre appears as a track's only tag
# versus together with other tags. Relies on the 'genre_count' column that the
# preceding cell added to `data`.

# One row per (track, genre): split the ';'-separated TAGS and explode the lists.
data_expanded = data.assign(TAGS=data['TAGS'].str.split(';')).explode('TAGS')

# Label each exploded row by whether its source track had exactly one genre.
data_expanded['genre_type'] = (
    data_expanded['genre_count']
    .eq(1)
    .map({True: 'Single', False: 'Multiple'})
)

# Rows: genre; columns: Multiple / Single; cells: row counts (0 where a pair never occurs).
genre_breakdown = (
    data_expanded
    .groupby(['TAGS', 'genre_type'])
    .size()
    .unstack(fill_value=0)
)

# Shown as the cell output.
genre_breakdown

genre_type,Multiple,Single
TAGS,Unnamed: 1_level_1,Unnamed: 2_level_1
Ambient,4314,3256
Blues,798,347
Elektronika,8848,9062
Funk,1095,300
Hip-Hop,1143,1514
House,1524,645
Jazz,1590,1207
Klasyczna,1843,5345
Latin,357,347
Metal,798,637


## Average track duration per genre

In [6]:
# One row per (track, genre), so a multi-genre track contributes to each of its genres.
data_genre_expanded = data.assign(TAGS=data['TAGS'].str.split(';')).explode('TAGS')

# Mean of the DURATION column within each genre.
average_duration_per_genre = (
    data_genre_expanded
    .groupby('TAGS')['DURATION']
    .mean()
)

average_duration_per_genre

TAGS
Ambient        293.050462
Blues          221.648559
Elektronika    269.691971
Funk           228.744158
Hip-Hop        198.318705
House          273.232319
Jazz           255.151662
Klasyczna      215.848511
Latin          205.225568
Metal          248.211220
Pop            222.198187
Reggae         232.878956
Rock           229.571112
Techno         291.133915
Name: DURATION, dtype: float64

## Dataset reduction - Save reduced metadata file

In [14]:
import pandas as pd
import numpy as np

# Greedy multi-label down-sampling of the metadata table: keep a track only if
# none of its genres has already reached its cap, then save the kept tracks.

# Tunable parameters of the reduction.
DEFAULT_LIMIT = 500                                     # cap for every genre ...
LIMIT_OVERRIDES = {'Techno': 200, 'Elektronika': 700}   # ... except these
PRIORITY_GENRE = 'Techno'                               # tracks with this genre are considered first

# 1. Load the full metadata table.
df = pd.read_csv('../../datasets/jamendo/metadata/audio_dataset_15_genres.csv')

# Always rebuild 'genre_list' from TAGS. A 'genre_list' column that is already
# present was read from the CSV and is therefore plain text (e.g. the string
# "['Rock', 'Pop']"), not a list: iterating it would yield single characters
# and the membership tests below would silently become substring tests.
df['genre_list'] = df['TAGS'].str.split(';')

# Every distinct genre label that occurs in the data.
all_genres = {g for glist in df['genre_list'] for g in glist}

# 2. Per-genre caps on the number of selected tracks.
label_limits = {g: DEFAULT_LIMIT for g in all_genres}
label_limits.update(LIMIT_OVERRIDES)

# 3. Shuffle reproducibly, then split into "has the priority genre" / "does not".
#    Priority tracks go first (presumably so that its smaller cap can be filled
#    before co-occurring genres reach their own caps).
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
has_priority = (
    df_shuffled['genre_list']
    .apply(lambda genres: PRIORITY_GENRE in genres)
    .astype(bool)  # keeps the mask boolean even when the frame is empty
)
df_techno = df_shuffled[has_priority]
df_non_techno = df_shuffled[~has_priority]

# 4. Priority tracks first, everything else after, with a fresh 0..n-1 index.
df_prioritized = pd.concat([df_techno, df_non_techno], ignore_index=True)

# 5. Single greedy pass: accept a track only if every one of its genres is
#    still below its cap, and count the track once for each of those genres.
genre_counts = {g: 0 for g in all_genres}
keep_mask = []
for track_genres in df_prioritized['genre_list']:
    # De-duplicate so a tag repeated inside one track is not counted twice.
    unique_genres = set(track_genres)
    can_add = all(genre_counts[g] < label_limits[g] for g in unique_genres)
    keep_mask.append(can_add)
    if can_add:
        for g in unique_genres:
            genre_counts[g] += 1

# Select with a boolean mask instead of rebuilding the frame from row Series:
# column dtypes are preserved and the columns survive even if nothing is kept.
df_sampled = df_prioritized[pd.Series(keep_mask, index=df_prioritized.index, dtype=bool)]

# 6. Save and report.
# NOTE: 'genre_list' is written as well, as the text form of each list; code
# that reads this file back should derive the lists from TAGS again.
output_path = '../../datasets/jamendo/metadata/reduced_audio_dataset_15_genres.csv'
df_sampled.to_csv(output_path, index=False)

print(f"Zapisano {len(df_sampled)} utworów do pliku: {output_path}")
print("\nOstateczne statystyki (liczba przydzielonych piosenek):")
for g in sorted(label_limits.keys()):
    # genre_counts holds the number of selected tracks per genre; a genre that
    # only appears in LIMIT_OVERRIDES and not in the data reports 0.
    c = genre_counts.get(g, 0)
    print(f"  {g}: {c} (limit: {label_limits[g]})")


Zapisano 5561 utworów do pliku: ../../datasets/jamendo/metadata/reduced_audio_dataset_15_genres.csv

Ostateczne statystyki (liczba przydzielonych piosenek):
  Ambient: 500 (limit: 500)
  Blues: 394 (limit: 500)
  Elektronika: 700 (limit: 700)
  Folk: 500 (limit: 500)
  Funk: 475 (limit: 500)
  Hip-Hop: 500 (limit: 500)
  House: 500 (limit: 500)
  Jazz: 500 (limit: 500)
  Klasyczna: 500 (limit: 500)
  Latin: 289 (limit: 500)
  Metal: 500 (limit: 500)
  Pop: 500 (limit: 500)
  Reggae: 500 (limit: 500)
  Rock: 500 (limit: 500)
  Techno: 200 (limit: 200)


In [15]:
# Re-run the single- vs. multi-genre count on the reduced metadata file.
data = pd.read_csv('../../datasets/jamendo/metadata/reduced_audio_dataset_15_genres.csv')

# Tags are ';'-separated, so a track has one more tag than it has separators.
data['genre_count'] = data['TAGS'].map(lambda tags: tags.count(';') + 1)

# Number of tracks for each tag count, ordered by tag count.
genre_count_summary = data['genre_count'].value_counts().sort_index()

# Exactly one tag (0 if no such track exists) versus more than one tag.
single_genre_count = genre_count_summary.get(1, 0)
multi_tag_rows = genre_count_summary.index > 1
multiple_genre_count = genre_count_summary.loc[multi_tag_rows].sum()

# Summary displayed as the cell output.
result = {
    "Single Genre Count": single_genre_count,
    "Multiple Genre Count": multiple_genre_count,
}

result

{'Single Genre Count': np.int64(4305), 'Multiple Genre Count': np.int64(1256)}

In [16]:
# Split the ';'-separated TAGS and explode them so every (track, genre) pair is its own row.
genre_lists = data['TAGS'].str.split(';')
data_genre_expanded = data.assign(TAGS=genre_lists).explode('TAGS')

# Mean of the DURATION column for each genre.
duration_by_genre = data_genre_expanded.groupby('TAGS')['DURATION']
average_duration_per_genre = duration_by_genre.mean()

average_duration_per_genre

TAGS
Ambient        278.741400
Blues          211.855076
Elektronika    271.900000
Folk           219.570600
Funk           223.587789
Hip-Hop        195.770400
House          258.054200
Jazz           249.357600
Klasyczna      224.215200
Latin          204.001384
Metal          246.779200
Pop            233.027200
Reggae         234.762000
Rock           230.097600
Techno         288.566000
Name: DURATION, dtype: float64

## Copying the remaining songs into a new directory (not necessary — the metadata file can be used instead)

In [None]:
# NOTE: This cell is intentionally disabled (every line is commented out).
# Uncommented, it would copy each file listed in the PATH column of the reduced
# metadata CSV from original_directory into new_directory, creating
# sub-directories as needed and printing every source file that is missing.

# import pandas as pd
# import os
# import shutil

# # Read the metadata CSV file
# data = pd.read_csv('../../datasets/jamendo/metadata/reduced_audio_dataset_15_genres.csv')

# # Original directory containing the folders '00' to '99'
# original_directory = '../../datasets/jamendo/original_audio/'

# # New directory to copy the files into
# new_directory = '../../datasets/jamendo/reduced_audio_500/'

# # Create the new directory if it doesn't exist
# if not os.path.exists(new_directory):
#     os.makedirs(new_directory)

# # Iterate over each row in the DataFrame
# for index, row in data.iterrows():
#     # Get the PATH column, which has the relative path to the .mp3 file
#     relative_path = row['PATH']  # e.g., '14/214.mp3'

#     # Construct the full source path
#     source_path = os.path.join(original_directory, relative_path)

#     # Construct the destination path in the new directory
#     dest_path = os.path.join(new_directory, relative_path)

#     # Ensure the destination subdirectory exists
#     dest_subdir = os.path.dirname(dest_path)
#     if not os.path.exists(dest_subdir):
#         os.makedirs(dest_subdir)

#     # Copy the file if it exists
#     if os.path.exists(source_path):
#         shutil.copy2(source_path, dest_path)
#     else:
#         print(f"File not found: {source_path}")