In [30]:
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import os
from collections import Counter
import h5py
import requests
import reverse_geocoder as rg
from utils.location_processor import LocationProcessor 
import os


In [191]:
msdi_df = pd.read_csv(f"input/MSDI_dataset/msdi_mapping_and_metadata.csv",  header=0)
mumu_df = pd.read_csv(f"input/MuMu_dataset/mumu_mapping_msdi.csv",  header=0)

msdi_df['name_image_msdi'] = msdi_df['name_image']
msdi_df.drop(columns=['name_image','artist_longitude','artist_latitude','song_id'], inplace=True)
mumu_df.drop(columns=['artist_latitude','artist_longitude','track_id','MSD_track_id_si','amazon_id_si'], inplace=True)


In [215]:
merged_df = pd.merge(msdi_df, mumu_df, left_on='track_id', right_on='MSD_track_id', how='outer')
merged_df.drop(columns=['album_index','set','song_id'], inplace=True)

merged_df['m_id'] = np.where(merged_df['MSD_track_id'].notna(), merged_df['MSD_track_id'], merged_df['track_id'])
merged_df['m_name_image'] = np.where(merged_df['name_image'].notna(), merged_df['name_image'], merged_df['name_image_msdi'])
merged_df['m_url'] = np.where(merged_df['imUrl'].notna(), merged_df['imUrl'], merged_df['image_url'])
merged_df['m_release'] = np.where(merged_df['release_y'].notna(), merged_df['release_y'], merged_df['release_x'])
merged_df['m_title'] = np.where(merged_df['title_y'].notna(), merged_df['title_y'], merged_df['title_x'])
merged_df['m_year'] = np.where(merged_df['year_y'].notna(), merged_df['year_y'], merged_df['year_x'])
merged_df['m_single_genre'] = np.where(merged_df['genre'].notna(), merged_df['genre'], merged_df['genres_si'])
merged_df['m_artist_id'] = np.where(merged_df['artist_id_y'].notna(), merged_df['artist_id_y'], merged_df['artist_id_x'])
merged_df['m_artist_location'] = np.where(merged_df['artist_location_y'].notna(), merged_df['artist_location_y'], merged_df['artist_location_x'])
merged_df['m_country'] = np.where(merged_df['country_y'].notna(), merged_df['country_y'], merged_df['country_x'])
merged_df['m_country_name'] = np.where(merged_df['country_name_y'].notna(), merged_df['country_name_y'], merged_df['country_name_x'])
merged_df['m_artist_location_country'] = np.where(merged_df['artist_location_country_y'].notna(), merged_df['artist_location_country_y'], merged_df['artist_location_country_x'])
merged_df['m_artist_location_country'] = np.where(merged_df['artist_location_country_y'].notna(), merged_df['artist_location_country_y'], merged_df['artist_location_country_x'])
merged_df['m_artist_name'] = np.where(merged_df['artist_name_y'].notna(), merged_df['artist_name_y'], merged_df['artist_name_x'])

merged_df.drop(columns=['MSD_track_id','track_id','msd_track_id','name_image','name_image_msdi',
                        'artist_location_country_y','artist_location_country_x',
                        'year_y','year_x',
                        'country_name_y','country_name_x',
                        'artist_name_y','artist_name_x',
                        'artist_location_y','artist_location_x',
                        'country_y','country_x',
                        'release_y','release_x',
                        'title_y','title_x',
                        'imUrl','image_url',
                        'artist_id_y','msd_artist_id','artist_id_x',
                        'genres_si','genre',
                       ], inplace=True)


In [None]:
other_columns = [col for col in merged_df.columns if not col.startswith('genre_')]
genre_columns = [col for col in merged_df.columns if col.startswith('genre_')]

merged_df[genre_columns] = merged_df[genre_columns].fillna(0.0)
merged_df['genres_list'].fillna(1, inplace=True)

new_column_order = other_columns + genre_columns

df_metadata = merged_df[other_columns]
df_full= merged_df[new_column_order]

df_metadata.to_csv(f'input/merged_dataset/merged_metadata_not_pivoted.csv', index=False)
df_full.to_csv(f'input/merged_dataset/merged_metadata_pivoted.csv', index=False)

## Stats

In [242]:
single_label_genre = df_full['m_single_genre'].value_counts()
single_label_genre_df = single_label_genre.reset_index()
single_label_genre_df['Percentage'] = round((single_label_genre_df['count'] * 100) / sum(single_label_genre_df['count']), 2)
single_label_genre_df

Unnamed: 0,m_single_genre,count,Percentage
0,Rock,4264,22.53
1,Electronic,2544,13.44
2,Pop,1967,10.39
3,Jazz,1591,8.41
4,Metal,1252,6.61
5,Rap,1232,6.51
6,RnB,970,5.12
7,Country,943,4.98
8,Reggae,845,4.46
9,Folk,572,3.02


Now we analyse the multi label classification for this, we count how many genres are defined in the dataset

In [287]:
genre_counts = {col: df_full[col].sum() for col in df_full.columns if col.startswith('genre_')}

print(f"Number of genres individually {len(genre_counts)}")

Number of genres individually 446


Then we calculate how many images corresponds to which genres, the percentage here doesn't sum one because is over the total images, and an image can have multiple genres. Most of the images are classified as Pop (59.591%) followed by Rock (44.467%)

In [316]:
unique_genres = set('genre_' + df_full['m_single_genre'].dropna().unique())
unique_genres.update([col for col in df_full.columns if col.startswith('genre_')])

genre_counts = {}

# Loop through each unique genre
for genre in unique_genres:
    genre_column = genre.replace('genre_', '')
    
    genre_count_m_single = df_full['m_single_genre'].str.contains(genre_column).sum()
    
    genre_count_one_hot = df_full[genre].sum() if genre in df_full.columns else 0
    
    genre_counts[genre_column] = genre_count_m_single + genre_count_one_hot

genre_counts_df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Count']).sort_values('Count', ascending=False)



  genre_count_m_single = df_full['m_single_genre'].str.contains(genre_column).sum()


In [318]:
genre_counts_df
genre_counts_df_sorted = genre_counts_df.sort_values('Count', ascending=False)
genre_counts_df_sorted.reset_index(drop=True, inplace=True)
genre_counts_df_sorted['Percentage'] = round((genre_counts_df_sorted['Count'] * 100) / len(df_full), 3)
genre_counts_df_sorted

Unnamed: 0,Genre,Count,Percentage
0,Pop,27915.0,59.591
1,Rock,20830.0,44.467
2,Alternative Rock,8398.0,17.928
3,World Music,6333.0,13.519
4,Jazz,6077.0,12.973
...,...,...,...
445,Tangos,1.0,0.002
446,Ukraine,1.0,0.002
447,Tierra Caliente,1.0,0.002
448,Polynesian Music,1.0,0.002


Now we check how many labels a image have and check the distribution of this, most albums are classified with 4, 5 and 6 genres, this can help to discard albums with more that a certain number of genres

In [321]:
number_of_multi_defined_genres = df_full['genres_list'].value_counts()
number_of_multi_defined_genres_df = number_of_multi_defined_genres.reset_index()
number_of_multi_defined_genres_df['Percentage'] = round((number_of_multi_defined_genres_df['count'] * 100) / len(df_full), 3)
number_of_multi_defined_genres_df

Unnamed: 0,genres_list,count,Percentage
0,1.0,16705,35.661
1,4.0,4267,9.109
2,5.0,4232,9.034
3,6.0,4133,8.823
4,7.0,3577,7.636
5,3.0,3064,6.541
6,8.0,2756,5.883
7,2.0,2078,4.436
8,9.0,1971,4.208
9,10.0,1497,3.196


In [324]:
year_counts = df_full['m_year'].value_counts()
year_counts_df = year_counts.reset_index()
year_counts_df['Percentage'] = round((year_counts_df['count'] * 100) / len(df_full), 3)
year_counts_df

Unnamed: 0,m_year,count,Percentage
0,0.0,9609,20.513
1,2007.0,2290,4.889
2,2006.0,2269,4.844
3,2005.0,2253,4.810
4,2003.0,1929,4.118
...,...,...,...
77,1938.0,1,0.002
78,1922.0,1,0.002
79,1924.0,1,0.002
80,1933.0,1,0.002


In [326]:
artirst_counts = df_full['m_artist_id'].value_counts()
artirst_counts_df = artirst_counts.reset_index()
artirst_counts_df['Percentage'] = round((artirst_counts_df['count'] * 100) / len(df_full), 3)
artirst_counts_df = pd.merge(left=artirst_counts_df, right=df_full, left_on='m_artist_id', right_on='m_artist_id', how='left').drop_duplicates(subset='m_artist_id')
artirst_counts_df= artirst_counts_df[['m_artist_id','m_artist_name','count','Percentage']]
artirst_counts_df

Unnamed: 0,m_artist_id,m_artist_name,count,Percentage
0,ARMM6WZ1187FB4958D,Willie Nelson / Toots Hibbert,38,0.081
38,ARH861H1187B9B799E,Johnny Cash,30,0.064
68,AR6Q4T91187B995616,Sonny Rollins,27,0.058
95,ARPGS671187B996060,McCoy Tyner,27,0.058
122,AR9I1L41187FB37F0E,Harry Connick_ Jr.,27,0.058
...,...,...,...,...
46839,ARFXV9Q1187B9A7010,T-Spoon,1,0.002
46840,ARF3XJD1187FB38DE0,Cristy Lane,1,0.002
46841,AR76EV61187FB541DD,Ewigkeit,1,0.002
46842,AR8AR6Q1187FB4A64F,El Perro Del Mar,1,0.002


In [327]:
country_counts = df_full['m_artist_location_country'].value_counts()
country_counts_df = country_counts.reset_index()
country_counts_df['Percentage'] = round((country_counts_df['count'] * 100) / len(df_full), 3)
country_counts_df

Unnamed: 0,m_artist_location_country,count,Percentage
0,United States,15004,32.030
1,United Kingdom,4339,9.263
2,Canada,1753,3.742
3,Germany,691,1.475
4,Sweden,518,1.106
...,...,...,...
135,British Indian Ocean Territory,1,0.002
136,China,1,0.002
137,Uzbekistan,1,0.002
138,Bulgaria,1,0.002


### Image stats

In [336]:
folder_path = 'input/merged_dataset/images_merged'
dimensions = []
resolutions = []
counter = 0
for filename in os.listdir(folder_path):

    file_path = os.path.join(folder_path, filename)

    if file_path.lower().endswith(('.png')):
        try:
            # Open the image and get its dimensions
            with Image.open(file_path) as img:
                width, height = img.size
                # Add the dimensions to the set
                dimensions.append((width, height))
                dpi = img.info.get('dpi', None)
                if dpi:
                    resolutions.append(dpi)
                counter += 1
        except Exception as e:
            print(f"Error processing {filename}: {e}")



In [337]:
tuple_counts = Counter(resolutions)
tuple_dimensions = Counter(dimensions)
print(f"Number of images {counter}")
print(f"Number of images with available defined {len(resolutions)}")


Number of images 46844
Number of images with available defined 20622


In [330]:
dimensions_data = [(k[0], k[1], v) for k, v in tuple_dimensions.items()]
dimensions_df = pd.DataFrame(dimensions_data, columns=['x', 'y', 'count'])
dimensions_df = dimensions_df.sort_values('count', ascending=False)
dimensions_df['Percentage'] = round((dimensions_df['count'] * 100) / len(dimensions), 3)

In [333]:
dimensions_df

Unnamed: 0,x,y,count,Percentage
1,200,200,15688,33.490
0,300,300,13627,29.090
8,300,299,3089,6.594
3,130,130,1755,3.746
19,300,298,1462,3.121
...,...,...,...,...
313,160,155,1,0.002
86,300,231,1,0.002
311,300,131,1,0.002
310,228,202,1,0.002


In [335]:
resolutions_data = [(k[0], k[1], v) for k, v in tuple_counts.items()]
resolution_df = pd.DataFrame(resolutions_data, columns=['x', 'y', 'count'])
resolution_df = resolution_df.sort_values('count', ascending=False)
resolution_df['Percentage'] = round((resolution_df['count'] * 100) / len(resolutions), 3)
resolution_df

Unnamed: 0,x,y,count,Percentage
0,72,72,11135,53.996
1,300,300,7517,36.451
2,96,96,1747,8.472
5,200,200,61,0.296
7,100,100,56,0.272
8,150,150,20,0.097
9,600,600,11,0.053
4,762,762,9,0.044
15,400,400,8,0.039
11,157,157,6,0.029
