## Imports

In [65]:
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import os
from collections import Counter
import h5py
import requests
import reverse_geocoder as rg

### Functions

In [26]:
def get_dataframes_from_msd(input_path, dataset_name):
    """Load the ``songs`` table of one group of an HDF5 file into a DataFrame.

    Parameters
    ----------
    input_path : path of the HDF5 file; it is opened read-only.
    dataset_name : name of the group whose ``songs`` dataset is read
        (the dataset is looked up as ``f'{dataset_name}/songs'``).

    Returns
    -------
    pandas.DataFrame
        The full content of the dataset. Columns carry the dataset's field
        names when its dtype has named fields; otherwise pandas' default
        column labels are used.
    """
    with h5py.File(input_path, 'r') as h5_file:
        songs = h5_file[f'{dataset_name}/songs']

        # Named fields (if any) become the column labels; None lets pandas
        # fall back to its default labels.
        field_names = songs.dtype.names if songs.dtype.fields is not None else None

        # Slicing with [:] reads the whole dataset into memory as a NumPy array.
        records = songs[:]
        frame = pd.DataFrame(records, columns=field_names)

    return frame

In [27]:
def download_image(image_url, local_file_name, timeout=30):
    """Download ``image_url`` and write the response body to ``local_file_name``.

    Failures (network errors, timeouts, non-200 responses) are reported with
    ``print`` and no file is written, so a caller looping over many URLs is
    not aborted by a single bad one.

    Parameters
    ----------
    image_url : URL fetched with an HTTP GET.
    local_file_name : destination path; opened in binary write mode.
    timeout : seconds passed to ``requests.get`` as its ``timeout``. Without
        a timeout a stalled connection would block the download forever.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Error: Failed to download image. {error}")
        return

    if response.status_code != 200:
        print(f"Error: Failed to download image. Status code: {response.status_code}")
        return

    with open(local_file_name, 'wb') as file:
        file.write(response.content)

def save_msdi_images_from_dataframe(path, df):
    """Download the image of every row of ``df`` into the folder ``path``.

    Each row must provide ``msd_track_id``, ``album_index`` and ``image_url``.
    The image is stored as ``<path>/<album_index>_<msd_track_id>.png``.
    """
    for _, record in df.iterrows():
        track_id, album_idx, url = (
            record[column] for column in ('msd_track_id', 'album_index', 'image_url')
        )
        download_image(url, f"{path}/{album_idx}_{track_id}.png")

def save_mumu_images_from_dataframe(path, df, resume_after_index=27115):
    """Download MuMu cover images for the rows of ``df`` into the folder ``path``.

    Each image is stored as ``<path>/<row index>_<amazon_id>.png``.

    Parameters
    ----------
    path : destination folder used as the prefix of every file name.
    df : frame with ``amazon_id`` and ``imUrl`` columns. Its index labels are
        used in the file names and compared against ``resume_after_index``.
    resume_after_index : only rows whose index is strictly greater than this
        value are downloaded. The default (27115) keeps the cut-off that used
        to be hard-coded here — presumably the point where an interrupted
        download was resumed (TODO confirm). Pass ``None`` to download every row.
    """
    for index, row in df.iterrows():
        # Skip rows at or below the resume point (no skipping when it is None).
        if resume_after_index is not None and not index > resume_after_index:
            continue

        track_name = row['amazon_id']
        image_url = row['imUrl']
        local_file_name = f"{path}/{index}_{track_name}.png"
        download_image(image_url, local_file_name)


### Million Song Dataset preprocessing

Here we load each of the sub-datasets stored in the h5 file into its own DataFrame.

In [46]:
# Path of the Million Song Dataset summary file (HDF5).
msd_input_path = "input/msd_summary_file.h5"

# Read the 'songs' table of each of the three groups, in the same order as before.
musicbrainz_df, analysis_df, metadata_df = (
    get_dataframes_from_msd(msd_input_path, group_name)
    for group_name in ('musicbrainz', 'analysis', 'metadata')
)

Get the relevant columns and decode them

In [47]:
def _decode_utf8(column):
    """Return `column` with its bytes values decoded as UTF-8; other values are left as they are.

    Skipping values that are not bytes keeps this cell safe to re-run on
    columns that were already decoded.
    """
    return column.map(lambda value: value.decode('utf-8') if isinstance(value, bytes) else value)


# Keep only the columns used downstream. `.copy()` gives independent frames, so the
# column assignments below never write into a slice of the loaded tables.
musicbrainz_df = musicbrainz_df[['year']]

analysis_df = analysis_df[['track_id']].copy()
analysis_df['track_id'] = _decode_utf8(analysis_df['track_id'])

metadata_df = metadata_df[['artist_id', 'artist_name', 'release', 'song_id', 'title', 'artist_location','artist_latitude','artist_longitude']].copy()

# Decode the text columns; the latitude/longitude columns are not decoded.
for text_column in ['artist_id', 'artist_name', 'release', 'song_id', 'title', 'artist_location']:
    metadata_df[text_column] = _decode_utf8(metadata_df[text_column])

In [38]:
# Inspect the musicbrainz table, reduced to its 'year' column.
musicbrainz_df

Unnamed: 0,year
0,2003
1,1995
2,2006
3,2003
4,0
...,...
999995,0
999996,0
999997,0
999998,0


In [None]:
# Inspect the analysis table, reduced to its decoded 'track_id' column.
analysis_df

In [None]:
# Inspect the metadata table (artist, release, song and location columns).
metadata_df

Now we concatenate them column-wise into a single DataFrame.

In [60]:
# Put the three tables side by side. Rows are matched on their index, so this relies on
# the three tables listing the songs in the same order — NOTE(review): confirm for the MSD summary file.
concatenated_df = pd.concat(
    objs=[analysis_df, musicbrainz_df, metadata_df],
    axis='columns',
)
concatenated_df

Unnamed: 0,track_id,year,artist_id,artist_name,release,song_id,title,artist_location,artist_latitude,artist_longitude
0,TRMMMYQ128F932D901,2003,ARYZTJS1187B98C555,Faster Pussy cat,Monster Ballads X-Mas,SOQMMHC12AB0180CB8,Silent Night,,,
1,TRMMMKD128F425225D,1995,ARMVN3U1187FB3A1EB,Karkkiautomaatti,Karkuteillä,SOVFVAK12A8C1350D9,Tanssi vaan,,,
2,TRMMMRX128F93187D9,2006,ARGEKB01187FB50750,Hudson Mohawke,Butter,SOGTUKN12AB017F4F1,No One Could Ever,"Glasgow, Scotland",55.8578,-4.24251
3,TRMMMCH128F425532C,2003,ARNWYLR1187B9B2F9C,Yerba Brava,De Culo,SOBNYVR12A8C13558C,Si Vos Querés,,,
4,TRMMMWA128F426B589,0,AREQDTE1269FB37231,Der Mystic,Rene Ablaze Presents Winter Sessions,SOHSBXH12A8C13B0DF,Tangle Of Aspens,,,
...,...,...,...,...,...,...,...,...,...,...
999995,TRYYYUS12903CD2DF0,0,AR7Z4J81187FB3FC59,Kiko Navarro,Pacha V.I.P.,SOTXAME12AB018F136,O Samba Da Vida,,,
999996,TRYYYJO128F426DA37,0,ART5FZD1187B9A7FCF,Kuldeep Manak,Naale Baba Lassi Pee Gya,SOXQYIQ12A8C137FBB,Jago Chhadeo,,,
999997,TRYYYMG128F4260ECA,0,ARZ3R6M1187B9AF750,Gabriel Le Mar,Dub_Connected: electronic music,SOHODZI12A8C137BB3,Novemba,GERMANY,,
999998,TRYYYDJ128F9310A21,0,ARCMCOK1187B9B1073,Elude,The Trance Collection Vol. 2,SOLXGOR12A81C21EB7,Faraday,,,


# MuMu dataset

In [56]:
# Load the Amazon album metadata of the MuMu dataset and print the number of
# non-null values in each column.
mumu_input_path = "input/MuMu_dataset"
json_file_path = mumu_input_path + '/amazon_metadata_MuMu.json'
df = pd.read_json(json_file_path)
print(df.count())



title         25729
price         27057
imUrl         31471
amazon_id     31471
related       31471
categories    31471
salesRank     31359
brand          4874
dtype: int64


In [57]:
# Keep the first row for each distinct cover URL, then print the per-column
# non-null counts again for comparison with the previous cell.
df_unique = df.drop_duplicates(subset=['imUrl'], keep='first')
print("-------- after duplicates deletion --------")
print(df_unique.count())

-------- after duplicates deletion --------
title         25507
price         26936
imUrl         31225
amazon_id     31225
related       31225
categories    31225
salesRank     31129
brand          4872
dtype: int64


In [58]:
# Download every cover image referenced in df_unique into the images_mumu folder.
# The call is left commented out, presumably so that re-running the notebook does not
# download the images again — uncomment it to rebuild the folder.
#save_mumu_images_from_dataframe(f'{mumu_input_path}/images_mumu',df_unique)

For this dataset we have two kinds of genre mappings: one for albums with a single genre and another for albums with multiple genres. To unify everything, both are joined to the album metadata on `amazon_id`.

In [61]:
# From the Amazon metadata only the album id and the cover URL are needed.
df_unique = df_unique.loc[:, ['amazon_id', 'imUrl']]

# Single-label annotations: one row per amazon_id, with '_si'-suffixed copies of
# the columns so they stay distinguishable after the joins.
df_single = (
    pd.read_csv(f'{mumu_input_path}/MuMu_dataset_single-label.csv', header=0)
    .drop_duplicates(subset='amazon_id')
    .assign(
        amazon_id_si=lambda frame: frame['amazon_id'],
        genres_si=lambda frame: frame['genres'],
        MSD_track_id_si=lambda frame: frame['MSD_track_id'],
    )
)
df_single_label = df_single[['amazon_id_si', 'genres_si', 'MSD_track_id_si']]

# Multi-label annotations: deduplicated on album_mbid (not amazon_id), with
# '_mu'-suffixed copies of the id and genres columns.
df_multi = (
    pd.read_csv(f'{mumu_input_path}/MuMu_dataset_multi-label.csv', header=0)
    .drop_duplicates(subset='album_mbid')
    .assign(
        amazon_id_mu=lambda frame: frame['amazon_id'],
        genres_mu=lambda frame: frame['genres'],
    )
)
df_multi_label = df_multi[['amazon_id_mu', 'genres_mu', 'MSD_track_id']]

In [62]:
# Left joins keep every album of df_unique; albums without a single-label or
# multi-label annotation get NaN in the corresponding columns.
df_joined = (
    df_unique
    .merge(df_single_label, how='left', left_on='amazon_id', right_on='amazon_id_si')
    .merge(df_multi_label, how='left', left_on='amazon_id', right_on='amazon_id_mu')
)

In [63]:
# Inspect the albums joined with their single-label and multi-label genres.
df_joined

Unnamed: 0,amazon_id,imUrl,amazon_id_si,genres_si,MSD_track_id_si,amazon_id_mu,genres_mu,MSD_track_id
0,1458389375,http://ecx.images-amazon.com/images/I/51fdvJLW...,,,,1458389375,"Jazz,Pop",TRCQFGP12903CB68CA
1,1591791065,http://ecx.images-amazon.com/images/I/51GGK0zo...,,,,1591791065,"New Age,Dance Pop,World Music,Pop,Classical",TRBQZDC128F42730B5
2,1906063443,http://ecx.images-amazon.com/images/I/51Li1pqK...,,,,1906063443,"Europe,Christian,Eastern Europe,Pop,Gypsy,Worl...",TROIGUJ128F4292F75
3,1929243766,http://ecx.images-amazon.com/images/I/51eW8XLF...,,,,1929243766,"Comedy & Spoken Word,Pop",TRFOCFX128F4290747
4,1930864159,http://ecx.images-amazon.com/images/I/41F5hqeP...,,,,1930864159,"Pop & Contemporary,Christian,Gospel,Pop",TRIRPDY128F4258BD6
...,...,...,...,...,...,...,...,...
31220,B00JVSIMLQ,http://ecx.images-amazon.com/images/I/21o%2B73...,B00JVSIMLQ,Latin Music,TRDKOZX128F9348101,B00JVSIMLQ,"Latin Music,Latin Pop",TRDKOZX128F9348101
31221,B00KDL9TTE,http://ecx.images-amazon.com/images/I/316q03mN...,,,,B00KDL9TTE,"Pop,Easy Listening",TRSFZAQ128F427F0E1
31222,B00KE5M5IQ,http://ecx.images-amazon.com/images/I/51UgnZe%...,B00KE5M5IQ,Latin Music,TRBXVJR128F425C9DF,B00KE5M5IQ,"Latin Music,Latin Pop",TRBXVJR128F425C9DF
31223,B00KG10ZXU,http://ecx.images-amazon.com/images/I/51obnT4n...,,,,B00KG10ZXU,"Broadway & Vocalists,Vocal Pop,Pop",TRWJYXQ128F4270764


In [64]:
# Attach the Million Song Dataset columns through the track id of the multi-label table.
# Re-running this cell without re-running the previous join merges the MSD columns a second time.
df_joined = df_joined.merge(
    concatenated_df,
    how='left',
    left_on='MSD_track_id',
    right_on='track_id',
)
df_joined

Unnamed: 0,amazon_id,imUrl,amazon_id_si,genres_si,MSD_track_id_si,amazon_id_mu,genres_mu,MSD_track_id,track_id,year,artist_id,artist_name,release,song_id,title,artist_location,artist_latitude,artist_longitude
0,1458389375,http://ecx.images-amazon.com/images/I/51fdvJLW...,,,,1458389375,"Jazz,Pop",TRCQFGP12903CB68CA,TRCQFGP12903CB68CA,0,AROC6N61187FB40B14,Steven Halpern_ Steven Halpern,Comfort Zone,SOHZONF12AB0185BA6,Comfort Zone (Part 1),,,
1,1591791065,http://ecx.images-amazon.com/images/I/51GGK0zo...,,,,1591791065,"New Age,Dance Pop,World Music,Pop,Classical",TRBQZDC128F42730B5,TRBQZDC128F42730B5,0,ARY92411187FB46E82,Jai Uttal,Mondo Rama,SOFBESN12AF729DE2F,Shri Krishna,California - SF,37.77916,-122.42005
2,1906063443,http://ecx.images-amazon.com/images/I/51Li1pqK...,,,,1906063443,"Europe,Christian,Eastern Europe,Pop,Gypsy,Worl...",TROIGUJ128F4292F75,TROIGUJ128F4292F75,2002,AREXUHS1187B9A763B,Musafir,Barsaat,SOWDAAL12A8C13DB67,Barish,,,
3,1929243766,http://ecx.images-amazon.com/images/I/51eW8XLF...,,,,1929243766,"Comedy & Spoken Word,Pop",TRFOCFX128F4290747,TRFOCFX128F4290747,0,AR8TFCF1187B98FC82,John Pinette,Making Lite Of Myself,SOTVHJG12A58A7C84F,Toilet Paper,California - LA,34.05349,-118.24532
4,1930864159,http://ecx.images-amazon.com/images/I/41F5hqeP...,,,,1930864159,"Pop & Contemporary,Christian,Gospel,Pop",TRIRPDY128F4258BD6,TRIRPDY128F4258BD6,0,ARAPI451187B9B6E6F,John Michael Talbot,The Quiet,SOCHZQH12A8C137745,Sunset,"Oklahoma City, OK",35.47200,-97.52033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31220,B00JVSIMLQ,http://ecx.images-amazon.com/images/I/21o%2B73...,B00JVSIMLQ,Latin Music,TRDKOZX128F9348101,B00JVSIMLQ,"Latin Music,Latin Pop",TRDKOZX128F9348101,TRDKOZX128F9348101,2004,ARHQS6Y1187FB468A3,Panteón Rococó,Un Panteon Muy Vivo,SOCRSKK12AB018528D,Cumbia Del Olvido,,,
31221,B00KDL9TTE,http://ecx.images-amazon.com/images/I/316q03mN...,,,,B00KDL9TTE,"Pop,Easy Listening",TRSFZAQ128F427F0E1,TRSFZAQ128F427F0E1,1995,ARGR9FY1187FB4D0E7,Aberdeen,Fireworks & Other Singles,SOBNAVF12A8C13E330,Fireworks,"Poza Rica, MX",,
31222,B00KE5M5IQ,http://ecx.images-amazon.com/images/I/51UgnZe%...,B00KE5M5IQ,Latin Music,TRBXVJR128F425C9DF,B00KE5M5IQ,"Latin Music,Latin Pop",TRBXVJR128F425C9DF,TRBXVJR128F425C9DF,2000,ARCEXLE1187FB3A93E,Marisa Monte/Par.Especial:Arnaldo Antunes,Memorias Cronicas E Declaracoes De Amor Textos...,SOMSLZE12A8C137217,Gentileza,,,
31223,B00KG10ZXU,http://ecx.images-amazon.com/images/I/51obnT4n...,,,,B00KG10ZXU,"Broadway & Vocalists,Vocal Pop,Pop",TRWJYXQ128F4270764,TRWJYXQ128F4270764,1998,ARH6XJQ1187B989438,Jerry Vale,Sings The Great Italian Hits,SOIGNNH12A8C136AD9,More,"New York, NY [The Bronx]",40.85715,-73.85678


Get the country of each artist from its latitude and longitude coordinates.

In [None]:
def get_country_from_coordinates(row):
    """Return the country code for a row's artist coordinates, or NaN.

    Parameters
    ----------
    row : mapping-like object (e.g. a DataFrame row) with the keys
        ``artist_latitude`` and ``artist_longitude``.

    Returns
    -------
    The ``cc`` field of the first ``rg.search`` result, or ``np.nan`` when the
    coordinates are missing, cannot be converted to floats, or the lookup fails.
    """
    try:
        lat = float(row['artist_latitude'])
        lon = float(row['artist_longitude'])
    except (ValueError, TypeError):
        # Values that are not numbers (None, arbitrary strings, ...) have no country.
        return np.nan

    if np.isnan(lat) or np.isnan(lon):
        return np.nan

    try:
        return rg.search((lat, lon))[0]['cc']
    except Exception:
        # Best effort: a failed lookup yields NaN. Catching Exception rather than
        # using a bare except keeps KeyboardInterrupt/SystemExit able to stop the
        # row-by-row apply.
        return np.nan

# Coerce both coordinate columns to numbers; values that cannot be parsed become NaN.
for coordinate_column in ('artist_latitude', 'artist_longitude'):
    df_joined[coordinate_column] = pd.to_numeric(df_joined[coordinate_column], errors='coerce')

# Look up the country row by row: each row is handed to get_country_from_coordinates.
df_joined['country'] = df_joined.apply(get_country_from_coordinates, axis='columns')

# df_joined now has a new column 'country' with the country code (NaN when unavailable).


Loading formatted geocoded file...


In [None]:
# Add the base name ("<row index>_<amazon_id>") that maps each album to its downloaded image.
# NOTE(review): the download helper names files after the index of the frame it is given
# (df_unique in the commented-out call). Confirm that index matches df_joined's index here,
# otherwise these names will not match the files on disk.
name = [
    f"{album_index}_{track_name}"
    for album_index, track_name in df_joined['amazon_id'].items()
]

df_joined['name_image'] = name


In [None]:
# Inspect the joined frame, now including the 'country' and 'name_image' columns.
df_joined

To get the genres of each album individually, the genres column is split on "," to create a list, and the list is then pivoted into one indicator column per genre.

In [None]:
# Split the comma-separated multi-label genres into a list, one list per album.
df_joined['genres_list'] = df_joined['genres_mu'].str.split(',')

# One row per (album, genre) pair, then one indicator column per genre ('genre_<name>').
df_exploded = df_joined.explode('genres_list')
df_genres = pd.get_dummies(df_exploded['genres_list'], prefix='genre')

# Sum the indicators back to one row per amazon_id and attach them to the albums.
df_genres_grouped = df_genres.groupby(df_exploded['amazon_id']).sum().reset_index()
df_final = pd.merge(df_joined, df_genres_grouped, on='amazon_id', how='left')

# Replace the list by its length (number of genres of the album). `.str.len()` yields NaN
# for albums without multi-label genres instead of raising a TypeError like `apply(len)` would.
df_final['genres_list'] = df_final['genres_list'].str.len()


In [None]:
# Inspect the final frame: the joined albums plus one 'genre_<name>' column per genre.
df_final

In [None]:
# Persist the final mapping as CSV (without the index) inside the MuMu input folder.
df_final.to_csv(f'{mumu_input_path}/mumu_mapping_msdi.csv', index=False)

## Stats

First we take the single-label annotations and look at their distribution. Dance & Electronic is the most common genre, with 22.55% of the single-label albums.

In [6]:
# Reload the saved mapping; presumably this lets the statistics below run without
# repeating the preprocessing (mumu_input_path must still be defined).
df_final = pd.read_csv(f'{mumu_input_path}/mumu_mapping_msdi.csv', header=0)

In [7]:
# Count the albums per single-label genre (albums without one are not counted).
single_label_genre = df_final['genres_si'].value_counts()
single_label_genre_df = single_label_genre.reset_index()

# Share of each genre among the albums that have a single-label genre.
total_single_label = single_label_genre_df['count'].sum()
single_label_genre_df['Percentage'] = (
    single_label_genre_df['count'] * 100 / total_single_label
).round(2)
single_label_genre_df

Unnamed: 0,genres_si,count,Percentage
0,Dance & Electronic,506,22.55
1,Rock,379,16.89
2,Jazz,257,11.45
3,Alternative Rock,248,11.05
4,Country,148,6.6
5,Pop,132,5.88
6,R&B,120,5.35
7,Metal,115,5.12
8,Rap & Hip-Hop,100,4.46
9,Latin Music,92,4.1


Now we analyse the multi-label classification: we count how many distinct genres are defined in the dataset.

In [8]:
# Total number of albums per genre, read from the 'genre_<name>' indicator columns.
# Splitting only on the first underscore keeps genre names that themselves contain
# an underscore intact (a plain split('_')[1] would truncate them).
genre_counts = {
    col.split('_', 1)[1]: df_final[col].sum()
    for col in df_final.columns
    if col.startswith('genre_')
}

print(f"Number of genres individually {len(genre_counts)}")

Number of genres individually 446


Then we calculate how many images correspond to each genre. The percentages here do not sum to 100% because they are computed over the total number of images, and an image can have multiple genres. Most of the images are classified as Pop (83.1%), followed by Rock (52.282%).

In [9]:
# One row per genre with its album count, most frequent genre first.
genre_counts_df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Count'])
genre_counts_df_sorted = (
    genre_counts_df
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)

# Percentage of all albums that carry the genre (an album can carry several).
genre_counts_df_sorted['Percentage'] = (
    genre_counts_df_sorted['Count'].mul(100).div(len(df_final)).round(3)
)

genre_counts_df_sorted

Unnamed: 0,Genre,Count,Percentage
0,Pop,25948,83.100
1,Rock,16325,52.282
2,Alternative Rock,8157,26.123
3,World Music,6333,20.282
4,Dance & Electronic,4706,15.071
...,...,...,...
441,Passions,1,0.003
442,Polynesian Music,1,0.003
443,Tahiti,1,0.003
444,Tangos,1,0.003


Now we check how many labels each image has and look at the distribution. Most albums are classified with 4, 5 or 6 genres; this can help to discard albums with more than a certain number of genres.

In [10]:
# How many albums have 1, 2, 3, ... genres ('genres_list' holds the number of genres per album).
number_of_multi_defined_genres = df_final['genres_list'].value_counts()
number_of_multi_defined_genres_df = number_of_multi_defined_genres.reset_index()

# Share of all albums for each number of genres.
number_of_multi_defined_genres_df['Percentage'] = (
    number_of_multi_defined_genres_df['count'].mul(100).div(len(df_final)).round(3)
)
number_of_multi_defined_genres_df

Unnamed: 0,genres_list,count,Percentage
0,4,4267,13.665
1,5,4232,13.553
2,6,4133,13.236
3,7,3577,11.456
4,3,3064,9.813
5,8,2756,8.826
6,2,2078,6.655
7,9,1971,6.312
8,10,1497,4.794
9,1,1086,3.478


We have 80 different release years, and for 21.28% of the data the release year is not available (it is recorded as 0).

In [13]:
# Number of albums per release year, most frequent year first.
year_counts = df_final['year'].value_counts()
year_counts_df = year_counts.reset_index()

# Share of all albums for each year.
year_counts_df['Percentage'] = (
    year_counts_df['count'].mul(100).div(len(df_final)).round(3)
)
year_counts_df

Unnamed: 0,year,count,Percentage
0,0,6647,21.287
1,2005,1281,4.102
2,2006,1258,4.029
3,2007,1142,3.657
4,2003,1125,3.603
...,...,...,...
76,1924,1,0.003
77,1931,1,0.003
78,1941,1,0.003
79,1933,1,0.003


The distribution of artists is fairly even: there are 13981 different artists, and the most frequent one is Willie Nelson.

In [23]:
# Number of albums per artist id and its share of all albums.
artirst_counts = df_final['artist_id'].value_counts()
artirst_counts_df = artirst_counts.reset_index()
artirst_counts_df['Percentage'] = (
    artirst_counts_df['count'].mul(100).div(len(df_final)).round(3)
)

# Bring in the artist name: join back to df_final, keep the first row of each
# artist, and retain only the columns of interest.
artirst_counts_df = (
    artirst_counts_df
    .merge(df_final, how='left', left_on='artist_id', right_on='artist_id')
    .drop_duplicates(subset='artist_id')
    .loc[:, ['artist_id', 'artist_name', 'count', 'Percentage']]
)
artirst_counts_df

Unnamed: 0,artist_id,artist_name,count,Percentage
0,ARMM6WZ1187FB4958D,Willie Nelson,31,0.099
31,AR6Q4T91187B995616,Sonny Rollins,24,0.077
55,ARPGS671187B996060,McCoy Tyner,23,0.074
78,ARYJSY01187B9B7C65,Dolly Parton,22,0.070
100,ARH861H1187B9B799E,Johnny Cash,22,0.070
...,...,...,...,...
31220,ARPBSZ01187FB502DA,Space Raiders,1,0.003
31221,ARCSFPU1187FB3FCD1,Eighth Wonder,1,0.003
31222,ARWPVI31187FB3AF83,The Wonder Stuff,1,0.003
31223,ARCV2B11187B990E2B,Owls,1,0.003


In [45]:
# Number of albums per artist location and its share of all albums.
# Uses its own names so the year table built earlier (year_counts / year_counts_df)
# is no longer overwritten by this cell.
location_counts = df_final['artist_location'].value_counts()
location_counts_df = location_counts.reset_index()
location_counts_df['Percentage'] = round((location_counts_df['count'] * 100) / len(df_final), 3)
location_counts_df

Unnamed: 0,artist_location,count,Percentage
0,,11215,35.917
1,"New York, NY",651,2.085
2,"London, England",625,2.002
3,"Los Angeles, CA",565,1.809
4,"Chicago, IL",372,1.191
...,...,...,...
2793,Reggio Emilia,1,0.003
2794,"Salisbury, Wiltshire, England",1,0.003
2795,"Bab el-Oued, Algeria",1,0.003
2796,Clemmons North Carolina USA,1,0.003


### Images stats

In [43]:
# Collect the pixel size of every PNG in the image folder, plus the DPI of the
# images whose metadata provides one.
folder_path = f'{mumu_input_path}/images_mumu'
dimensions = []
resolutions = []

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    # Anything that is not a .png file is ignored.
    if not file_path.lower().endswith('.png'):
        continue

    try:
        with Image.open(file_path) as img:
            # (width, height) in pixels, appended to the list of sizes.
            dimensions.append(tuple(img.size))

            # 'dpi' is only present for some images; skip missing/empty values.
            dpi = img.info.get('dpi')
            if dpi:
                resolutions.append(dpi)
    except Exception as e:
        # A file that cannot be read is reported and skipped.
        print(f"Error processing {filename}: {e}")



In [250]:
# Count how often each DPI value and each pixel size occurs.
tuple_counts = Counter(resolutions)
tuple_dimensions = Counter(dimensions)

# Message corrected to match the recorded output ("resolution defined").
print(f"Number of images with resolution defined {len(resolutions)}")


Number of images with resolution defined 7300


Most of the images are 300x300 pixels, but many other sizes are also present.

In [247]:
# One row per distinct image size: width (x), height (y) and number of images.
dimensions_data = [
    (size[0], size[1], occurrences)
    for size, occurrences in tuple_dimensions.items()
]
dimensions_df = (
    pd.DataFrame(dimensions_data, columns=['x', 'y', 'count'])
    .sort_values('count', ascending=False)
)

# Share of all scanned images that have this size.
dimensions_df['Percentage'] = dimensions_df['count'].mul(100).div(len(dimensions)).round(3)

In [248]:
# Inspect the image sizes, most frequent first.
dimensions_df

Unnamed: 0,x,y,count,Percentage
6,300,300,13627,43.641
7,300,299,3089,9.893
0,130,130,1755,5.620
10,300,298,1462,4.682
18,299,300,809,2.591
...,...,...,...,...
312,300,198,1,0.003
311,173,300,1,0.003
310,150,131,1,0.003
309,300,245,1,0.003


The most common resolution is 72x72

In [249]:
# One row per distinct DPI value: horizontal (x), vertical (y) and number of images.
resolutions_data = [
    (dpi_value[0], dpi_value[1], occurrences)
    for dpi_value, occurrences in tuple_counts.items()
]
resolution_df = (
    pd.DataFrame(resolutions_data, columns=['x', 'y', 'count'])
    .sort_values('count', ascending=False)
)

# Share among the images that have a DPI value at all.
resolution_df['Percentage'] = resolution_df['count'].mul(100).div(len(resolutions)).round(3)
resolution_df

Unnamed: 0,x,y,count,Percentage
0,72,72,7252,99.342
2,300,300,16,0.219
1,96,96,7,0.096
4,157,157,6,0.082
5,120,120,3,0.041
7,150,150,3,0.041
8,200,200,3,0.041
10,95,95,3,0.041
3,75,75,2,0.027
6,100,100,2,0.027
