## Imports

In [None]:
import csv
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from PIL import Image

### Functions

In [None]:
def download_image(image_url, local_file_name, timeout=30):
    """Download ``image_url`` and write the response body to ``local_file_name``.

    Args:
        image_url: URL of the image to fetch.
        local_file_name: Destination path; it is opened in binary write mode
            and overwritten if it already exists.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block a batch forever.

    Returns:
        None. Nothing is written when the request fails or the status code is
        not 200; the failure is reported on stdout instead.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported rather than raised so
        # that a batch download can move on to the next URL.
        print(f"Error: Failed to download image {image_url}: {exc}")
        return

    if response.status_code == 200:
        with open(local_file_name, 'wb') as file:
            file.write(response.content)
    else:
        print(f"Error: Failed to download image. Status code: {response.status_code}")

def save_msdi_images_from_dataframe(path, df):
    """Download the image of every row in ``df`` into the folder ``path``.

    Each row must provide the columns 'msd_track_id', 'album_index' and
    'image_url'. The file is stored as "<album_index>_<msd_track_id>.png".
    """
    for _, record in df.iterrows():
        track_id = record['msd_track_id']
        album_idx = record['album_index']
        download_image(
            record['image_url'],
            f"{path}/{album_idx}_{track_id}.png",
        )

def save_mumu_images_from_dataframe(path, df, start_index=None):
    """Download the image of every row in ``df`` into the folder ``path``.

    Each row must provide the columns 'amazon_id' and 'imUrl'. The file is
    stored as "<row label>_<amazon_id>.png", where the row label is the
    DataFrame index value of that row.

    Files that already exist are skipped, so an interrupted run can simply be
    started again. This replaces a hard-coded ``index > 27115`` resume point
    that made a fresh run ignore every row up to that label.

    Args:
        path: Destination folder.
        df: DataFrame with the 'amazon_id' and 'imUrl' columns.
        start_index: Optional row label; rows whose label is smaller are
            skipped. ``start_index=27116`` reproduces the former behaviour.
    """
    for index, row in df.iterrows():
        if start_index is not None and index < start_index:
            continue
        local_file_name = f"{path}/{index}_{row['amazon_id']}.png"
        if os.path.exists(local_file_name):
            # Already downloaded by a previous run.
            continue
        download_image(row['imUrl'], local_file_name)


# MSDI dataset

In [35]:
# Load the MSD-I table (tab-separated, first line holds the column names)
input_msdi_path = "input/MSDI_dataset"
msdi_tsv_path = f"{input_msdi_path}/MSD-I_dataset.tsv"
df = pd.read_csv(msdi_tsv_path, sep='\t', header=0)
print(df.count())

# Keep a single row per album_index (the first one encountered)
df_unique = df.drop_duplicates(subset='album_index')
print("-------- after duplicates deletion --------")
print(df_unique.count())


msd_track_id     30713
genre            30713
album_index      30713
set              30713
msd_artist_id    30713
image_url        30713
dtype: int64
-------- after duplicates deletion --------
msd_track_id     16753
genre            16753
album_index      16753
set              16753
msd_artist_id    16753
image_url        16753
dtype: int64


In [216]:
# Display the de-duplicated MSD-I table (one row per album_index)
df_unique

Unnamed: 0,msd_track_id,genre,album_index,set,msd_artist_id,image_url
0,TRABKJU128F422A7FE,Metal,0,train,ARBD4QW1187FB42153,http://artwork-cdn.7static.com/static/img/slee...
1,TRBLDQQ128F92E58B4,Rock,1,train,AR3RK011187FB3CE3B,http://artwork-cdn.7static.com/static/img/slee...
2,TRDMMDE128F14A9052,Rock,2,train,ARJVTRE1187B9959C0,http://artwork-cdn.7static.com/static/img/slee...
3,TRJOPZB128F4250E02,Rock,4,train,AR62BB21187B9AC83D,http://artwork-cdn.7static.com/static/img/slee...
4,TRJKBVL128F935567B,Rock,5,train,AR7GVOV1187B9B5FF1,http://artwork-cdn.7static.com/static/img/slee...
...,...,...,...,...,...,...
30701,TRKHBDC12903CD8949,Electronic,22332,test,ARGSRW21187B992282,http://artwork-cdn.7static.com/static/img/slee...
30702,TRDGJQT128F42632E3,Electronic,22337,test,ARJH2B61187B9B9465,http://artwork-cdn.7static.com/static/img/slee...
30703,TRRESAT128F422A1E6,Electronic,22347,test,ARON0ER1187FB52E2A,http://artwork-cdn.7static.com/static/img/slee...
30707,TRMFLQB12903CE56C2,Blues,22358,test,AR56P361187B9AC4DB,http://artwork-cdn.7static.com/static/img/slee...


In [None]:
# Download images
# Fetches the image_url of every row in df_unique into the images_msdi folder
save_msdi_images_from_dataframe(f"{input_msdi_path}/images_msdi", df_unique)

In [None]:
# Add the image file name to a new column and save the mapping.
# The name mirrors the one built in save_msdi_images_from_dataframe:
# "<album_index>_<msd_track_id>.png".
name = (
    df_unique['album_index'].astype(str)
    + "_"
    + df_unique['msd_track_id'].astype(str)
    + ".png"
)
# assign() returns a new frame, so the result of drop_duplicates() is never
# written to in place (avoids a possible SettingWithCopyWarning) and the cell
# gives the same result when it is run again.
df_unique = df_unique.assign(name_image=name)
df_unique.to_csv(f"{input_msdi_path}/msdi_mapping.csv", index=False)

## Stats

### Genre distribution
Rock is the most common genre in this dataset with 23.22% of the images, followed by Electronic (15.19%) and Pop (10.98%).

In [5]:
# Number of albums per genre, most frequent first
value_counts = df_unique['genre'].value_counts()
# Name both columns explicitly: the defaults produced by reset_index() on a
# value_counts() result differ between pandas versions.
value_counts_df = value_counts.rename_axis('genre').reset_index(name='count')
# Share of all albums; the denominator is the actual number of rows instead of
# a hard-coded 16753, so it stays correct if the input file changes.
value_counts_df['Percentage'] = (value_counts_df['count'] * 100 / len(df_unique)).round(3)
value_counts_df

Unnamed: 0,genre,count,Percentage
0,Rock,3890,23.22
1,Electronic,2544,15.185
2,Pop,1840,10.983
3,Jazz,1341,8.005
4,Rap,1232,7.354
5,Metal,1140,6.805
6,RnB,970,5.79
7,Reggae,829,4.948
8,Country,802,4.787
9,Folk,541,3.229


### Images stats
All the images are 200x200 pixels.

In [37]:
folder_path = f"{input_msdi_path}/images_msdi"
unique_dimensions = set()
resolutions = []

# Walk over every PNG in the folder, collecting the distinct (width, height)
# pairs and the 'dpi' metadata entry of each image that has one.
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not file_path.lower().endswith('.png'):
        continue

    try:
        with Image.open(file_path) as img:
            img_width, img_height = img.size
            unique_dimensions.add((img_width, img_height))
            dpi = img.info.get('dpi')
            if dpi:
                resolutions.append(dpi)
    except Exception as e:
        # An unreadable file is reported and skipped so the scan can go on
        print(f"Error processing {filename}: {e}")



In [253]:
# Number of images per distinct DPI value
tuple_counts = Counter(resolutions)
# `resolutions` only holds the images whose metadata carries a 'dpi' entry.
# The message previously read "with available defined".
print(f"Number of images with resolution defined {len(resolutions)}")
print(f"Dimension of images {unique_dimensions}")


Number of images with resolution defined 14287
Dimension of images {(200, 200)}


In [254]:
# One (x, y, count) row per distinct DPI value, most frequent first
resolutions_data = [
    (dpi_pair[0], dpi_pair[1], occurrences)
    for dpi_pair, occurrences in tuple_counts.items()
]
resolution_df = (
    pd.DataFrame(resolutions_data, columns=['x', 'y', 'count'])
    .sort_values('count', ascending=False)
)
# Share over the images that have a DPI entry
resolution_df['Percentage'] = (resolution_df['count'] * 100 / len(resolutions)).round(3)
resolution_df

Unnamed: 0,x,y,count,Percentage
0,300,300,8030,56.205
1,72,72,4174,29.215
2,96,96,1868,13.075
5,200,200,64,0.448
7,100,100,54,0.378
9,150,150,18,0.126
17,600,600,11,0.077
4,762,762,10,0.07
15,400,400,8,0.056
18,6,6,8,0.056


# MuMu dataset

In [40]:
# Read json with information
# The metadata file is loaded into a DataFrame; count() then prints the number
# of non-null values in each column.
mumu_input_path = "input/MuMu_dataset"
json_file_path = f'{mumu_input_path}/amazon_metadata_MuMu.json'
df = pd.read_json(json_file_path)
print(df.count())



title         25729
price         27057
imUrl         31471
amazon_id     31471
related       31471
categories    31471
salesRank     31359
brand          4874
dtype: int64


In [41]:
# Remove duplicates
# Rows that share the same imUrl are reduced to their first occurrence, so
# every image URL is kept once.
print("-------- after duplicates deletion --------")
df_unique = df.drop_duplicates(subset='imUrl')
print(df_unique.count())

-------- after duplicates deletion --------
title         25507
price         26936
imUrl         31225
amazon_id     31225
related       31225
categories    31225
salesRank     31129
brand          4872
dtype: int64


In [None]:
# Save images from the dataset
# Fetches the imUrl of the rows in df_unique into the images_mumu folder
save_mumu_images_from_dataframe(f'{mumu_input_path}/images_mumu',df_unique)

For this dataset there are two kinds of genre mapping: one for albums with a single genre and another for albums with multiple genres. To unify them, both were joined to the album metadata on amazon_id.

In [42]:
# Drop the unimportant columns from the initial dataset
df_unique = df_unique[['amazon_id', 'imUrl']]

# Read the single label classification dataset, one row per amazon_id
df_single = pd.read_csv(f'{mumu_input_path}/MuMu_dataset_single-label.csv', header=0).drop_duplicates(subset='amazon_id')

# Keep only the join key and the genre, renamed with the "_si" suffix so they
# stay distinguishable after the join. rename() builds a new frame instead of
# adding copied columns to the drop_duplicates() result.
df_single_label = df_single[['amazon_id', 'genres']].rename(
    columns={'amazon_id': 'amazon_id_si', 'genres': 'genres_si'}
)

# Read the multi label classification dataset
# NOTE(review): duplicates are removed on album_mbid although the later join
# uses amazon_id — confirm that amazon_id is unique in this table as well,
# otherwise the join would repeat albums.
df_multi = pd.read_csv(f'{mumu_input_path}/MuMu_dataset_multi-label.csv', header=0).drop_duplicates(subset='album_mbid')

# Same selection and renaming, with the "_mu" suffix
df_multi_label = df_multi[['amazon_id', 'genres']].rename(
    columns={'amazon_id': 'amazon_id_mu', 'genres': 'genres_mu'}
)

In [21]:
# Join datasets
df_joined = pd.merge(left=df_unique, right=df_single_label, left_on='amazon_id', right_on='amazon_id_si', how='left')
df_joined = pd.merge(left=df_joined, right=df_multi_label, left_on='amazon_id', right_on='amazon_id_mu', how='left')

In [22]:
# Display the joined table: image URL plus single-label and multi-label genres
df_joined

Unnamed: 0,amazon_id,imUrl,amazon_id_si,genres_si,amazon_id_mu,genres_mu
0,1458389375,http://ecx.images-amazon.com/images/I/51fdvJLW...,,,1458389375,"Jazz,Pop"
1,1591791065,http://ecx.images-amazon.com/images/I/51GGK0zo...,,,1591791065,"New Age,Dance Pop,World Music,Pop,Classical"
2,1906063443,http://ecx.images-amazon.com/images/I/51Li1pqK...,,,1906063443,"Europe,Christian,Eastern Europe,Pop,Gypsy,Worl..."
3,1929243766,http://ecx.images-amazon.com/images/I/51eW8XLF...,,,1929243766,"Comedy & Spoken Word,Pop"
4,1930864159,http://ecx.images-amazon.com/images/I/41F5hqeP...,,,1930864159,"Pop & Contemporary,Christian,Gospel,Pop"
...,...,...,...,...,...,...
31220,B00JVSIMLQ,http://ecx.images-amazon.com/images/I/21o%2B73...,B00JVSIMLQ,Latin Music,B00JVSIMLQ,"Latin Music,Latin Pop"
31221,B00KDL9TTE,http://ecx.images-amazon.com/images/I/316q03mN...,,,B00KDL9TTE,"Pop,Easy Listening"
31222,B00KE5M5IQ,http://ecx.images-amazon.com/images/I/51UgnZe%...,B00KE5M5IQ,Latin Music,B00KE5M5IQ,"Latin Music,Latin Pop"
31223,B00KG10ZXU,http://ecx.images-amazon.com/images/I/51obnT4n...,,,B00KG10ZXU,"Broadway & Vocalists,Vocal Pop,Pop"


To get the individual genres of each album, the genres column is split on "," into a list, and the list is then pivoted into one indicator column per genre.

In [23]:
# Split the comma-separated multi-label string into a list of genres.
# NOTE(review): a genre name that itself contains a comma is split as well;
# the resulting columns include e.g. "genre_ Elegies & Tombeau" with a leading
# space — confirm against the raw labels.
df_joined['genres_list'] = df_joined['genres_mu'].str.split(',')
# One row per (album, genre) pair
df_exploded = df_joined.explode('genres_list')

# One indicator column per genre, added up per album
df_genres = pd.get_dummies(df_exploded['genres_list'], prefix='genre')
df_genres_grouped = df_genres.groupby(df_exploded['amazon_id']).sum().reset_index()
df_final = pd.merge(df_joined, df_genres_grouped, on='amazon_id', how='left')

# Replace the list of genres by the number of genres. Albums without a
# multi-label entry hold NaN here because of the left join; they count as 0
# instead of raising "object of type 'float' has no len()".
df_final['genres_list'] = df_final['genres_list'].apply(
    lambda genres: len(genres) if isinstance(genres, list) else 0
)


In [24]:
# Display the joined table with the genre count and one indicator column per genre
df_final

Unnamed: 0,amazon_id,imUrl,amazon_id_si,genres_si,amazon_id_mu,genres_mu,genres_list,genre_ Elegies & Tombeau,genre_ Incidental & Program Music,genre_Accompaniment,...,genre_Voices,genre_Waltzes,genre_Wedding Music,genre_West Coast,genre_West Coast Blues,genre_Western Swing,genre_World Dance,genre_World Music,genre_Zimbabwe,genre_Zouk
0,1458389375,http://ecx.images-amazon.com/images/I/51fdvJLW...,,,1458389375,"Jazz,Pop",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1591791065,http://ecx.images-amazon.com/images/I/51GGK0zo...,,,1591791065,"New Age,Dance Pop,World Music,Pop,Classical",5,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1906063443,http://ecx.images-amazon.com/images/I/51Li1pqK...,,,1906063443,"Europe,Christian,Eastern Europe,Pop,Gypsy,Worl...",7,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1929243766,http://ecx.images-amazon.com/images/I/51eW8XLF...,,,1929243766,"Comedy & Spoken Word,Pop",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1930864159,http://ecx.images-amazon.com/images/I/41F5hqeP...,,,1930864159,"Pop & Contemporary,Christian,Gospel,Pop",4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31220,B00JVSIMLQ,http://ecx.images-amazon.com/images/I/21o%2B73...,B00JVSIMLQ,Latin Music,B00JVSIMLQ,"Latin Music,Latin Pop",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31221,B00KDL9TTE,http://ecx.images-amazon.com/images/I/316q03mN...,,,B00KDL9TTE,"Pop,Easy Listening",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31222,B00KE5M5IQ,http://ecx.images-amazon.com/images/I/51UgnZe%...,B00KE5M5IQ,Latin Music,B00KE5M5IQ,"Latin Music,Latin Pop",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31223,B00KG10ZXU,http://ecx.images-amazon.com/images/I/51obnT4n...,,,B00KG10ZXU,"Broadway & Vocalists,Vocal Pop,Pop",3,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Stats

First we look at the distribution of the single labels: Dance & Electronic is the most common genre, with 22.55% of the single-labelled albums.

In [28]:
# Albums per single-label genre; albums without a single label are not counted
single_label_genre = df_final['genres_si'].value_counts()
single_label_genre_df = single_label_genre.reset_index()
# Share over the albums that do have a single label
labelled_total = single_label_genre_df['count'].sum()
single_label_genre_df['Percentage'] = (single_label_genre_df['count'] * 100 / labelled_total).round(2)
single_label_genre_df

Unnamed: 0,genres_si,count,Percentage
0,Dance & Electronic,506,22.55
1,Rock,379,16.89
2,Jazz,257,11.45
3,Alternative Rock,248,11.05
4,Country,148,6.6
5,Pop,132,5.88
6,R&B,120,5.35
7,Metal,115,5.12
8,Rap & Hip-Hop,100,4.46
9,Latin Music,92,4.1


Now we analyse the multi-label classification: we count how many distinct genres are defined in the dataset.

In [32]:
# Number of albums per genre, read from the "genre_<name>" indicator columns.
# Only the prefix is removed: splitting on every "_" would truncate a genre
# name that contains an underscore and could merge two genres into one key.
genre_prefix = 'genre_'
genre_counts = {
    col[len(genre_prefix):]: df_final[col].sum()
    for col in df_final.columns
    if col.startswith(genre_prefix)
}

print(f"Number of genres individually {len(genre_counts)}")

Number of genres individually 446


Then we calculate how many images correspond to each genre. The percentages here do not sum to 100% because they are computed over the total number of images, and an image can have multiple genres. Most of the images are classified as Pop (83.1%), followed by Rock (52.282%).

In [33]:
# Genres ordered from most to least frequent, renumbered from 0
genre_counts_df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Count'])
genre_counts_df_sorted = (
    genre_counts_df
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)
# Share over all albums; an album with several genres counts once per genre
genre_counts_df_sorted['Percentage'] = (genre_counts_df_sorted['Count'] * 100 / len(df_final)).round(3)

genre_counts_df_sorted

Unnamed: 0,Genre,Count,Percentage
0,Pop,25948,83.100
1,Rock,16325,52.282
2,Alternative Rock,8157,26.123
3,World Music,6333,20.282
4,Dance & Electronic,4706,15.071
...,...,...,...
441,Passions,1,0.003
442,Polynesian Music,1,0.003
443,Tahiti,1,0.003
444,Tangos,1,0.003


Now we check how many labels each image has and look at the distribution. The most common counts are 4, 5 and 6 genres per album; this can help to discard albums with more than a certain number of genres.

In [34]:
# How many albums carry 1, 2, 3, ... genres (genres_list holds the count per album)
number_of_multi_defined_genres = df_final['genres_list'].value_counts()
number_of_multi_defined_genres_df = number_of_multi_defined_genres.reset_index()
album_total = len(df_final)
number_of_multi_defined_genres_df['Percentage'] = (
    number_of_multi_defined_genres_df['count'] * 100 / album_total
).round(3)
number_of_multi_defined_genres_df

Unnamed: 0,genres_list,count,Percentage
0,4,4267,13.665
1,5,4232,13.553
2,6,4133,13.236
3,7,3577,11.456
4,3,3064,9.813
5,8,2756,8.826
6,2,2078,6.655
7,9,1971,6.312
8,10,1497,4.794
9,1,1086,3.478


In [206]:
# Save the merged dataframe, and add the naming mapping for the downloaded images
name = []
for index, row in df_final.iterrows():
    track_name = row['amazon_id']
    album_index = index
    name.append(f"{album_index}_{track_name}")

df_final['name_image']= name
df_final.to_csv(f'{mumu_input_path}/mumu_mapping.csv', index=False)

### Images stats

In [43]:
folder_path = f'{mumu_input_path}/images_mumu'
dimensions = []
resolutions = []

# Walk over every PNG in the folder, recording the (width, height) of each
# image and the 'dpi' metadata entry of the images that have one.
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not file_path.lower().endswith('.png'):
        continue

    try:
        with Image.open(file_path) as img:
            img_width, img_height = img.size
            # Kept in a list (one entry per image) so that sizes can be counted
            dimensions.append((img_width, img_height))
            dpi = img.info.get('dpi')
            if dpi:
                resolutions.append(dpi)
    except Exception as e:
        # An unreadable file is reported and skipped so the scan can go on
        print(f"Error processing {filename}: {e}")



In [250]:
# Number of images per distinct DPI value and per distinct (width, height)
tuple_counts = Counter(resolutions)
tuple_dimensions = Counter(dimensions)
# `resolutions` only holds the images whose metadata carries a 'dpi' entry.
# The message previously read "with available defined".
print(f"Number of images with resolution defined {len(resolutions)}")


Number of images with resolution defined 7300


Most of the images are 300x300, but there are many other sizes.

In [247]:
# One (x, y, count) row per distinct image size, most frequent first
dimensions_data = [
    (size[0], size[1], occurrences)
    for size, occurrences in tuple_dimensions.items()
]
dimensions_df = (
    pd.DataFrame(dimensions_data, columns=['x', 'y', 'count'])
    .sort_values('count', ascending=False)
)
# Share over all scanned images
dimensions_df['Percentage'] = (dimensions_df['count'] * 100 / len(dimensions)).round(3)

In [248]:
# Display the image sizes ordered by how many images have them
dimensions_df

Unnamed: 0,x,y,count,Percentage
6,300,300,13627,43.641
7,300,299,3089,9.893
0,130,130,1755,5.620
10,300,298,1462,4.682
18,299,300,809,2.591
...,...,...,...,...
312,300,198,1,0.003
311,173,300,1,0.003
310,150,131,1,0.003
309,300,245,1,0.003


The most common resolution is 72x72 DPI.

In [249]:
# One (x, y, count) row per distinct DPI value, most frequent first
resolutions_data = [
    (dpi_pair[0], dpi_pair[1], occurrences)
    for dpi_pair, occurrences in tuple_counts.items()
]
resolution_df = (
    pd.DataFrame(resolutions_data, columns=['x', 'y', 'count'])
    .sort_values('count', ascending=False)
)
# Share over the images that have a DPI entry
resolution_df['Percentage'] = (resolution_df['count'] * 100 / len(resolutions)).round(3)
resolution_df

Unnamed: 0,x,y,count,Percentage
0,72,72,7252,99.342
2,300,300,16,0.219
1,96,96,7,0.096
4,157,157,6,0.082
5,120,120,3,0.041
7,150,150,3,0.041
8,200,200,3,0.041
10,95,95,3,0.041
3,75,75,2,0.027
6,100,100,2,0.027
