## Imports

In [12]:
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import os
from collections import Counter
import h5py
import requests
import reverse_geocoder as rg
from utils.location_processor import LocationProcessor 

### Functions

In [34]:
def get_dataframes_from_msd(input_path, dataset_name):
    
    with h5py.File(input_path, 'r') as file:
        # Access the 'songs' dataset within the 'musicbrainz' group
        dataset = file[f'{dataset_name}/songs']
        
        # Convert the dataset to a NumPy array
        data_array = dataset[:]
        
        if dataset.dtype.fields is not None:
            column_names = dataset.dtype.names
            df = pd.DataFrame(data_array, columns=column_names)
        else:
            # If there are no named fields, create a default range of column names
            df = pd.DataFrame(data_array)

    return  df

In [6]:
def download_image(image_url, local_file_name):
    response = requests.get(image_url)
    if response.status_code == 200:
        with open(local_file_name, 'wb') as file:
            file.write(response.content)
        #print(f"Image downloaded successfully: {local_file_name}")
    else:
        print(f"Error: Failed to download image. Status code: {response.status_code}")

def save_msdi_images_from_dataframe(path, df):
    for index, row in df.iterrows():
        track_name = row['msd_track_id']
        album_index = row['album_index']
        image_url = row['image_url']
        local_file_name = f"{path}/{album_index}_{track_name}.png"
        download_image(image_url, local_file_name)

def save_mumu_images_from_dataframe(path, df):
    for index, row in df.iterrows():
        track_name = row['amazon_id']
        album_index = index
        image_url = row['imUrl']
        local_file_name = f"{path}/{album_index}_{track_name}.png"
        if index > 27115:
            download_image(image_url, local_file_name)


### Million songs dataset pre processing

Here we separated the different sub_datasets in the h5 file

In [None]:
msd_input_path = "input/msd_summary_file.h5"
musicbrainz_df = get_dataframes_from_msd(msd_input_path, 'musicbrainz')
analysis_df = get_dataframes_from_msd(msd_input_path, 'analysis')
metadata_df = get_dataframes_from_msd(msd_input_path, 'metadata')

Get the relevant columns and decode them

In [77]:
musicbrainz_df = musicbrainz_df[['year']]

analysis_df = analysis_df[['track_id']]
analysis_df['track_id'] = analysis_df['track_id'].str.decode('utf-8')

metadata_df = metadata_df[['artist_id', 'artist_name', 'release', 'song_id', 'title', 'artist_location','artist_latitude','artist_longitude']]
metadata_df['artist_id'] = metadata_df['artist_id'].str.decode('utf-8')
metadata_df['artist_name'] = metadata_df['artist_name'].str.decode('utf-8')
metadata_df['release'] = metadata_df['release'].str.decode('utf-8')
metadata_df['song_id'] = metadata_df['song_id'].str.decode('utf-8')
metadata_df['title'] = metadata_df['title'].str.decode('utf-8')
metadata_df['artist_location'] = metadata_df['artist_location'].str.decode('utf-8')

In [38]:
musicbrainz_df

Unnamed: 0,year
0,2003
1,1995
2,2006
3,2003
4,0
...,...
999995,0
999996,0
999997,0
999998,0


In [39]:
analysis_df

Unnamed: 0,track_id
0,TRMMMYQ128F932D901
1,TRMMMKD128F425225D
2,TRMMMRX128F93187D9
3,TRMMMCH128F425532C
4,TRMMMWA128F426B589
...,...
999995,TRYYYUS12903CD2DF0
999996,TRYYYJO128F426DA37
999997,TRYYYMG128F4260ECA
999998,TRYYYDJ128F9310A21


In [40]:
metadata_df

Unnamed: 0,artist_id,artist_name,release,song_id,title
0,ARYZTJS1187B98C555,Faster Pussy cat,Monster Ballads X-Mas,SOQMMHC12AB0180CB8,Silent Night
1,ARMVN3U1187FB3A1EB,Karkkiautomaatti,Karkuteillä,SOVFVAK12A8C1350D9,Tanssi vaan
2,ARGEKB01187FB50750,Hudson Mohawke,Butter,SOGTUKN12AB017F4F1,No One Could Ever
3,ARNWYLR1187B9B2F9C,Yerba Brava,De Culo,SOBNYVR12A8C13558C,Si Vos Querés
4,AREQDTE1269FB37231,Der Mystic,Rene Ablaze Presents Winter Sessions,SOHSBXH12A8C13B0DF,Tangle Of Aspens
...,...,...,...,...,...
999995,AR7Z4J81187FB3FC59,Kiko Navarro,Pacha V.I.P.,SOTXAME12AB018F136,O Samba Da Vida
999996,ART5FZD1187B9A7FCF,Kuldeep Manak,Naale Baba Lassi Pee Gya,SOXQYIQ12A8C137FBB,Jago Chhadeo
999997,ARZ3R6M1187B9AF750,Gabriel Le Mar,Dub_Connected: electronic music,SOHODZI12A8C137BB3,Novemba
999998,ARCMCOK1187B9B1073,Elude,The Trance Collection Vol. 2,SOLXGOR12A81C21EB7,Faraday


Now we concatenat them 

In [78]:
concatenated_df = pd.concat([analysis_df, musicbrainz_df, metadata_df], axis=1)
concatenated_df

Unnamed: 0,track_id,year,artist_id,artist_name,release,song_id,title,artist_location,artist_latitude,artist_longitude
0,TRMMMYQ128F932D901,2003,ARYZTJS1187B98C555,Faster Pussy cat,Monster Ballads X-Mas,SOQMMHC12AB0180CB8,Silent Night,,,
1,TRMMMKD128F425225D,1995,ARMVN3U1187FB3A1EB,Karkkiautomaatti,Karkuteillä,SOVFVAK12A8C1350D9,Tanssi vaan,,,
2,TRMMMRX128F93187D9,2006,ARGEKB01187FB50750,Hudson Mohawke,Butter,SOGTUKN12AB017F4F1,No One Could Ever,"Glasgow, Scotland",55.8578,-4.24251
3,TRMMMCH128F425532C,2003,ARNWYLR1187B9B2F9C,Yerba Brava,De Culo,SOBNYVR12A8C13558C,Si Vos Querés,,,
4,TRMMMWA128F426B589,0,AREQDTE1269FB37231,Der Mystic,Rene Ablaze Presents Winter Sessions,SOHSBXH12A8C13B0DF,Tangle Of Aspens,,,
...,...,...,...,...,...,...,...,...,...,...
999995,TRYYYUS12903CD2DF0,0,AR7Z4J81187FB3FC59,Kiko Navarro,Pacha V.I.P.,SOTXAME12AB018F136,O Samba Da Vida,,,
999996,TRYYYJO128F426DA37,0,ART5FZD1187B9A7FCF,Kuldeep Manak,Naale Baba Lassi Pee Gya,SOXQYIQ12A8C137FBB,Jago Chhadeo,,,
999997,TRYYYMG128F4260ECA,0,ARZ3R6M1187B9AF750,Gabriel Le Mar,Dub_Connected: electronic music,SOHODZI12A8C137BB3,Novemba,GERMANY,,
999998,TRYYYDJ128F9310A21,0,ARCMCOK1187B9B1073,Elude,The Trance Collection Vol. 2,SOLXGOR12A81C21EB7,Faraday,,,


# MSDI dataset

In [7]:
# Read the msdi dataset
input_msdi_path = "input/MSDI_dataset"
df = pd.read_csv(f"{input_msdi_path}/MSD-I_dataset.tsv", sep='\t', header=0)
print(df.count())

# Remove duplicates

df_unique = df.drop_duplicates(subset='album_index')
print("-------- after duplicates deletion --------")
print(df_unique.count())


msd_track_id     30713
genre            30713
album_index      30713
set              30713
msd_artist_id    30713
image_url        30713
dtype: int64
-------- after duplicates deletion --------
msd_track_id     16753
genre            16753
album_index      16753
set              16753
msd_artist_id    16753
image_url        16753
dtype: int64


In [80]:
df_unique

Unnamed: 0,msd_track_id,genre,album_index,set,msd_artist_id,image_url
0,TRABKJU128F422A7FE,Metal,0,train,ARBD4QW1187FB42153,http://artwork-cdn.7static.com/static/img/slee...
1,TRBLDQQ128F92E58B4,Rock,1,train,AR3RK011187FB3CE3B,http://artwork-cdn.7static.com/static/img/slee...
2,TRDMMDE128F14A9052,Rock,2,train,ARJVTRE1187B9959C0,http://artwork-cdn.7static.com/static/img/slee...
3,TRJOPZB128F4250E02,Rock,4,train,AR62BB21187B9AC83D,http://artwork-cdn.7static.com/static/img/slee...
4,TRJKBVL128F935567B,Rock,5,train,AR7GVOV1187B9B5FF1,http://artwork-cdn.7static.com/static/img/slee...
...,...,...,...,...,...,...
30701,TRKHBDC12903CD8949,Electronic,22332,test,ARGSRW21187B992282,http://artwork-cdn.7static.com/static/img/slee...
30702,TRDGJQT128F42632E3,Electronic,22337,test,ARJH2B61187B9B9465,http://artwork-cdn.7static.com/static/img/slee...
30703,TRRESAT128F422A1E6,Electronic,22347,test,ARON0ER1187FB52E2A,http://artwork-cdn.7static.com/static/img/slee...
30707,TRMFLQB12903CE56C2,Blues,22358,test,AR56P361187B9AC4DB,http://artwork-cdn.7static.com/static/img/slee...


In [98]:
# Download images
#save_msdi_images_from_dataframe(f"{input_msdi_path}/images_msdi", df_unique)

In [99]:
df_unique

Unnamed: 0,msd_track_id,genre,album_index,set,msd_artist_id,image_url,name_image,track_id,year,artist_id,artist_name,release,song_id,title,artist_location,artist_latitude,artist_longitude,country
0,TRABKJU128F422A7FE,Metal,0,train,ARBD4QW1187FB42153,http://artwork-cdn.7static.com/static/img/slee...,0_TRABKJU128F422A7FE.png,TRABKJU128F422A7FE,2006,ARBD4QW1187FB42153,Scar Symmetry,Nuclear Blast Showdown 2006,SODUYOC12A67ADC8ED,The Illusionist,Sweden,62.19845,17.55142,SE
1,TRBLDQQ128F92E58B4,Rock,1,train,AR3RK011187FB3CE3B,http://artwork-cdn.7static.com/static/img/slee...,1_TRBLDQQ128F92E58B4.png,TRBLDQQ128F92E58B4,2007,AR3RK011187FB3CE3B,Eddie Vedder,Music For The Motion Picture Into The Wild,SODUDJI12A8C141513,Tuolumne,"Etats-Unis, Illinois, Evanston",42.05665,-87.68659,US
2,TRDMMDE128F14A9052,Rock,2,train,ARJVTRE1187B9959C0,http://artwork-cdn.7static.com/static/img/slee...,2_TRDMMDE128F14A9052.png,TRDMMDE128F14A9052,1981,ARJVTRE1187B9959C0,Foghat,Girls To Chat And Boys To Bounce,SODUUHR12A6D4FB01C,Weekend Driver,"London, England",40.71455,-74.00712,US
3,TRJOPZB128F4250E02,Rock,4,train,AR62BB21187B9AC83D,http://artwork-cdn.7static.com/static/img/slee...,4_TRJOPZB128F4250E02.png,TRJOPZB128F4250E02,2002,AR62BB21187B9AC83D,The Breeders,Title Tk,SODUEZB12A8C1348D5,Too Alive,Boston,42.31256,-71.08868,US
4,TRJKBVL128F935567B,Rock,5,train,AR7GVOV1187B9B5FF1,http://artwork-cdn.7static.com/static/img/slee...,5_TRJKBVL128F935567B.png,TRJKBVL128F935567B,1990,AR7GVOV1187B9B5FF1,Sebadoh,III,SODURCL12A81358260,Ride The Darker Wave,"Amherst, MA",42.37522,-72.51984,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16748,TRKHBDC12903CD8949,Electronic,22332,test,ARGSRW21187B992282,http://artwork-cdn.7static.com/static/img/slee...,22332_TRKHBDC12903CD8949.png,TRKHBDC12903CD8949,0,ARGSRW21187B992282,Lazybatusu,Lazybatusu,SOFLHUI12A58A7DDD4,Keepin On,New York,,,
16749,TRDGJQT128F42632E3,Electronic,22337,test,ARJH2B61187B9B9465,http://artwork-cdn.7static.com/static/img/slee...,22337_TRDGJQT128F42632E3.png,TRDGJQT128F42632E3,2004,ARJH2B61187B9B9465,Funk D'Void,volume freak,SOFLOYC12A8C13730A,Endless,"Glasgow, Scotland",55.85780,-4.24251,GB
16750,TRRESAT128F422A1E6,Electronic,22347,test,ARON0ER1187FB52E2A,http://artwork-cdn.7static.com/static/img/slee...,22347_TRRESAT128F422A1E6.png,TRRESAT128F422A1E6,0,ARON0ER1187FB52E2A,Arrakis,The Spice,SOFLBCB12A6D4FC865,The Spice,FR,46.71067,1.71819,FR
16751,TRMFLQB12903CE56C2,Blues,22358,test,AR56P361187B9AC4DB,http://artwork-cdn.7static.com/static/img/slee...,22358_TRMFLQB12903CE56C2.png,TRMFLQB12903CE56C2,2001,AR56P361187B9AC4DB,Gary Moore,Back To The Blues,SOFLPHC12A58A76A02,Picture Of The Moon,"Belfast, Northern Ireland",54.59580,-5.93494,GB


In [None]:
# Add image name to a new column in dataframe
name = []
for index, row in df_unique.iterrows():
    track_name = row['msd_track_id']
    album_index = row['album_index']
    name.append( f"{album_index}_{track_name}.png")
df_unique['name_image'] = name


### Joining with MSD

In [84]:
df_joined = pd.merge(left=df_unique, right=concatenated_df, left_on='msd_track_id', right_on='track_id', how='left')


In [85]:
df_joined

Unnamed: 0,msd_track_id,genre,album_index,set,msd_artist_id,image_url,name_image,track_id,year,artist_id,artist_name,release,song_id,title,artist_location,artist_latitude,artist_longitude
0,TRABKJU128F422A7FE,Metal,0,train,ARBD4QW1187FB42153,http://artwork-cdn.7static.com/static/img/slee...,0_TRABKJU128F422A7FE.png,TRABKJU128F422A7FE,2006,ARBD4QW1187FB42153,Scar Symmetry,Nuclear Blast Showdown 2006,SODUYOC12A67ADC8ED,The Illusionist,Sweden,62.19845,17.55142
1,TRBLDQQ128F92E58B4,Rock,1,train,AR3RK011187FB3CE3B,http://artwork-cdn.7static.com/static/img/slee...,1_TRBLDQQ128F92E58B4.png,TRBLDQQ128F92E58B4,2007,AR3RK011187FB3CE3B,Eddie Vedder,Music For The Motion Picture Into The Wild,SODUDJI12A8C141513,Tuolumne,"Etats-Unis, Illinois, Evanston",42.05665,-87.68659
2,TRDMMDE128F14A9052,Rock,2,train,ARJVTRE1187B9959C0,http://artwork-cdn.7static.com/static/img/slee...,2_TRDMMDE128F14A9052.png,TRDMMDE128F14A9052,1981,ARJVTRE1187B9959C0,Foghat,Girls To Chat And Boys To Bounce,SODUUHR12A6D4FB01C,Weekend Driver,"London, England",40.71455,-74.00712
3,TRJOPZB128F4250E02,Rock,4,train,AR62BB21187B9AC83D,http://artwork-cdn.7static.com/static/img/slee...,4_TRJOPZB128F4250E02.png,TRJOPZB128F4250E02,2002,AR62BB21187B9AC83D,The Breeders,Title Tk,SODUEZB12A8C1348D5,Too Alive,Boston,42.31256,-71.08868
4,TRJKBVL128F935567B,Rock,5,train,AR7GVOV1187B9B5FF1,http://artwork-cdn.7static.com/static/img/slee...,5_TRJKBVL128F935567B.png,TRJKBVL128F935567B,1990,AR7GVOV1187B9B5FF1,Sebadoh,III,SODURCL12A81358260,Ride The Darker Wave,"Amherst, MA",42.37522,-72.51984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16748,TRKHBDC12903CD8949,Electronic,22332,test,ARGSRW21187B992282,http://artwork-cdn.7static.com/static/img/slee...,22332_TRKHBDC12903CD8949.png,TRKHBDC12903CD8949,0,ARGSRW21187B992282,Lazybatusu,Lazybatusu,SOFLHUI12A58A7DDD4,Keepin On,New York,,
16749,TRDGJQT128F42632E3,Electronic,22337,test,ARJH2B61187B9B9465,http://artwork-cdn.7static.com/static/img/slee...,22337_TRDGJQT128F42632E3.png,TRDGJQT128F42632E3,2004,ARJH2B61187B9B9465,Funk D'Void,volume freak,SOFLOYC12A8C13730A,Endless,"Glasgow, Scotland",55.85780,-4.24251
16750,TRRESAT128F422A1E6,Electronic,22347,test,ARON0ER1187FB52E2A,http://artwork-cdn.7static.com/static/img/slee...,22347_TRRESAT128F422A1E6.png,TRRESAT128F422A1E6,0,ARON0ER1187FB52E2A,Arrakis,The Spice,SOFLBCB12A6D4FC865,The Spice,FR,46.71067,1.71819
16751,TRMFLQB12903CE56C2,Blues,22358,test,AR56P361187B9AC4DB,http://artwork-cdn.7static.com/static/img/slee...,22358_TRMFLQB12903CE56C2.png,TRMFLQB12903CE56C2,2001,AR56P361187B9AC4DB,Gary Moore,Back To The Blues,SOFLPHC12A58A76A02,Picture Of The Moon,"Belfast, Northern Ireland",54.59580,-5.93494


In [94]:
df_joined['country'] = df_joined.apply(LocationProcessor.get_country_from_coordinates, axis=1)

In [95]:
df_joined['country_name'] = df_joined['country'].apply(LocationProcessor.get_country_name)

df_joined['artist_location_country'] = df_joined.apply(
    lambda row: LocationProcessor.get_artist_country_name(row['artist_location']) if pd.isna(row['country_name']) else row['country_name'],
    axis=1
)
df_joined['artist_location_country'] = df_joined['artist_location_country'].apply(LocationProcessor.get_country_post_processing)

Unnamed: 0,msd_track_id,genre,album_index,set,msd_artist_id,image_url,name_image,track_id,year,artist_id,artist_name,release,song_id,title,artist_location,artist_latitude,artist_longitude,country
0,TRABKJU128F422A7FE,Metal,0,train,ARBD4QW1187FB42153,http://artwork-cdn.7static.com/static/img/slee...,0_TRABKJU128F422A7FE.png,TRABKJU128F422A7FE,2006,ARBD4QW1187FB42153,Scar Symmetry,Nuclear Blast Showdown 2006,SODUYOC12A67ADC8ED,The Illusionist,Sweden,62.19845,17.55142,
1,TRBLDQQ128F92E58B4,Rock,1,train,AR3RK011187FB3CE3B,http://artwork-cdn.7static.com/static/img/slee...,1_TRBLDQQ128F92E58B4.png,TRBLDQQ128F92E58B4,2007,AR3RK011187FB3CE3B,Eddie Vedder,Music For The Motion Picture Into The Wild,SODUDJI12A8C141513,Tuolumne,"Etats-Unis, Illinois, Evanston",42.05665,-87.68659,
2,TRDMMDE128F14A9052,Rock,2,train,ARJVTRE1187B9959C0,http://artwork-cdn.7static.com/static/img/slee...,2_TRDMMDE128F14A9052.png,TRDMMDE128F14A9052,1981,ARJVTRE1187B9959C0,Foghat,Girls To Chat And Boys To Bounce,SODUUHR12A6D4FB01C,Weekend Driver,"London, England",40.71455,-74.00712,
3,TRJOPZB128F4250E02,Rock,4,train,AR62BB21187B9AC83D,http://artwork-cdn.7static.com/static/img/slee...,4_TRJOPZB128F4250E02.png,TRJOPZB128F4250E02,2002,AR62BB21187B9AC83D,The Breeders,Title Tk,SODUEZB12A8C1348D5,Too Alive,Boston,42.31256,-71.08868,
4,TRJKBVL128F935567B,Rock,5,train,AR7GVOV1187B9B5FF1,http://artwork-cdn.7static.com/static/img/slee...,5_TRJKBVL128F935567B.png,TRJKBVL128F935567B,1990,AR7GVOV1187B9B5FF1,Sebadoh,III,SODURCL12A81358260,Ride The Darker Wave,"Amherst, MA",42.37522,-72.51984,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16748,TRKHBDC12903CD8949,Electronic,22332,test,ARGSRW21187B992282,http://artwork-cdn.7static.com/static/img/slee...,22332_TRKHBDC12903CD8949.png,TRKHBDC12903CD8949,0,ARGSRW21187B992282,Lazybatusu,Lazybatusu,SOFLHUI12A58A7DDD4,Keepin On,New York,,,
16749,TRDGJQT128F42632E3,Electronic,22337,test,ARJH2B61187B9B9465,http://artwork-cdn.7static.com/static/img/slee...,22337_TRDGJQT128F42632E3.png,TRDGJQT128F42632E3,2004,ARJH2B61187B9B9465,Funk D'Void,volume freak,SOFLOYC12A8C13730A,Endless,"Glasgow, Scotland",55.85780,-4.24251,
16750,TRRESAT128F422A1E6,Electronic,22347,test,ARON0ER1187FB52E2A,http://artwork-cdn.7static.com/static/img/slee...,22347_TRRESAT128F422A1E6.png,TRRESAT128F422A1E6,0,ARON0ER1187FB52E2A,Arrakis,The Spice,SOFLBCB12A6D4FC865,The Spice,FR,46.71067,1.71819,
16751,TRMFLQB12903CE56C2,Blues,22358,test,AR56P361187B9AC4DB,http://artwork-cdn.7static.com/static/img/slee...,22358_TRMFLQB12903CE56C2.png,TRMFLQB12903CE56C2,2001,AR56P361187B9AC4DB,Gary Moore,Back To The Blues,SOFLPHC12A58A76A02,Picture Of The Moon,"Belfast, Northern Ireland",54.59580,-5.93494,


In [605]:
df_joined.to_csv(f"{input_msdi_path}/msdi_mapping_and_metadata.csv", index=False)

## Stats

In [8]:
df_joined = pd.read_csv(f'{input_msdi_path}/msdi_mapping_and_metadata.csv', header=0)

### Genre distribution
Genre distribution, the most common genre in this data set is Rock with 23.22% of the images, followed by Electronic (15.19%) and Pop (10.98%)

In [90]:
value_counts = df_joined['genre'].value_counts()
value_counts_df = value_counts.reset_index()
value_counts_df['Percentage'] = round((value_counts_df['count'] * 100) / 16753, 3)
value_counts_df

Unnamed: 0,genre,count,Percentage
0,Rock,3890,23.22
1,Electronic,2544,15.185
2,Pop,1840,10.983
3,Jazz,1341,8.005
4,Rap,1232,7.354
5,Metal,1140,6.805
6,RnB,970,5.79
7,Reggae,829,4.948
8,Country,802,4.787
9,Folk,541,3.229


We have 63 different release year and approximately for 10% of the data there's not release year

In [91]:
year_counts = df_joined['year'].value_counts()
year_counts_df = year_counts.reset_index()
year_counts_df['Percentage'] = round((year_counts_df['count'] * 100) / len(df_final), 3)
year_counts_df

Unnamed: 0,year,count,Percentage
0,0,3078,9.857
1,2007,1191,3.814
2,2006,1071,3.430
3,2005,1016,3.254
4,2008,965,3.090
...,...,...,...
59,1958,1,0.003
60,1929,1,0.003
61,1935,1,0.003
62,1956,1,0.003


There's a even distribution of artists there are 8858 different artists, the most repeated one is Celine Dion

In [92]:
artirst_counts = df_joined['artist_id'].value_counts()
artirst_counts_df = artirst_counts.reset_index()
artirst_counts_df['Percentage'] = round((artirst_counts_df['count'] * 100) / len(df_final), 3)
artirst_counts_df = pd.merge(left=artirst_counts_df, right=df_joined, left_on='artist_id', right_on='artist_id', how='left').drop_duplicates(subset='artist_id')
artirst_counts_df= artirst_counts_df[['artist_id','artist_name','count','Percentage']]
artirst_counts_df

Unnamed: 0,artist_id,artist_name,count,Percentage
0,ARFWL8S1187B9B4B44,Céline Dion,19,0.061
19,ARZZ5ZR1187FB4D149,Air,17,0.054
36,ARNF13I1187FB562A5,Massive Attack,17,0.054
53,AR0IU4L1187FB4CCB9,Atomic Kitten,15,0.048
68,AR9I1L41187FB37F0E,Harry Connick_ Jr.,15,0.048
...,...,...,...,...
16748,AREXUHA1187B9923BF,Empire State Human,1,0.003
16749,ARDY3451187B9A0226,Skeeter Davis,1,0.003
16750,ARFCK201187FB469B3,Dennis Coffey,1,0.003
16751,AR6WICD1187B998708,Christian McBride,1,0.003


We have the artist location of the album for almost 60% of the data, this is 10230 images

In [14]:
artist_location_counts = df_joined['artist_location_country'].value_counts()
artist_location_counts_df = artist_location_counts.reset_index()
artist_location_counts_df['Percentage'] = round((artist_location_counts_df['count'] * 100) / len(df_joined), 3)
artist_location_counts_df

Unnamed: 0,artist_location_country,count,Percentage
0,,6523,38.936
1,United States,4643,27.714
2,United Kingdom,1669,9.962
3,Canada,719,4.292
4,Sweden,259,1.546
...,...,...,...
119,Uganda,1,0.006
120,Hong Kong,1,0.006
121,Burundi,1,0.006
122,Côte d'Ivoire,1,0.006


### Images stats
The size of the images are 200x200

In [37]:
folder_path = f"{input_msdi_path}/images_msdi"
unique_dimensions = set()
resolutions = []

for filename in os.listdir(folder_path):

    file_path = os.path.join(folder_path, filename)

    if file_path.lower().endswith(('.png')):
        try:
            # Open the image and get its dimensions
            with Image.open(file_path) as img:
                width, height = img.size
                # Add the dimensions to the set
                unique_dimensions.add((width, height))
                dpi = img.info.get('dpi', None)
                if dpi:
                    resolutions.append(dpi)
        except Exception as e:
            print(f"Error processing {filename}: {e}")



In [253]:
tuple_counts = Counter(resolutions)
print(f"Number of images with available defined {len(resolutions)}")
print(f"Dimension of images {unique_dimensions}")


Number of images with resolution defined 14287
Dimension of images {(200, 200)}


In [254]:
resolutions_data = [(k[0], k[1], v) for k, v in tuple_counts.items()]
resolution_df = pd.DataFrame(resolutions_data, columns=['x', 'y', 'count'])
resolution_df = resolution_df.sort_values('count', ascending=False)
resolution_df['Percentage'] = round((resolution_df['count'] * 100) / len(resolutions), 3)
resolution_df

Unnamed: 0,x,y,count,Percentage
0,300,300,8030,56.205
1,72,72,4174,29.215
2,96,96,1868,13.075
5,200,200,64,0.448
7,100,100,54,0.378
9,150,150,18,0.126
17,600,600,11,0.077
4,762,762,10,0.07
15,400,400,8,0.056
18,6,6,8,0.056
