In [None]:
import pandas as pd
import h5py
import numpy as np
import glob
import os
import matplotlib.pyplot as plt

In [None]:
# Materialise the list of every track file (*.h5) found three directory
# levels below the MillionSongSubset root, then show how many were found.
dataset_path = list(glob.iglob('/opt/workspace/data/MillionSongSubset/*/*/*/*.h5'))
len(dataset_path)

In [None]:
# Total on-disk size of the dataset: add up the byte size of every entry in
# dataset_path that is a regular file, then report it in units of 1024**3 bytes.
total_size = sum(
    os.path.getsize(path) for path in dataset_path if os.path.isfile(path)
)
print("Total size of files: {:.2f} GB".format(total_size / 1024 ** 3))

In [None]:
# Build a DataFrame holding the first '/metadata/songs' record of every
# HDF5 file in the MillionSongSubset dataset (one row per file).
dataset_path = '/opt/workspace/data/MillionSongSubset/*/*/*/*.h5'

# one dict per file, keyed by the field names of the compound dataset
metadata_rows = []
for h5_path in glob.glob(dataset_path):
    with h5py.File(h5_path, 'r') as h5:
        songs = h5['/metadata/songs']
        # take element 0 of each named field; field order defines column order
        metadata_rows.append(
            {field: songs[field][0] for field in songs.dtype.names}
        )

metadata_df = pd.DataFrame(metadata_rows)
metadata_df.head()

In [None]:
# Sort the metadata table by hotttnesss in descending order and show the top 10 rows.
# This cell uses `metadata_df` (built in the cell above) rather than `df`:
# `df` is only created by a later cell, so referencing it here raises
# NameError on a fresh kernel (Restart Kernel -> Run All).
# 'song_hotttnesss' is the field name read from '/metadata/songs'; rows whose
# score is NaN are placed last by sort_values' default na_position.
metadata_by_hotttnesss = metadata_df.sort_values(by='song_hotttnesss', ascending=False)
metadata_by_hotttnesss.head(10)

In [None]:
"""
Types of the genre:
b: Blues
c: Classical
e: Electronic
f: Electronic
j: Jazz
m: Metal
p: Pop
r: Hip-hop/Rap
s: Soul/R&B
r: Rock
"""
# Print the raw genre column, then the distinct values it contains.
# NOTE(review): the genre legend in the string above lists the key 'r' twice
# (Hip-hop/Rap and Rock) and gives 'Electronic' for both 'e' and 'f' --
# confirm the intended mapping against the dataset documentation.
genre_column = metadata_df['genre']
print(genre_column)
unique_genres = genre_column.unique()
print(unique_genres)

In [None]:
# Define the path to the dataset
dataset_path = '/opt/workspace/data/MillionSongSubset/*/*/*/*.h5'

# One row per usable song: [song_id, title, song_hotttnesss, year, genre]
table_rows = []

# Loop through each file in the dataset
for file_path in glob.glob(dataset_path):
    with h5py.File(file_path, 'r') as f:
        songs = f['/metadata/songs']
        song_hotttnesss = songs['song_hotttnesss'][:]
        song_id = songs['song_id'][:]
        genre = songs['genre'][:]
        title = songs['title'][:]
        year = f['/musicbrainz/songs']['year'][:]

        for sid, name, hotttnesss, yr, gen in zip(song_id, title, song_hotttnesss, year, genre):
            # Keep only songs that have a hotttnesss score (not NaN) and a
            # non-zero year. A year of 0 is skipped here, so no "N/A"
            # placeholder is ever needed and the year can stay an integer
            # instead of going through a str -> pd.to_numeric round-trip.
            if np.isnan(hotttnesss) or not yr:
                continue
            table_rows.append([sid, name.decode("utf-8"), hotttnesss, int(yr), gen])

# Sort the list of rows by song_hotttnesss in descending order
table_rows.sort(key=lambda row: row[2], reverse=True)

# Convert the list of rows to a pandas DataFrame ('Year' is already numeric)
df = pd.DataFrame(table_rows, columns=['Song ID', 'Song Name', 'Song Hotttnesss', 'Year', 'Genre'])

# Create a bar chart of the maximum song_hotttnesss by year
fig, ax = plt.subplots(figsize=(10, 5))
max_hotttnesss = df.groupby('Year')['Song Hotttnesss'].max()
max_hotttnesss.plot(kind='bar', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Song Hotttnesss')
ax.set_title('Maximum Song Hotttnesss by Year')
ax.set_ylim([0, 1])
plt.show()


In [None]:
# Re-order the song table from hottest to least hot and preview the top ten rows.
df = df.sort_values(by='Song Hotttnesss', ascending=False)
df.head(n=10)

In [None]:
import h5py
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# set path to MillionSongSubset dataset
dataset_path = '/opt/workspace/data/MillionSongSubset/*/*/*/*.h5'

# a song counts as "popular" when its hotttnesss is at least this value
popular_threshold = 0.5

# extract song metadata and year information
# one row per usable song: [song_id, song_hotttnesss, year, genre]
table_rows = []
for file_path in glob.glob(dataset_path):
    with h5py.File(file_path, 'r') as f:
        songs = f['/metadata/songs']
        song_hotttnesss = songs['song_hotttnesss'][:]
        song_id = songs['song_id'][:]
        genre = songs['genre'][:]
        year = f['/musicbrainz/songs']['year'][:]

        for sid, hotttnesss, yr, gen in zip(song_id, song_hotttnesss, year, genre):
            # Skip songs without a hotttnesss score (NaN) or with year 0.
            # Because year 0 never gets past this check, there is no "N/A"
            # case to handle and the year is stored directly as an integer
            # rather than as a string converted back with pd.to_numeric.
            if np.isnan(hotttnesss) or not yr:
                continue
            table_rows.append([sid, hotttnesss, int(yr), gen])

# sort songs by hotttnesss in descending order
table_rows.sort(key=lambda row: row[1], reverse=True)

# create dataframe from song metadata and year information
df = pd.DataFrame(table_rows, columns=['Song ID', 'Song Hotttnesss', 'Year', 'Genre'])

# count the number of popular songs by year
popularity_by_year = df[df['Song Hotttnesss'] >= popular_threshold].groupby('Year').size()

# create bar chart of the number of popular songs by year
fig, ax = plt.subplots(figsize=(10, 5))
popularity_by_year.plot(kind='bar', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Popular Songs')
ax.set_title('Trend in Popularity of Songs over Time')
plt.show()
