# Million Song Dataset - Data Loading

This notebook loads and explores:
1. **Taste Profile Subset** (train_triplets.txt): User listening data
2. **Million Song Subset**: Song metadata from HDF5 files

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import h5py
import os
from pathlib import Path
from tqdm import tqdm

## 0. Data Preview

Inspect the structure and column names of the input files (`train_triplets.txt` and HDF5 song files).

In [8]:
# Preview local file structures and column names
print("=== PREVIEW: train_triplets.txt ===")
try:
    # Read first line to verify format
    with open('../train_triplets.txt', 'r') as f:
        first_line = f.readline().strip()
    print(f"First line raw: {first_line}")
    print("Structure: No header. Columns: ['user_id', 'song_id', 'play_count']")
except Exception as e:
    print(f"Could not read train_triplets.txt: {e}")

print("\n=== PREVIEW: MillionSongSubset (HDF5) ===")
try:
    # Find a sample HDF5 file
    h5_files = list(Path('../MillionSongSubset').rglob('*.h5'))
    if h5_files:
        sample_h5 = h5_files[0]
        print(f"Inspecting sample: {sample_h5.name}")
        
        with h5py.File(sample_h5, 'r') as f:
            def print_details(name, obj):
                if isinstance(obj, h5py.Dataset):
                    print(f"\n[Dataset] {name}")
                    if obj.dtype.names:
                        print(f"  Columns: {list(obj.dtype.names)}")
                    else:
                        print(f"  Shape: {obj.shape}")
                        
            f.visititems(print_details)
    else:
        print("No .h5 files found in ../MillionSongSubset/")
except Exception as e:
    print(f"Error reading HDF5: {e}")

=== PREVIEW: train_triplets.txt ===
First line raw: b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOAKIMP12A8C130995	1
Structure: No header. Columns: ['user_id', 'song_id', 'play_count']

=== PREVIEW: MillionSongSubset (HDF5) ===
Inspecting sample: TRAGGSJ128F42A5AE1.h5

[Dataset] analysis/bars_confidence
  Shape: (57,)

[Dataset] analysis/bars_start
  Shape: (57,)

[Dataset] analysis/beats_confidence
  Shape: (240,)

[Dataset] analysis/beats_start
  Shape: (240,)

[Dataset] analysis/sections_confidence
  Shape: (7,)

[Dataset] analysis/sections_start
  Shape: (7,)

[Dataset] analysis/segments_confidence
  Shape: (383,)

[Dataset] analysis/segments_loudness_max
  Shape: (383,)

[Dataset] analysis/segments_loudness_max_time
  Shape: (383,)

[Dataset] analysis/segments_loudness_start
  Shape: (383,)

[Dataset] analysis/segments_pitches
  Shape: (383, 12)

[Dataset] analysis/segments_start
  Shape: (383,)

[Dataset] analysis/segments_timbre
  Shape: (383, 12)

[Dataset] analysis/songs
  Column

## 1. Load Data

Loads preprocessed data if available, otherwise processes from original sources.

In [10]:
# Check for preprocessed data
taste_profile_file = '../data/taste_profile.pkl'
songs_metadata_file = '../data/songs_metadata.pkl'
force_reload = True  # Set to True to regenerate dataset with all columns

if os.path.exists(taste_profile_file) and os.path.exists(songs_metadata_file) and not force_reload:
    # Load from file
    taste_profile = pd.read_pickle(taste_profile_file)
    songs_metadata = pd.read_pickle(songs_metadata_file)
    print(f"Loaded {len(taste_profile):,} records and {len(songs_metadata):,} songs from cache")
    
else:
    # Load from original sources
    print("Processing original data sources...")
    
    # Load taste profile
    taste_profile = pd.read_csv(
        '../train_triplets.txt', 
        sep='\t', 
        header=None, 
        names=['user_id', 'song_id', 'play_count']
    )
    print(f"Loaded {len(taste_profile):,} listening records")
    
    # Load song metadata
    def get_song_data_from_h5(file_path):
        try:
            with h5py.File(file_path, 'r') as h5:
                data = {}
                # Groups containing structured song data to extract
                groups_to_extract = ['metadata/songs', 'analysis/songs', 'musicbrainz/songs']
                
                for group_path in groups_to_extract:
                    if group_path in h5:
                        dataset = h5[group_path]
                        # Iterate over all columns in the dataset
                        for col_name in dataset.dtype.names:
                            val = dataset[col_name][0]
                            # Decode bytes to utf-8 string if necessary
                            if isinstance(val, bytes):
                                val = val.decode('utf-8')
                            data[col_name] = val
                return data
        except Exception as e:
            return None
    
    h5_files = list(Path('../MillionSongSubset').rglob('*.h5'))
    song_data_list = [data for h5_file in tqdm(h5_files, desc="Loading songs") 
                      if (data := get_song_data_from_h5(h5_file))]
    songs_metadata = pd.DataFrame(song_data_list)
    print(f"Loaded {len(songs_metadata):,} song metadata records")
    
    # Cache for future use
    os.makedirs('../data', exist_ok=True)
    taste_profile.to_pickle(taste_profile_file)
    songs_metadata.to_pickle(songs_metadata_file)
    print("Data cached successfully")

# Display samples
display(taste_profile.head())
display(songs_metadata.head())

Processing original data sources...
Loaded 48,373,586 listening records


Loading songs: 100%|██████████| 10000/10000 [00:55<00:00, 180.25it/s]


Loaded 10,000 song metadata records
Data cached successfully


Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


Unnamed: 0,analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,...,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id,idx_artist_mbtags,year
0,,5020,0.688639,0.47878,ARISHTQ1187FB3E6F0,,,,837555ba-012e-45f1-9a9c-9628da13ee54,Ben E. King,...,-8.89,1,0.596,159.997,86.819,4,0.476,TRAGGSJ128F42A5AE1,0,0
1,,532781,0.444836,0.318726,ARTOCWH11F50C4CE12,,,,,The Accursed,...,-7.075,1,0.311,168.6,128.928,1,0.0,TRAGGEQ12903CC25DB,0,0
2,,434953,0.522424,0.417592,ARJJRZ41187FB4C856,,,,6dd5961a-16e7-4f03-8385-b251e12f9698,Floor,...,-7.033,1,0.298,202.368,160.598,4,0.0,TRAGGMY12903CD9533,0,2009
3,,912,0.571682,0.540008,AR3FE581187FB3F90A,,"Brooklyn, NY",,d2299099-e617-46dd-b013-ba0b4c72bc63,Fabolous,...,-5.189,1,0.08,212.869,200.435,4,1.0,TRAGGKN12903CCA8EE,0,0
4,,740906,0.540038,0.360053,AR1N6N61187FB4E25E,23.0833,"Havana, Cuba [Jesus Maria]",-82.4167,915f84de-eac2-4e78-99ad-9ab6e2c50af6,Mongo Santamaria,...,-8.674,0,0.696,182.503,118.295,1,0.0,TRAGGXK12903CDA84B,0,1975


## 2. Dataset Overview

In [3]:
# Dataset statistics
print("TASTE PROFILE")
print(f"  Records: {len(taste_profile):,}")
print(f"  Users: {taste_profile['user_id'].nunique():,}")
print(f"  Songs: {taste_profile['song_id'].nunique():,}")
print(f"  Avg plays: {taste_profile['play_count'].mean():.1f}")

print("\nSONG METADATA")
print(f"  Songs: {len(songs_metadata):,}")
print(f"  Artists: {songs_metadata['artist_name'].nunique():,}")
print(f"  Year range: {songs_metadata['year'].min()}-{songs_metadata['year'].max()}")
print(f"  Avg duration: {songs_metadata['duration'].mean():.1f}s")

# Data overlap
overlap = set(taste_profile['song_id'].unique()) & set(songs_metadata['song_id'].unique())
coverage = len(overlap) / taste_profile['song_id'].nunique() * 100
print(f"\nOVERLAP")
print(f"  Common songs: {len(overlap):,} ({coverage:.1f}%)")

TASTE PROFILE
  Records: 48,373,586
  Users: 1,019,318
  Songs: 384,546
  Avg plays: 2.9

SONG METADATA
  Songs: 10,000
  Artists: 4,412
  Year range: 0-2010
  Avg duration: 238.5s

OVERLAP
  Common songs: 3,675 (1.0%)


## 3. Join Datasets

Merge taste profile with song metadata on song_id.

In [4]:
# Join datasets on song_id
merged_data = taste_profile.merge(
    songs_metadata, 
    on='song_id', 
    how='inner'
)

print(f"Joined dataset: {len(merged_data):,} records")
print(f"Columns: {merged_data.shape[1]}")
print(f"\nMemory usage: {merged_data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Sample
display(merged_data.head())

Joined dataset: 772,661 records
Columns: 18

Memory usage: 384.3 MB


Unnamed: 0,user_id,song_id,play_count,title,artist_name,artist_id,release,year,duration,tempo,loudness,key,mode,time_signature,energy,danceability,artist_hotttnesss,song_hotttnesss
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1,Tu Quieres Volver,Gipsy Kings,AR2UQQ51187B9AC816,Greatest Hits,1987,194.87302,165.006,-8.403,5,0,1,0.0,0.0,0.540631,0.778821
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SODCXXY12AB0187452,2,Freaxxx,brokeNCYDE,ARXLMH011C8A415658,BC 13-EP,2008,214.9873,119.612,-11.588,1,1,4,0.0,0.0,0.434137,0.682113
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SOWPAXV12A67ADA046,18,Push It,Salt-N-Pepa,ARUQ6301187FB54EBA,Ultimate Girl Groups,1988,207.62077,127.597,-5.668,4,1,4,0.0,0.0,0.519288,0.80612
3,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1,I'm Back,Eminem,ARTH9041187FB43E1F,The Marshall Mathers LP,2000,312.2673,180.168,-3.989,11,0,4,0.0,0.0,0.80483,0.811708
4,b64cdd1a0bd907e5e00b39e345194768e330d652,SONJBQX12A6D4F8382,4,Da Funk,Daft Punk,ARF8HTQ1187B9AE693,Homework,1995,329.53424,111.201,-5.019,5,0,4,0.0,0.0,1.021256,0.862255


## 4. Save Datasets

Save all datasets in multiple formats.

In [5]:
# Create data directory
os.makedirs('../data', exist_ok=True)

# Save individual datasets
taste_profile.to_pickle('../data/taste_profile.pkl')
print(f"Saved: taste_profile ({len(taste_profile):,} rows)")

songs_metadata.to_pickle('../data/songs_metadata.pkl')
print(f"Saved: songs_metadata ({len(songs_metadata):,} rows)")

merged_data.to_pickle('../data/merged_data.pkl')
print(f"Saved: merged_data ({len(merged_data):,} rows)")

print("\nAll data saved successfully")

Saved: taste_profile (48,373,586 rows)
Saved: songs_metadata (10,000 rows)
Saved: merged_data (772,661 rows)

All data saved successfully
