# Million Song Dataset - Data Loading

This notebook loads and explores:
1. **Taste Profile Subset** (train_triplets.txt): User listening data
2. **Million Song Subset**: Song metadata from HDF5 files

## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import h5py
import os
from pathlib import Path
from tqdm import tqdm

## 0. Data Preview

Inspect the structure and column names of the input files (`train_triplets.txt` and HDF5 song files).

In [3]:
# Preview local file structures and column names
print("=== PREVIEW: train_triplets.txt ===")
try:
    # Read first line to verify format
    with open('../train_triplets.txt', 'r') as f:
        first_line = f.readline().strip()
    print(f"First line raw: {first_line}")
    print("Structure: No header. Columns: ['user_id', 'song_id', 'play_count']")
except Exception as e:
    print(f"Could not read train_triplets.txt: {e}")

print("\n=== PREVIEW: MillionSongSubset (HDF5) ===")
try:
    # Find a sample HDF5 file
    h5_files = list(Path('../MillionSongSubset').rglob('*.h5'))
    if h5_files:
        sample_h5 = h5_files[0]
        print(f"Inspecting sample: {sample_h5.name}")
        
        with h5py.File(sample_h5, 'r') as f:
            def print_details(name, obj):
                if isinstance(obj, h5py.Dataset):
                    print(f"\n[Dataset] {name}")
                    if obj.dtype.names:
                        print(f"  Columns: {list(obj.dtype.names)}")
                    else:
                        print(f"  Shape: {obj.shape}")
                        
            f.visititems(print_details)
    else:
        print("No .h5 files found in ../MillionSongSubset/")
except Exception as e:
    print(f"Error reading HDF5: {e}")

=== PREVIEW: train_triplets.txt ===
First line raw: b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOAKIMP12A8C130995	1
Structure: No header. Columns: ['user_id', 'song_id', 'play_count']

=== PREVIEW: MillionSongSubset (HDF5) ===
Inspecting sample: TRBBGOG128EF33EDCA.h5

[Dataset] analysis/bars_confidence
  Shape: (175,)

[Dataset] analysis/bars_start
  Shape: (175,)

[Dataset] analysis/beats_confidence
  Shape: (704,)

[Dataset] analysis/beats_start
  Shape: (704,)

[Dataset] analysis/sections_confidence
  Shape: (6,)

[Dataset] analysis/sections_start
  Shape: (6,)

[Dataset] analysis/segments_confidence
  Shape: (886,)

[Dataset] analysis/segments_loudness_max
  Shape: (886,)

[Dataset] analysis/segments_loudness_max_time
  Shape: (886,)

[Dataset] analysis/segments_loudness_start
  Shape: (886,)

[Dataset] analysis/segments_pitches
  Shape: (886, 12)

[Dataset] analysis/segments_start
  Shape: (886,)

[Dataset] analysis/segments_timbre
  Shape: (886, 12)

[Dataset] analysis/songs
  Colu

## 1. Load Data

Loads preprocessed data if available, otherwise processes from original sources.

In [4]:
# Check for preprocessed data
taste_profile_file = '../data/taste_profile.pkl'
songs_metadata_file = '../data/songs_metadata.pkl'
force_reload = True  # Set to True to regenerate dataset with all columns

if os.path.exists(taste_profile_file) and os.path.exists(songs_metadata_file) and not force_reload:
    # Load from file
    taste_profile = pd.read_pickle(taste_profile_file)
    songs_metadata = pd.read_pickle(songs_metadata_file)
    print(f"Loaded {len(taste_profile):,} records and {len(songs_metadata):,} songs from cache")
    
else:
    # Load from original sources
    print("Processing original data sources...")
    
    # Load taste profile
    taste_profile = pd.read_csv(
        '../train_triplets.txt', 
        sep='\t', 
        header=None, 
        names=['user_id', 'song_id', 'play_count']
    )
    print(f"Loaded {len(taste_profile):,} listening records")
    
    # Load song metadata
    def get_song_data_from_h5(file_path):
        try:
            with h5py.File(file_path, 'r') as h5:
                data = {}
                # Groups containing structured song data to extract
                groups_to_extract = ['metadata/songs', 'analysis/songs', 'musicbrainz/songs']
                
                for group_path in groups_to_extract:
                    if group_path in h5:
                        dataset = h5[group_path]
                        # Iterate over all columns in the dataset
                        for col_name in dataset.dtype.names:
                            val = dataset[col_name][0]
                            # Decode bytes to utf-8 string if necessary
                            if isinstance(val, bytes):
                                val = val.decode('utf-8')
                            data[col_name] = val
                
                # Extract artist terms (tags) if available
                if 'metadata/artist_terms' in h5:
                    terms_ds = h5['metadata/artist_terms']
                    # Get top 5 terms
                    terms = terms_ds[:5]
                    # Decode from bytes
                    decoded_terms = [t.decode('utf-8') for t in terms if isinstance(t, bytes)]
                    data['artist_terms'] = decoded_terms
                    
                return data
        except Exception as e:
            return None
    
    h5_files = list(Path('../MillionSongSubset').rglob('*.h5'))
    song_data_list = [data for h5_file in tqdm(h5_files, desc="Loading songs") 
                      if (data := get_song_data_from_h5(h5_file))]
    songs_metadata = pd.DataFrame(song_data_list)
    print(f"Loaded {len(songs_metadata):,} song metadata records")
    
    # Cache for future use
    os.makedirs('../data', exist_ok=True)
    taste_profile.to_pickle(taste_profile_file)
    songs_metadata.to_pickle(songs_metadata_file)
    print("Data cached successfully")

# Display samples
display(taste_profile.head())
display(songs_metadata.head())

Processing original data sources...
Loaded 48,373,586 listening records


Loading songs: 100%|██████████| 10000/10000 [01:03<00:00, 158.66it/s]


Loaded 10,000 song metadata records
Data cached successfully


Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


Unnamed: 0,analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,...,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id,idx_artist_mbtags,year,artist_terms
0,,51166,0.523704,0.354307,ARMDE3V1187FB3EEFB,,,,2ff26748-4e9c-4ac5-861e-997ed82efb7a,The Last Days Of Jesus,...,1,0.188,271.935,145.185,4,0.118,TRBBGOG128EF33EDCA,0,2004,"[deathrock, new wave, dark wave, gothic rock, ..."
1,,135117,0.561371,0.365534,ARN3O411187FB4D859,,,,a04ceabd-1f20-4d29-b579-24810ac2da88,Stephan Micus,...,1,0.508,296.89,85.816,1,1.0,TRBBGMD128F4229F92,0,2002,"[free improvisation, jazz, world music, folk, ..."
2,,4411,0.641198,0.448653,AR4TLW81187B99683D,,"Syracuse, NY",,0685ac4a-5cfc-408a-b391-903ea20e00bf,Martin Sexton,...,1,0.851,233.128,86.805,1,0.307,TRBBGDK128F427D450,0,1998,"[blue-eyed soul, folk-pop, folk rock, singer-s..."
3,,2259,0.66267,0.37897,AR0V36F1187FB366B1,53.41961,Ireland,-8.24055,fce5eb9a-95a7-4a54-b9e4-fe1b036fbe57,The Chieftains,...,1,0.737,226.656,105.539,7,0.977,TRBBGFR128F427B5BD,0,1977,"[irish folk, celtic, celtic fusion, folk rock,..."
4,,304,0.899935,0.604667,ARH6W4X1187B99274F,,"Oxford, UK",,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,...,0,0.303,194.206,111.629,3,1.0,TRBBGQN128F9310CBE,0,2009,"[british pop, experimental rock, rock, england..."


## 2. Dataset Overview

In [5]:
# Dataset statistics
print("TASTE PROFILE")
print(f"  Records: {len(taste_profile):,}")
print(f"  Users: {taste_profile['user_id'].nunique():,}")
print(f"  Songs: {taste_profile['song_id'].nunique():,}")
print(f"  Avg plays: {taste_profile['play_count'].mean():.1f}")

print("\nSONG METADATA")
print(f"  Songs: {len(songs_metadata):,}")
print(f"  Artists: {songs_metadata['artist_name'].nunique():,}")
print(f"  Year range: {songs_metadata['year'].min()}-{songs_metadata['year'].max()}")
print(f"  Avg duration: {songs_metadata['duration'].mean():.1f}s")

# Data overlap
overlap = set(taste_profile['song_id'].unique()) & set(songs_metadata['song_id'].unique())
coverage = len(overlap) / taste_profile['song_id'].nunique() * 100
print(f"\nOVERLAP")
print(f"  Common songs: {len(overlap):,} ({coverage:.1f}%)")

TASTE PROFILE
  Records: 48,373,586
  Users: 1,019,318
  Songs: 384,546
  Avg plays: 2.9

SONG METADATA
  Songs: 10,000
  Artists: 4,412
  Year range: 0-2010
  Avg duration: 238.5s

OVERLAP
  Common songs: 3,675 (1.0%)


## 3. Join Datasets

Merge taste profile with song metadata on song_id.

In [6]:
# Join datasets on song_id
merged_data = taste_profile.merge(
    songs_metadata, 
    on='song_id', 
    how='inner'
)

print(f"Joined dataset: {len(merged_data):,} records")
print(f"Columns: {merged_data.shape[1]}")
print(f"\nMemory usage: {merged_data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Sample
display(merged_data.head())

Joined dataset: 772,661 records
Columns: 56

Memory usage: 869.9 MB


Unnamed: 0,user_id,song_id,play_count,analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,...,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id,idx_artist_mbtags,year,artist_terms
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1,,30031,0.710446,0.540631,AR2UQQ51187B9AC816,,"Arles, France",...,0,0.318,188.285,165.006,1,0.034,TRAUCNU128F42671EB,0,1987,"[flamenco, soundtrack, folk, spanish, acoustic]"
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SODCXXY12AB0187452,2,,413978,0.867951,0.434137,ARXLMH011C8A415658,,"ALBUCRAZY, NEW MEXICO",...,1,0.5,207.337,119.612,4,0.432,TRBFXMJ12903CB50F6,0,2008,"[pop rap, crunk, rapcore, screamo, breakcore]"
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SOWPAXV12A67ADA046,18,,34988,0.692345,0.519288,ARUQ6301187FB54EBA,,,...,1,0.359,207.621,127.597,4,0.822,TRBBMHE128EF341D09,0,1988,"[pop rap, hip hop, hip house, new jack swing, ..."
3,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1,,262,0.871011,0.80483,ARTH9041187FB43E1F,,"St. Joseph, MO",...,0,0.315,288.502,180.168,4,1.0,TRAUURC128E078EC6E,0,2000,"[hip hop, rap, hardcore rap, club, soundtrack]"
4,b64cdd1a0bd907e5e00b39e345194768e330d652,SONJBQX12A6D4F8382,4,,466,0.902841,1.021256,ARF8HTQ1187B9AE693,,"Paris, France",...,0,0.419,311.025,111.201,4,1.0,TRAQVTO128F14696A6,0,1995,"[techno, electronica, electronic, pop, french]"


## 4. Save Datasets

Save all datasets in multiple formats.

In [7]:
# Create data directory
os.makedirs('../data', exist_ok=True)

# Save individual datasets
taste_profile.to_pickle('../data/taste_profile.pkl')
print(f"Saved: taste_profile ({len(taste_profile):,} rows)")

songs_metadata.to_pickle('../data/songs_metadata.pkl')
print(f"Saved: songs_metadata ({len(songs_metadata):,} rows)")

merged_data.to_pickle('../data/merged_data.pkl')
print(f"Saved: merged_data ({len(merged_data):,} rows)")

print("\nAll data saved successfully")

Saved: taste_profile (48,373,586 rows)
Saved: songs_metadata (10,000 rows)
Saved: merged_data (772,661 rows)

All data saved successfully
