# Million Song Dataset - Data Loading

This notebook loads and explores:
1. **Taste Profile Subset** (train_triplets.txt): User listening data
2. **Million Song Subset**: Song metadata from HDF5 files

## Import Libraries

In [6]:
import pandas as pd
import numpy as np
import h5py
import os
from pathlib import Path
from tqdm import tqdm

## 1. Load Data

Loads preprocessed data if available, otherwise processes from original sources.

In [7]:
# Check for preprocessed data
taste_profile_file = '../data/taste_profile.pkl'
songs_metadata_file = '../data/songs_metadata.pkl'

if os.path.exists(taste_profile_file) and os.path.exists(songs_metadata_file):
    # Load from file
    taste_profile = pd.read_pickle(taste_profile_file)
    songs_metadata = pd.read_pickle(songs_metadata_file)
    print(f"Loaded {len(taste_profile):,} records and {len(songs_metadata):,} songs from cache")
    
else:
    # Load from original sources
    print("Processing original data sources...")
    
    # Load taste profile
    taste_profile = pd.read_csv(
        '../train_triplets.txt', 
        sep='\t', 
        header=None, 
        names=['user_id', 'song_id', 'play_count']
    )
    print(f"Loaded {len(taste_profile):,} listening records")
    
    # Load song metadata
    def get_song_data_from_h5(file_path):
        try:
            with h5py.File(file_path, 'r') as h5:
                return {
                    'song_id': h5['metadata']['songs']['song_id'][0].decode('utf-8'),
                    'title': h5['metadata']['songs']['title'][0].decode('utf-8'),
                    'artist_name': h5['metadata']['songs']['artist_name'][0].decode('utf-8'),
                    'artist_id': h5['metadata']['songs']['artist_id'][0].decode('utf-8'),
                    'release': h5['metadata']['songs']['release'][0].decode('utf-8'),
                    'year': int(h5['musicbrainz']['songs']['year'][0]),
                    'duration': float(h5['analysis']['songs']['duration'][0]),
                    'tempo': float(h5['analysis']['songs']['tempo'][0]),
                    'loudness': float(h5['analysis']['songs']['loudness'][0]),
                    'key': int(h5['analysis']['songs']['key'][0]),
                    'mode': int(h5['analysis']['songs']['mode'][0]),
                    'time_signature': int(h5['analysis']['songs']['time_signature'][0]),
                    'energy': float(h5['analysis']['songs']['energy'][0]),
                    'danceability': float(h5['analysis']['songs']['danceability'][0]),
                    'artist_hotttnesss': float(h5['metadata']['songs']['artist_hotttnesss'][0]),
                    'song_hotttnesss': float(h5['metadata']['songs']['song_hotttnesss'][0])
                }
        except Exception as e:
            return None
    
    h5_files = list(Path('../MillionSongSubset').rglob('*.h5'))
    song_data_list = [data for h5_file in tqdm(h5_files, desc="Loading songs") 
                      if (data := get_song_data_from_h5(h5_file))]
    songs_metadata = pd.DataFrame(song_data_list)
    print(f"Loaded {len(songs_metadata):,} song metadata records")
    
    # Cache for future use
    os.makedirs('../data', exist_ok=True)
    taste_profile.to_pickle(taste_profile_file)
    songs_metadata.to_pickle(songs_metadata_file)
    print("Data cached successfully")

# Display samples
display(taste_profile.head())
display(songs_metadata.head())

Loaded 48,373,586 records and 10,000 songs from cache


Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


Unnamed: 0,song_id,title,artist_name,artist_id,release,year,duration,tempo,loudness,key,mode,time_signature,energy,danceability,artist_hotttnesss,song_hotttnesss
0,SOIDSSZ12A8C142A76,Ride On Time (Original Version),Black Box,AROWNZP1187FB3C028,NOW Dance Anthems,0,272.14322,118.834,-9.159,9,0,4,0.0,0.0,0.361992,0.212045
1,SOFRVYQ12A8C137BAC,Ombre Et Lumière,Vincent Bruley,ARQXGWX11F50C49BC7,Le Temps Suspendu,0,498.38975,150.867,-14.037,4,0,4,0.0,0.0,0.30585,
2,SOGQIEK12AB0186792,That Girl,Gary Morris,AR0NQD81187FB3AD13,THAT GIRL,0,305.10975,130.72,-9.636,10,0,4,0.0,0.0,0.286033,
3,SOUEDBC12AC90972E5,Warung Beach,John Digweed,AROZNFA1187B99D367,Warung Beach,2006,405.10649,127.995,-5.988,10,1,4,0.0,0.0,0.433928,
4,SOUZOPT12A58A79B94,Bad Reputation (Originally Performed by Thin L...,The Meatmen,AR00A6H1187FB5402A,Cover the Earth,0,170.4224,130.164,-5.681,1,1,5,0.0,0.0,0.395628,0.0


## 2. Dataset Overview

In [8]:
# Dataset statistics
print("TASTE PROFILE")
print(f"  Records: {len(taste_profile):,}")
print(f"  Users: {taste_profile['user_id'].nunique():,}")
print(f"  Songs: {taste_profile['song_id'].nunique():,}")
print(f"  Avg plays: {taste_profile['play_count'].mean():.1f}")

print("\nSONG METADATA")
print(f"  Songs: {len(songs_metadata):,}")
print(f"  Artists: {songs_metadata['artist_name'].nunique():,}")
print(f"  Year range: {songs_metadata['year'].min()}-{songs_metadata['year'].max()}")
print(f"  Avg duration: {songs_metadata['duration'].mean():.1f}s")

# Data overlap
overlap = set(taste_profile['song_id'].unique()) & set(songs_metadata['song_id'].unique())
coverage = len(overlap) / taste_profile['song_id'].nunique() * 100
print(f"\nOVERLAP")
print(f"  Common songs: {len(overlap):,} ({coverage:.1f}%)")

TASTE PROFILE
  Records: 48,373,586
  Users: 1,019,318
  Songs: 384,546
  Avg plays: 2.9

SONG METADATA
  Songs: 10,000
  Artists: 4,412
  Year range: 0-2010
  Avg duration: 238.5s

OVERLAP
  Common songs: 3,675 (1.0%)


## 3. Join Datasets

Merge taste profile with song metadata on song_id.

In [13]:
# Join datasets on song_id
merged_data = taste_profile.merge(
    songs_metadata, 
    on='song_id', 
    how='inner'
)

print(f"Joined dataset: {len(merged_data):,} records")
print(f"Columns: {merged_data.shape[1]}")
print(f"\nMemory usage: {merged_data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Sample
display(merged_data.head())

Joined dataset: 772,661 records
Columns: 18

Memory usage: 425.9 MB


Unnamed: 0,user_id,song_id,play_count,title,artist_name,artist_id,release,year,duration,tempo,loudness,key,mode,time_signature,energy,danceability,artist_hotttnesss,song_hotttnesss
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1,Tu Quieres Volver,Gipsy Kings,AR2UQQ51187B9AC816,Greatest Hits,1987,194.87302,165.006,-8.403,5,0,1,0.0,0.0,0.540631,0.778821
1,833c530ecda3d99deb8395f70400aa3999783d91,SOWEZSI12A81C21CE6,2,Tu Quieres Volver,Gipsy Kings,AR2UQQ51187B9AC816,Greatest Hits,1987,194.87302,165.006,-8.403,5,0,1,0.0,0.0,0.540631,0.778821
2,d6c5bd2b570b4faf8964d7ed04f3392ff505d2be,SOWEZSI12A81C21CE6,1,Tu Quieres Volver,Gipsy Kings,AR2UQQ51187B9AC816,Greatest Hits,1987,194.87302,165.006,-8.403,5,0,1,0.0,0.0,0.540631,0.778821
3,724534729c9f5dc72a009269c2c225883e4775d2,SOWEZSI12A81C21CE6,1,Tu Quieres Volver,Gipsy Kings,AR2UQQ51187B9AC816,Greatest Hits,1987,194.87302,165.006,-8.403,5,0,1,0.0,0.0,0.540631,0.778821
4,ee7aa84c164038c963cfd02a7e52a5598aa470c3,SOWEZSI12A81C21CE6,2,Tu Quieres Volver,Gipsy Kings,AR2UQQ51187B9AC816,Greatest Hits,1987,194.87302,165.006,-8.403,5,0,1,0.0,0.0,0.540631,0.778821


## 4. Save Datasets

Save all datasets in multiple formats.

In [None]:
# Create data directory
os.makedirs('../data', exist_ok=True)

# Save individual datasets
taste_profile.to_pickle('../data/taste_profile.pkl')
print(f"Saved: taste_profile ({len(taste_profile):,} rows)")

songs_metadata.to_pickle('../data/songs_metadata.pkl')
print(f"Saved: songs_metadata ({len(songs_metadata):,} rows)")

merged_data.to_pickle('../data/merged_data.pkl')
print(f"Saved: merged_data ({len(merged_data):,} rows)")

print("\nAll data saved successfully")

KeyboardInterrupt: 