# Data Preprocessing

In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import pickle
import os
from typing import Dict, List, Tuple, Set
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Seed NumPy's global RNG once, up front: the edge shuffle, the negative
# sampling and the random artist/album placeholder features all draw from it.
np.random.seed(42)

# Pipeline banner, emitted as one newline-separated block followed by a blank line.
print(
    "🎵 GNN DATA PREPROCESSING PIPELINE",
    "=" * 60,
    "🎯 Goal: Transform sampled playlists into GNN-ready graph structure",
    "📊 Input: Sampled playlist JSON file",
    "📈 Output: Node mappings, edge lists, features, train/val/test splits",
    "📋 Split Ratio: 70% train / 15% validation / 15% test",
    sep="\n",
)
print()

🎵 GNN DATA PREPROCESSING PIPELINE
🎯 Goal: Transform sampled playlists into GNN-ready graph structure
📊 Input: Sampled playlist JSON file
📈 Output: Node mappings, edge lists, features, train/val/test splits
📋 Split Ratio: 70% train / 15% validation / 15% test



## 1. Load and Explore Sampled Data

In [3]:
def load_sampled_data(file_path: str) -> Tuple[List[Dict], Dict]:
    """Load the sampled playlist dataset from a JSON file.

    Args:
        file_path: Path to a JSON document. The top-level keys ``playlists``,
            ``info`` and ``sampling_stats`` are read; each one is optional.

    Returns:
        A ``(playlists, metadata)`` tuple where ``playlists`` is the value of
        the ``playlists`` key (``[]`` if absent) and ``metadata`` is
        ``{'info': ..., 'sampling_stats': ...}`` (each ``{}`` if absent).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    print(f"📂 Loading sampled data from: {file_path}")

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would garble non-ASCII playlist/track names on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    playlists = data.get('playlists', [])
    info = data.get('info', {})
    sampling_stats = data.get('sampling_stats', {})

    print(f"✅ Loaded {len(playlists):,} playlists")
    print(f"📊 Sampling method: {info.get('sampling_method', 'unknown')}")

    return playlists, {'info': info, 'sampling_stats': sampling_stats}

def explore_data_structure(playlists: List[Dict]) -> Dict:
    """Print and return summary statistics for a list of playlist dicts.

    Args:
        playlists: Playlist dicts. The keys ``name``, ``pid`` and ``tracks``
            are read; each track dict is read for ``track_uri``,
            ``artist_uri`` and ``album_uri``. Missing keys are tolerated.

    Returns:
        Dict with the keys ``num_playlists``, ``num_unique_tracks``,
        ``num_unique_artists``, ``num_unique_albums``, ``num_unique_users``,
        ``avg_playlist_length``, ``min_playlist_length``,
        ``max_playlist_length`` and ``total_track_occurrences``. For an empty
        input the length statistics are all 0.
    """
    print("🔍 EXPLORING DATA STRUCTURE")
    print("=" * 40)

    # Basic statistics
    total_playlists = len(playlists)
    playlist_lengths = [len(p.get('tracks', [])) for p in playlists]

    # Distinct entities seen across all playlists
    all_tracks = set()
    all_artists = set()
    all_albums = set()
    user_playlist_count = Counter()

    for playlist in playlists:
        # Proxy "user" id: first whitespace-separated token of the playlist
        # name, falling back to a pid-derived bucket when the name is blank.
        # `or` also covers keys that are present but explicitly None.
        name = (playlist.get('name') or '').strip()
        user_id = name.split()[0] if name else f"user_{(playlist.get('pid') or 0) % 1000}"
        user_playlist_count[user_id] += 1

        for track in playlist.get('tracks', []):
            track_uri = track.get('track_uri', '')
            artist_uri = track.get('artist_uri', '')
            album_uri = track.get('album_uri', '')

            # Empty/missing URIs are not counted as entities.
            if track_uri:
                all_tracks.add(track_uri)
            if artist_uri:
                all_artists.add(artist_uri)
            if album_uri:
                all_albums.add(album_uri)

    stats = {
        'num_playlists': total_playlists,
        'num_unique_tracks': len(all_tracks),
        'num_unique_artists': len(all_artists),
        'num_unique_albums': len(all_albums),
        'num_unique_users': len(user_playlist_count),
        # Guard the empty case: np.mean([]) is NaN and min()/max() raise.
        'avg_playlist_length': np.mean(playlist_lengths) if playlist_lengths else 0.0,
        'min_playlist_length': min(playlist_lengths, default=0),
        'max_playlist_length': max(playlist_lengths, default=0),
        'total_track_occurrences': sum(playlist_lengths)
    }

    # Print statistics
    print(f"📊 Dataset Statistics:")
    print(f"   • Playlists: {stats['num_playlists']:,}")
    print(f"   • Unique tracks: {stats['num_unique_tracks']:,}")
    print(f"   • Unique artists: {stats['num_unique_artists']:,}")
    print(f"   • Unique albums: {stats['num_unique_albums']:,}")
    print(f"   • Unique users: {stats['num_unique_users']:,}")
    print(f"   • Avg playlist length: {stats['avg_playlist_length']:.1f}")
    print(f"   • Playlist length range: {stats['min_playlist_length']}-{stats['max_playlist_length']}")
    print(f"   • Total track occurrences: {stats['total_track_occurrences']:,}")
    print()

    return stats

In [4]:
# Location of the sampled playlist JSON, relative to the notebook's working directory.
sampled_data_path = "../data/processed/spotify_scaled_hybrid_7500.json"

# `playlists` is the list of playlist dicts; `metadata` bundles the file's
# 'info' and 'sampling_stats' sections. Both are notebook globals reused by later cells.
playlists, metadata = load_sampled_data(sampled_data_path)
# Summary counts (playlists, unique tracks/artists/albums/users, length stats).
data_stats = explore_data_structure(playlists)

📂 Loading sampled data from: ../data/processed/spotify_scaled_hybrid_7500.json
✅ Loaded 2,495 playlists
📊 Sampling method: scaled_hybrid_core_based_stratified_streaming
🔍 EXPLORING DATA STRUCTURE
📊 Dataset Statistics:
   • Playlists: 2,495
   • Unique tracks: 58,160
   • Unique artists: 17,069
   • Unique albums: 34,430
   • Unique users: 1,446
   • Avg playlist length: 44.4
   • Playlist length range: 10-100
   • Total track occurrences: 110,785



## 2. Create Entity Mappings

In [5]:
class EntityMapper:
    """Create bidirectional mappings between entities and integer IDs.

    Attributes are populated by :meth:`create_mappings`:

    * ``mappings[entity_type]``          -- ``{entity_key: int_id}``
    * ``reverse_mappings[entity_type]``  -- ``{int_id: entity_key}``
    * ``entity_counts[entity_type]``     -- number of distinct entities

    Entity types are ``playlists`` (keyed by ``str(pid)``), ``tracks``,
    ``artists``, ``albums`` (keyed by URI) and ``users`` (keyed by a proxy id
    derived from the playlist name). IDs are assigned in first-seen order.
    """

    def __init__(self):
        self.mappings = {}
        self.reverse_mappings = {}
        self.entity_counts = {}

    @staticmethod
    def _user_key(playlist: Dict) -> str:
        """Return the proxy user id for a playlist.

        The id is the first whitespace-separated token of the playlist name;
        nameless playlists fall back to ``user_<pid % 1000>``, treating a
        missing or ``None`` pid as 0 (the same default the exploration step
        uses) instead of raising ``TypeError``.
        """
        name = (playlist.get('name') or '').strip()
        if name:
            return name.split()[0]
        return f"user_{(playlist.get('pid') or 0) % 1000}"

    def create_mappings(self, playlists: List[Dict]) -> Dict:
        """Scan the playlists and build integer ID mappings for every entity type.

        Args:
            playlists: Playlist dicts; ``pid``, ``name`` and ``tracks`` are
                read, and ``track_uri`` / ``artist_uri`` / ``album_uri`` from
                each track. Playlists without a ``pid`` contribute users and
                tracks but no playlist node. Empty URIs are ignored.

        Returns:
            ``self.mappings`` -- ``{entity_type: {entity_key: int_id}}``.
        """
        print("🗺️  CREATING ENTITY MAPPINGS")
        print("=" * 40)

        # Insertion-ordered dicts used as ordered sets: only the keys matter,
        # so no per-entity payload is retained. The key order of this dict
        # also fixes the order in which entity types are processed below.
        entities = {
            'playlists': {},
            'tracks': {},
            'artists': {},
            'albums': {},
            'users': {}
        }
        uri_fields = (
            ('tracks', 'track_uri'),
            ('artists', 'artist_uri'),
            ('albums', 'album_uri'),
        )

        print("📊 Extracting entities...")

        for playlist in playlists:
            pid = playlist.get('pid')
            if pid is not None:
                entities['playlists'][str(pid)] = None

            entities['users'][self._user_key(playlist)] = None

            for track in playlist.get('tracks', []):
                for entity_type, field in uri_fields:
                    uri = track.get(field, '')
                    if uri:
                        entities[entity_type][uri] = None

        print("🔢 Creating integer ID mappings...")

        for entity_type, entity_dict in entities.items():
            entity_list = list(entity_dict)

            # Forward (entity -> int) and reverse (int -> entity) lookups.
            self.mappings[entity_type] = {
                entity: idx for idx, entity in enumerate(entity_list)
            }
            self.reverse_mappings[entity_type] = dict(enumerate(entity_list))
            self.entity_counts[entity_type] = len(entity_list)

            print(f"   ✅ {entity_type}: {len(entity_list):,} entities")

        print()
        return self.mappings

    def get_mapping_stats(self) -> Dict:
        """Return the per-type entity counts and their sum (``total_nodes``)."""
        return {
            'entity_counts': self.entity_counts,
            'total_nodes': sum(self.entity_counts.values())
        }

In [6]:
# Build the entity -> integer-id lookups. `mapper` is kept as a notebook global
# because later cells read its `entity_counts` (and `reverse_mappings`).
mapper = EntityMapper()
mappings = mapper.create_mappings(playlists)
mapping_stats = mapper.get_mapping_stats()

# `total_nodes` is the sum of the per-type entity counts.
print(f"📈 Total graph nodes: {mapping_stats['total_nodes']:,}")

🗺️  CREATING ENTITY MAPPINGS
📊 Extracting entities...
🔢 Creating integer ID mappings...
   ✅ playlists: 2,495 entities
   ✅ tracks: 58,160 entities
   ✅ artists: 17,069 entities
   ✅ albums: 34,430 entities
   ✅ users: 1,446 entities

📈 Total graph nodes: 113,600


## 3. Build Graph Edges

In [7]:
class GraphBuilder:
    """Build edges for the music recommendation graph.

    Args:
        mappings: ``{entity_type: {entity_key: int_id}}`` with the entity
            types ``playlists``, ``tracks``, ``artists``, ``albums``, ``users``.
        entity_counts: ``{entity_type: count}``; used for the density statistic.

    After :meth:`build_edges`, ``self.edges`` maps each edge type to an
    integer array of shape ``(num_edges, 2)`` holding ``[source_id, target_id]``.
    """

    EDGE_TYPES = (
        'playlist_track',   # Playlist contains track
        'track_artist',     # Track by artist
        'track_album',      # Track in album
        'user_playlist',    # User created playlist
        'playlist_user',    # Reverse of user_playlist
    )

    def __init__(self, mappings: Dict, entity_counts: Dict):
        self.mappings = mappings
        self.entity_counts = entity_counts
        self.edges = {}

    def build_edges(self, playlists: List[Dict]) -> Dict:
        """Build all edge types for the graph.

        Playlists whose ``str(pid)`` is not in the playlist mapping are
        skipped entirely; tracks whose URI is not in the track mapping
        contribute no edges. Duplicate edges are removed (``np.unique`` also
        sorts the rows).

        Args:
            playlists: Playlist dicts (``pid``, ``name``, ``tracks``).

        Returns:
            ``self.edges`` -- ``{edge_type: int64 array of shape (n, 2)}``.
        """
        print("🔗 BUILDING GRAPH EDGES")
        print("=" * 40)

        edge_lists = {edge_type: [] for edge_type in self.EDGE_TYPES}

        print("🔗 Extracting relationships...")

        for playlist in playlists:
            pid = playlist.get('pid')
            playlist_name = (playlist.get('name') or '').strip()

            playlist_id = self.mappings['playlists'].get(str(pid))
            if playlist_id is None:
                continue

            # Proxy user id: first token of the name, else a pid-derived bucket.
            user_name = playlist_name.split()[0] if playlist_name else f"user_{(pid or 0) % 1000}"
            user_id = self.mappings['users'].get(user_name)

            if user_id is not None:
                edge_lists['user_playlist'].append([user_id, playlist_id])
                edge_lists['playlist_user'].append([playlist_id, user_id])

            for track in playlist.get('tracks', []):
                track_id = self.mappings['tracks'].get(track.get('track_uri', ''))
                if track_id is None:
                    continue

                edge_lists['playlist_track'].append([playlist_id, track_id])

                artist_id = self.mappings['artists'].get(track.get('artist_uri', ''))
                if artist_id is not None:
                    edge_lists['track_artist'].append([track_id, artist_id])

                album_id = self.mappings['albums'].get(track.get('album_uri', ''))
                if album_id is not None:
                    edge_lists['track_album'].append([track_id, album_id])

        print("🔧 Processing edge lists...")

        self.edges = {}
        for edge_type, edge_list in edge_lists.items():
            if edge_list:
                # Explicit int64 keeps the dtype identical across platforms.
                unique_edges = np.unique(np.array(edge_list, dtype=np.int64), axis=0)
                self.edges[edge_type] = unique_edges
                print(f"   ✅ {edge_type}: {len(unique_edges):,} edges")
            else:
                # Empty edge sets keep the same integer dtype and (0, 2) shape
                # as populated ones (np.array([]) would default to float64).
                self.edges[edge_type] = np.empty((0, 2), dtype=np.int64)
                print(f"   ⚠️  {edge_type}: 0 edges")

        print()
        return self.edges

    def get_graph_statistics(self) -> Dict:
        """Return ``{edge_type: {'num_edges': int, 'density': float}}``.

        Density is ``num_edges / (num_playlists * num_tracks)`` for edge types
        whose name contains both ``playlist`` and ``track``; it is 0 for every
        other edge type and whenever that product is 0.
        """
        max_playlist_track_edges = (
            self.entity_counts.get('playlists', 1) * self.entity_counts.get('tracks', 1)
        )

        stats = {}
        for edge_type, edges in self.edges.items():
            is_playlist_track = 'playlist' in edge_type and 'track' in edge_type
            if is_playlist_track and max_playlist_track_edges:
                density = len(edges) / max_playlist_track_edges
            else:
                density = 0
            stats[edge_type] = {'num_edges': len(edges), 'density': density}

        return stats

In [8]:
# Build the edge arrays from the id mappings; `edges` maps each edge type to
# an array of [source_id, target_id] rows and is reused by the split/save cells.
graph_builder = GraphBuilder(mappings, mapper.entity_counts)
edges = graph_builder.build_edges(playlists)
graph_stats = graph_builder.get_graph_statistics()

# Print graph statistics
# NOTE: the loop variables `edge_type` and `stats` remain defined as notebook globals afterwards.
print("📊 Graph Statistics:")
for edge_type, stats in graph_stats.items():
    print(f"   • {edge_type}: {stats['num_edges']:,} edges")

🔗 BUILDING GRAPH EDGES
🔗 Extracting relationships...
🔧 Processing edge lists...
   ✅ playlist_track: 109,716 edges
   ✅ track_artist: 58,160 edges
   ✅ track_album: 58,160 edges
   ✅ user_playlist: 2,495 edges
   ✅ playlist_user: 2,495 edges

📊 Graph Statistics:
   • playlist_track: 109,716 edges
   • track_artist: 58,160 edges
   • track_album: 58,160 edges
   • user_playlist: 2,495 edges
   • playlist_user: 2,495 edges


## 4. Create Node Features

In [9]:
class FeatureExtractor:
    """Extract numeric feature matrices for graph nodes.

    Row ``i`` of every returned matrix belongs to the entity whose integer id
    is ``i`` in ``mappings``. The id -> key lookups are derived from the
    ``mappings`` passed to the constructor, so the class does not depend on
    any notebook-level global.

    Args:
        mappings: ``{entity_type: {entity_key: int_id}}``.
        entity_counts: ``{entity_type: count}``; determines the row counts.
    """

    # Rescaling applied to `modified_at`: (value - TIME_OFFSET) / TIME_SCALE.
    # NOTE(review): `modified_at` is presumably a Unix timestamp in seconds -- confirm.
    TIME_OFFSET = 1400000000
    TIME_SCALE = 100000000
    # Duration assumed for tracks without a positive `duration_ms` (3 minutes).
    DEFAULT_DURATION_MS = 180000

    def __init__(self, mappings: Dict, entity_counts: Dict):
        self.mappings = mappings
        self.entity_counts = entity_counts
        self.features = {}
        # int_id -> entity_key, inverted from the forward mappings.
        self.reverse_mappings = {
            entity_type: {idx: key for key, idx in mapping.items()}
            for entity_type, mapping in mappings.items()
        }

    @staticmethod
    def _is_true(value) -> bool:
        """Interpret a boolean-like field.

        String values are compared against ``"true"`` because
        ``bool("false")`` is ``True``. NOTE(review): the Spotify Million
        Playlist Dataset is understood to store ``collaborative`` as the
        strings "true"/"false" -- confirm against the sampled file.
        """
        if isinstance(value, str):
            return value.strip().lower() == 'true'
        return bool(value)

    @staticmethod
    def _user_key(playlist: Dict) -> str:
        """Proxy user id: first token of the name, else ``user_<pid % 1000>``."""
        name = (playlist.get('name') or '').strip()
        if name:
            return name.split()[0]
        return f"user_{(playlist.get('pid') or 0) % 1000}"

    def extract_playlist_features(self, playlists: List[Dict]) -> np.ndarray:
        """Extract features for playlist nodes.

        Returns:
            float32 array of shape ``(num_playlists, 6)`` with the columns
            ``[num_tracks, log1p(num_followers), is_collaborative,
            normalized_modified_at, has_name, name_length]``. Playlist ids
            with no matching entry in ``playlists`` get the values of an
            empty playlist.
        """
        print("🎵 Extracting playlist features...")

        num_playlists = self.entity_counts['playlists']
        id_to_pid = self.reverse_mappings['playlists']

        # Playlist mapping keys are str(pid), so index the raw data the same way.
        pid_to_playlist = {str(p.get('pid')): p for p in playlists}

        features = []
        for playlist_idx in range(num_playlists):
            playlist_data = pid_to_playlist.get(id_to_pid[playlist_idx], {})

            num_tracks = len(playlist_data.get('tracks', []))
            num_followers = playlist_data.get('num_followers', 0) or 0
            is_collaborative = 1 if self._is_true(playlist_data.get('collaborative', False)) else 0

            modified_at = playlist_data.get('modified_at', 0) or 0
            if modified_at > 0:
                normalized_time = (modified_at - self.TIME_OFFSET) / self.TIME_SCALE
            else:
                normalized_time = 0

            name = (playlist_data.get('name') or '').strip()

            features.append([
                num_tracks,
                np.log1p(num_followers),   # Log transform followers
                is_collaborative,
                normalized_time,
                1 if name else 0,          # has_name
                len(name),                 # name_length
            ])

        # reshape keeps the (0, 6) shape when there are no playlists.
        features_array = np.array(features, dtype=np.float32).reshape(-1, 6)
        print(f"   ✅ Playlist features shape: {features_array.shape}")

        return features_array

    def extract_track_features(self, playlists: List[Dict]) -> np.ndarray:
        """Extract features for track nodes.

        Returns:
            float32 array of shape ``(num_tracks, 4)`` with the columns
            ``[log1p(occurrence_count), mean_position, position_std,
            mean_duration_minutes]``. Positions are 0-based indices within a
            playlist's track list; occurrences are counted per appearance.
        """
        print("🎼 Extracting track features...")

        num_tracks = self.entity_counts['tracks']
        id_to_uri = self.reverse_mappings['tracks']

        track_stats = defaultdict(lambda: {
            'playlist_count': 0,
            'total_position': 0,
            'positions': [],
            'durations': []
        })

        for playlist in playlists:
            for pos, track in enumerate(playlist.get('tracks', [])):
                track_uri = track.get('track_uri', '')
                if not track_uri:
                    continue

                entry = track_stats[track_uri]
                entry['playlist_count'] += 1
                entry['total_position'] += pos
                entry['positions'].append(pos)

                duration = track.get('duration_ms', 0) or 0
                if duration > 0:
                    entry['durations'].append(duration)

        features = []
        for track_idx in range(num_tracks):
            stats = track_stats[id_to_uri[track_idx]]

            playlist_count = stats['playlist_count']
            avg_position = stats['total_position'] / max(playlist_count, 1)
            position_std = np.std(stats['positions']) if stats['positions'] else 0
            avg_duration = np.mean(stats['durations']) if stats['durations'] else self.DEFAULT_DURATION_MS

            features.append([
                np.log1p(playlist_count),  # Log of popularity
                avg_position,              # Average position in playlists
                position_std,              # Position variability
                avg_duration / 60000,      # Duration in minutes
            ])

        features_array = np.array(features, dtype=np.float32).reshape(-1, 4)
        print(f"   ✅ Track features shape: {features_array.shape}")

        return features_array

    def extract_user_features(self, playlists: List[Dict]) -> np.ndarray:
        """Extract features for user nodes.

        Returns:
            float32 array of shape ``(num_users, 4)`` with the columns
            ``[log1p(playlist_count), mean_playlist_length,
            log1p(unique_track_count), collaborative_ratio]``.
        """
        print("👥 Extracting user features...")

        num_users = self.entity_counts['users']
        id_to_user = self.reverse_mappings['users']

        user_stats = defaultdict(lambda: {
            'playlist_count': 0,
            'total_tracks': 0,
            'unique_tracks': set(),
            'collaborative_count': 0
        })

        for playlist in playlists:
            entry = user_stats[self._user_key(playlist)]
            tracks = playlist.get('tracks', [])

            entry['playlist_count'] += 1
            entry['total_tracks'] += len(tracks)
            # Skip empty URIs so a missing track_uri is not counted as a track.
            entry['unique_tracks'].update(
                uri for uri in (t.get('track_uri', '') for t in tracks) if uri
            )
            if self._is_true(playlist.get('collaborative', False)):
                entry['collaborative_count'] += 1

        features = []
        for user_idx in range(num_users):
            stats = user_stats[id_to_user[user_idx]]

            playlist_count = stats['playlist_count']
            denominator = max(playlist_count, 1)

            features.append([
                np.log1p(playlist_count),                 # Log of activity level
                stats['total_tracks'] / denominator,      # Average playlist length
                np.log1p(len(stats['unique_tracks'])),    # Log of music diversity
                stats['collaborative_count'] / denominator  # Collaboration tendency
            ])

        features_array = np.array(features, dtype=np.float32).reshape(-1, 4)
        print(f"   ✅ User features shape: {features_array.shape}")

        return features_array

    def extract_all_features(self, playlists: List[Dict]) -> Dict[str, np.ndarray]:
        """Extract features for all node types.

        Returns:
            Dict with the keys ``playlist``, ``track``, ``user``, ``artist``
            and ``album``. Artist and album features are random placeholders
            drawn from NumPy's global RNG (4 columns each), so they are only
            reproducible if that RNG was seeded beforehand.
        """
        print("🎨 EXTRACTING NODE FEATURES")
        print("=" * 40)

        features = {}

        features['playlist'] = self.extract_playlist_features(playlists)
        features['track'] = self.extract_track_features(playlists)
        features['user'] = self.extract_user_features(playlists)

        # Simple features for artists and albums (placeholder)
        features['artist'] = np.random.randn(self.entity_counts['artists'], 4).astype(np.float32)
        features['album'] = np.random.randn(self.entity_counts['albums'], 4).astype(np.float32)

        print(f"   ⚠️  Artist/Album features: Using random placeholders")
        print()

        return features

In [10]:
# Build one feature matrix per node type; `node_features` is keyed by
# 'playlist', 'track', 'user', 'artist' and 'album' and is written out by the save cell.
# Artist/album rows come from np.random.randn, so they depend on the seed set at the top.
feature_extractor = FeatureExtractor(mappings, mapper.entity_counts)
node_features = feature_extractor.extract_all_features(playlists)

🎨 EXTRACTING NODE FEATURES
🎵 Extracting playlist features...
   ✅ Playlist features shape: (2495, 6)
🎼 Extracting track features...
   ✅ Track features shape: (58160, 4)
👥 Extracting user features...
   ✅ User features shape: (1446, 4)
   ⚠️  Artist/Album features: Using random placeholders



## 5. Create Train/Validation/Test Splits (70/15/15)

In [11]:
class DataSplitter:
    """Create train/validation/test splits for the recommendation task.

    Args:
        edges: ``{edge_type: array of shape (n, 2)}``; only
            ``edges['playlist_track']`` is split.
        mappings: Entity id mappings (stored, not used by the methods below).
    """

    def __init__(self, edges: Dict, mappings: Dict):
        self.edges = edges
        self.mappings = mappings

    def create_playlist_track_splits(self, train_ratio: float = 0.7,
                                   val_ratio: float = 0.15,
                                   test_ratio: float = 0.15) -> Dict:
        """Randomly partition the playlist-track edges into train/val/test.

        Edges are shuffled with NumPy's global RNG. Train and validation get
        ``int(num_edges * ratio)`` edges each; the test split receives every
        remaining edge, so ``test_ratio`` is only recorded and reported.

        Args:
            train_ratio: Fraction of edges for training.
            val_ratio: Fraction of edges for validation.
            test_ratio: Nominal fraction for testing (see above).

        Returns:
            Dict with ``train_edges`` / ``val_edges`` / ``test_edges`` (edge
            rows), ``train_indices`` / ``val_indices`` / ``test_indices``
            (row indices into ``edges['playlist_track']``) and
            ``split_ratios`` (the three ratios as passed in).

        Raises:
            ValueError: If a ratio is negative or ``train_ratio + val_ratio``
                exceeds 1, which would yield overlapping or negative-size splits.
        """
        if min(train_ratio, val_ratio, test_ratio) < 0:
            raise ValueError("Split ratios must be non-negative")
        if train_ratio + val_ratio > 1 + 1e-9:
            raise ValueError("train_ratio + val_ratio must not exceed 1")

        # Header reflects the actual ratios; with the defaults it reads 70/15/15.
        ratio_label = f"{train_ratio * 100:.0f}/{val_ratio * 100:.0f}/{test_ratio * 100:.0f}"
        print(f"✂️  CREATING TRAIN/VALIDATION/TEST SPLITS ({ratio_label})")
        print("=" * 50)

        playlist_track_edges = self.edges['playlist_track']
        num_edges = len(playlist_track_edges)

        print(f"📊 Total playlist-track edges: {num_edges:,}")
        print(f"📊 Split ratios: {train_ratio:.0%} train, {val_ratio:.0%} val, {test_ratio:.0%} test")
        print(f"📊 This follows the standard Graph ML practice for robust evaluation")

        ratio_sum = train_ratio + val_ratio + test_ratio
        if abs(ratio_sum - 1.0) > 1e-6:
            print(f"⚠️  Ratios sum to {ratio_sum:.3f}, not 1; the test split receives all remaining edges")

        # Shuffle edges
        indices = np.random.permutation(num_edges)

        # Calculate split sizes
        train_size = int(num_edges * train_ratio)
        val_size = int(num_edges * val_ratio)
        test_size = num_edges - train_size - val_size

        def share(count: int) -> float:
            # Avoid ZeroDivisionError when there are no edges at all.
            return count / num_edges if num_edges else 0.0

        print(f"📈 Calculated split sizes:")
        print(f"   • Train: {train_size:,} edges ({share(train_size):.1%})")
        print(f"   • Validation: {val_size:,} edges ({share(val_size):.1%})")
        print(f"   • Test: {test_size:,} edges ({share(test_size):.1%})")

        # Create splits
        train_indices = indices[:train_size]
        val_indices = indices[train_size:train_size + val_size]
        test_indices = indices[train_size + val_size:]

        splits = {
            'train_edges': playlist_track_edges[train_indices],
            'val_edges': playlist_track_edges[val_indices],
            'test_edges': playlist_track_edges[test_indices],
            'train_indices': train_indices,
            'val_indices': val_indices,
            'test_indices': test_indices,
            'split_ratios': {
                'train': train_ratio,
                'val': val_ratio,
                'test': test_ratio
            }
        }

        print(f"✅ Final split sizes:")
        print(f"   • Train edges: {len(splits['train_edges']):,}")
        print(f"   • Validation edges: {len(splits['val_edges']):,}")
        print(f"   • Test edges: {len(splits['test_edges']):,}")
        print()

        return splits

    def create_negative_samples(self, positive_edges: np.ndarray,
                              num_playlists: int, num_tracks: int,
                              num_negative: int = None) -> np.ndarray:
        """Sample (playlist_id, track_id) pairs that are not in ``positive_edges``.

        Pairs are drawn uniformly with NumPy's global RNG and rejected when
        they appear in ``positive_edges`` or were already drawn, so the
        returned negatives are distinct. To obtain true non-edges, pass every
        known positive edge (not just one split) as ``positive_edges``.

        Args:
            positive_edges: Array of ``[playlist_id, track_id]`` rows to exclude.
            num_playlists: Exclusive upper bound for sampled playlist ids.
            num_tracks: Exclusive upper bound for sampled track ids.
            num_negative: Number of negatives wanted; defaults to
                ``len(positive_edges)``.

        Returns:
            int64 array of shape ``(k, 2)`` with ``k <= num_negative``. Fewer
            rows are returned (and a warning is printed) when the attempt
            budget of ``10 * num_negative`` draws is exhausted.
        """
        if num_negative is None:
            num_negative = len(positive_edges)

        empty = np.empty((0, 2), dtype=np.int64)
        if num_negative <= 0 or num_playlists <= 0 or num_tracks <= 0:
            return empty

        # Plain Python int tuples for O(1) membership tests.
        positive_set = {tuple(edge) for edge in np.asarray(positive_edges).tolist()}

        negative_edges = []
        already_drawn = set()
        max_attempts = num_negative * 10  # Prevent infinite loop
        attempts = 0

        while len(negative_edges) < num_negative and attempts < max_attempts:
            candidate = (int(np.random.randint(0, num_playlists)),
                         int(np.random.randint(0, num_tracks)))

            if candidate not in positive_set and candidate not in already_drawn:
                already_drawn.add(candidate)
                negative_edges.append(candidate)

            attempts += 1

        if len(negative_edges) < num_negative:
            print(f"⚠️  Only {len(negative_edges):,} of {num_negative:,} negative samples could be drawn")

        if not negative_edges:
            return empty
        return np.array(negative_edges, dtype=np.int64)


In [12]:
splitter = DataSplitter(edges, mappings)
splits = splitter.create_playlist_track_splits(train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)

# Create negative samples for evaluation.
# The exclusion set is the FULL playlist-track edge list rather than only the
# split being evaluated: a pair that is a real edge in train (or in the other
# evaluation split) must never be labelled as a negative. The number of
# negatives still matches the size of the corresponding positive split.
print("🔄 Creating negative samples for evaluation...")
negative_val = splitter.create_negative_samples(
    edges['playlist_track'],
    mapper.entity_counts['playlists'],
    mapper.entity_counts['tracks'],
    num_negative=len(splits['val_edges'])
)

negative_test = splitter.create_negative_samples(
    edges['playlist_track'],
    mapper.entity_counts['playlists'],
    mapper.entity_counts['tracks'],
    num_negative=len(splits['test_edges'])
)

print(f"✅ Negative validation samples: {len(negative_val):,}")
print(f"✅ Negative test samples: {len(negative_test):,}")
print(f"💡 Negative samples ensure balanced evaluation (1:1 pos:neg ratio)")

✂️  CREATING TRAIN/VALIDATION/TEST SPLITS (70/15/15)
📊 Total playlist-track edges: 109,716
📊 Split ratios: 70% train, 15% val, 15% test
📊 This follows the standard Graph ML practice for robust evaluation
📈 Calculated split sizes:
   • Train: 76,801 edges (70.0%)
   • Validation: 16,457 edges (15.0%)
   • Test: 16,458 edges (15.0%)
✅ Final split sizes:
   • Train edges: 76,801
   • Validation edges: 16,457
   • Test edges: 16,458

🔄 Creating negative samples for evaluation...
✅ Negative validation samples: 16,457
✅ Negative test samples: 16,458
💡 Negative samples ensure balanced evaluation (1:1 pos:neg ratio)


## 6. Save Preprocessed Data

In [13]:
def save_preprocessed_data(output_dir: str, mappings: Dict, edges: Dict,
                         features: Dict, splits: Dict,
                         negative_val: np.ndarray, negative_test: np.ndarray,
                         entity_counts: Dict) -> str:
    """Write all preprocessing artefacts into ``output_dir``.

    Files written:

    * ``mappings.pkl`` / ``entity_counts.pkl`` -- pickled dicts
    * ``edges.npz`` / ``features.npz`` -- one array per dict key
    * ``splits.npz`` -- the array entries of ``splits`` plus
      ``negative_val`` and ``negative_test``
    * ``metadata.json`` -- counts, shapes, split sizes and ``split_ratios``

    Args:
        output_dir: Target directory; created if missing.
        mappings: Entity id mappings.
        edges: ``{edge_type: array}``.
        features: ``{node_type: array}``.
        splits: Split dict containing ``train_edges``, ``val_edges``,
            ``test_edges`` and ``split_ratios``.
        negative_val: Negative validation edges.
        negative_test: Negative test edges.
        entity_counts: ``{entity_type: count}``.

    Returns:
        ``output_dir``, unchanged.
    """
    print("💾 SAVING PREPROCESSED DATA")
    print("=" * 40)

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Save mappings
    with open(f"{output_dir}/mappings.pkl", 'wb') as f:
        pickle.dump(mappings, f)
    print(f"✅ Saved mappings to {output_dir}/mappings.pkl")

    # Save entity counts
    with open(f"{output_dir}/entity_counts.pkl", 'wb') as f:
        pickle.dump(entity_counts, f)
    print(f"✅ Saved entity counts to {output_dir}/entity_counts.pkl")

    # Save edges
    np.savez(f"{output_dir}/edges.npz", **edges)
    print(f"✅ Saved edges to {output_dir}/edges.npz")

    # Save features
    np.savez(f"{output_dir}/features.npz", **features)
    print(f"✅ Saved features to {output_dir}/features.npz")

    # Save splits. Dict-valued entries (i.e. 'split_ratios') are left out of
    # the archive: np.savez would store them as pickled object arrays, which
    # makes the file unreadable with np.load(..., allow_pickle=False). The
    # ratios are persisted in metadata.json below instead.
    split_arrays = {
        key: value
        for key, value in {
            **splits,
            'negative_val': negative_val,
            'negative_test': negative_test
        }.items()
        if not isinstance(value, dict)
    }
    np.savez(f"{output_dir}/splits.npz", **split_arrays)
    print(f"✅ Saved splits to {output_dir}/splits.npz")

    # Save metadata
    run_metadata = {
        'created_at': datetime.now().isoformat(),
        'entity_counts': entity_counts,
        'feature_dimensions': {k: v.shape for k, v in features.items()},
        'edge_counts': {k: len(v) for k, v in edges.items()},
        'split_sizes': {
            'train': len(splits['train_edges']),
            'val': len(splits['val_edges']),
            'test': len(splits['test_edges'])
        },
        'split_ratios': splits['split_ratios'],
        'preprocessing_notes': {
            'split_strategy': '70/15/15 ratio following Graph ML best practices',
            'negative_sampling': '1:1 positive to negative ratio',
            'edge_type_focus': 'playlist-track edges for recommendation task'
        }
    }

    with open(f"{output_dir}/metadata.json", 'w', encoding='utf-8') as f:
        json.dump(run_metadata, f, indent=2)
    print(f"✅ Saved metadata to {output_dir}/metadata.json")

    print(f"\n🎉 All preprocessed data saved to: {output_dir}")
    return output_dir

In [14]:
# Destination folder for the GNN-ready artefacts (relative to the notebook's
# working directory); also read by the verification cell.
output_directory = "../data/processed/gnn_ready"

# Persist mappings, edges, node features, splits (with negatives) and metadata.
# The function returns the directory it wrote to.
saved_path = save_preprocessed_data(
    output_dir=output_directory,
    mappings=mappings,
    edges=edges,
    features=node_features,
    splits=splits,
    negative_val=negative_val,
    negative_test=negative_test,
    entity_counts=mapper.entity_counts
)

💾 SAVING PREPROCESSED DATA
✅ Saved mappings to ../data/processed/gnn_ready/mappings.pkl
✅ Saved entity counts to ../data/processed/gnn_ready/entity_counts.pkl
✅ Saved edges to ../data/processed/gnn_ready/edges.npz
✅ Saved features to ../data/processed/gnn_ready/features.npz
✅ Saved splits to ../data/processed/gnn_ready/splits.npz
✅ Saved metadata to ../data/processed/gnn_ready/metadata.json

🎉 All preprocessed data saved to: ../data/processed/gnn_ready


## 7. Verification and Summary

In [15]:
def verify_preprocessed_data(data_dir: str):
    """Report which artefact files exist in ``data_dir`` and summarise metadata.json.

    For each expected file the size in MB is printed, or a "Missing!" line if
    it is absent. The metadata summary is best-effort: any exception raised
    while reading or formatting it is caught and reported as a single warning
    line, after whatever summary lines were already printed. Returns ``None``.
    """
    print("✅ VERIFICATION SUMMARY")
    print("=" * 40)

    expected_files = (
        'mappings.pkl',
        'entity_counts.pkl',
        'edges.npz',
        'features.npz',
        'splits.npz',
        'metadata.json',
    )

    for file_name in expected_files:
        candidate = f"{data_dir}/{file_name}"
        if not os.path.exists(candidate):
            print(f"❌ {file_name}: Missing!")
            continue
        size_mb = os.path.getsize(candidate) / 1024 / 1024  # bytes -> MB
        print(f"✅ {file_name}: {size_mb:.2f} MB")

    # (template, value getter) pairs. Getters are evaluated lazily, one row at
    # a time, so a missing key only aborts the summary from that row onward.
    summary_rows = (
        ("   🎵 Playlists: {:,}", lambda m: (m['entity_counts']['playlists'],)),
        ("   🎼 Tracks: {:,}", lambda m: (m['entity_counts']['tracks'],)),
        ("   🎤 Artists: {:,}", lambda m: (m['entity_counts']['artists'],)),
        ("   💿 Albums: {:,}", lambda m: (m['entity_counts']['albums'],)),
        ("   👥 Users: {:,}", lambda m: (m['entity_counts']['users'],)),
        ("   🔗 Total edges: {:,}", lambda m: (sum(m['edge_counts'].values()),)),
        ("   📚 Training edges: {:,} ({:.0%})",
         lambda m: (m['split_sizes']['train'], m['split_ratios']['train'])),
        ("   🔍 Validation edges: {:,} ({:.0%})",
         lambda m: (m['split_sizes']['val'], m['split_ratios']['val'])),
        ("   🧪 Test edges: {:,} ({:.0%})",
         lambda m: (m['split_sizes']['test'], m['split_ratios']['test'])),
        ("   ⚖️  Split strategy: {}",
         lambda m: (m['preprocessing_notes']['split_strategy'],)),
    )

    try:
        with open(f"{data_dir}/metadata.json", 'r') as handle:
            summary = json.load(handle)

        print(f"\n📊 PREPROCESSING SUMMARY:")
        for template, pick in summary_rows:
            print(template.format(*pick(summary)))

    except Exception as e:
        print(f"⚠️  Could not load metadata: {e}")

In [18]:
# Check the files written to `output_directory` and print the metadata summary.
# NOTE(review): the recorded output of this cell reports 49,993 playlists, while
# the load cell in this notebook reports 2,495, and its execution count (18) is
# out of sequence -- the stored output appears to come from a different run.
# Re-run with Restart Kernel -> Run All before relying on the numbers shown.
verify_preprocessed_data(output_directory)

✅ VERIFICATION SUMMARY
✅ mappings.pkl: 25.60 MB
✅ entity_counts.pkl: 0.00 MB
✅ edges.npz: 46.05 MB
✅ features.npz: 10.47 MB
✅ splits.npz: 60.53 MB
✅ metadata.json: 0.00 MB

📊 PREPROCESSING SUMMARY:
   🎵 Playlists: 49,993
   🎼 Tracks: 356,998
   🎤 Artists: 72,209
   💿 Albums: 171,695
   👥 Users: 10,430
   🔗 Total edges: 3,017,685
   📚 Training edges: 1,542,592 (70%)
   🔍 Validation edges: 330,555 (15%)
   🧪 Test edges: 330,556 (15%)
   ⚖️  Split strategy: 70/15/15 ratio following Graph ML best practices
