Step 1: Install Synthetic Data Vault

In [4]:
import sdv

print(sdv.version.public)

1.15.0


Step 2: Prepare the Data and Metadata

In [5]:
import pandas as pd
from sdv.metadata import MultiTableMetadata

# Example data for Users table
users_data = pd.DataFrame({
    'user_id': range(1, 101),
    'spotify_id': [f'spotify_{i}' for i in range(1, 101)],
    'name': [f'user_{i}' for i in range(1, 101)],
    'email': [f'user_{i}@example.com' for i in range(1, 101)],
    'country': ['Country'] * 100,
    'age': [20 + i % 10 for i in range(1, 101)]
})

# Example data for Tracks table
tracks_data = pd.DataFrame({
    'track_id': range(1, 201),
    'title': [f'track_{i}' for i in range(1, 201)],
    'artist': [f'artist_{i % 50}' for i in range(1, 201)],
    'genre': ['genre'] * 200,
    'release_date': pd.date_range(start='1/1/2020', periods=200)
})

# Example data for UserTrackInteractions table
interactions_data = pd.DataFrame({
    'interaction_id': range(1, 501),
    'user_id': [i % 100 + 1 for i in range(1, 501)],
    'track_id': [i % 200 + 1 for i in range(1, 501)],
    'rating': [i % 5 + 1 for i in range(1, 501)],
    'timestamp': pd.date_range(start='1/1/2021', periods=500, freq='H')
})

# Define metadata
metadata_dict = {
    "METADATA_SPEC_VERSION": "MULTI_TABLE_V1",
    "tables": {
        "Users": {
            "primary_key": "user_id",
            "columns": {
                "user_id": {"sdtype": "id"},
                "spotify_id": {"sdtype": "text"},
                "name": {"sdtype": "text"},
                "email": {"sdtype": "email"},
                "country": {"sdtype": "categorical"},
                "age": {"sdtype": "numerical"}
            }
        },
        "Tracks": {
            "primary_key": "track_id",
            "columns": {
                "track_id": {"sdtype": "id"},
                "title": {"sdtype": "text"},
                "artist": {"sdtype": "text"},
                "genre": {"sdtype": "categorical"},
                "release_date": {"sdtype": "datetime"}
            }
        },
        "UserTrackInteractions": {
            "primary_key": "interaction_id",
            "columns": {
                "interaction_id": {"sdtype": "id"},
                "user_id": {"sdtype": "id"},
                "track_id": {"sdtype": "id"},
                "rating": {"sdtype": "numerical"},
                "timestamp": {"sdtype": "datetime"}
            }
        }
    },
    "relationships": [
        {
            "parent_table_name": "Users",
            "parent_primary_key": "user_id",
            "child_table_name": "UserTrackInteractions",
            "child_foreign_key": "user_id"
        },
        {
            "parent_table_name": "Tracks",
            "parent_primary_key": "track_id",
            "child_table_name": "UserTrackInteractions",
            "child_foreign_key": "track_id"
        }
    ]
}

metadata = MultiTableMetadata.load_from_dict(metadata_dict)


Step 3: Create a Dataset Dictionary

In [6]:
data = {
    "Users": users_data,
    "Tracks": tracks_data,
    "UserTrackInteractions": interactions_data
}


In [8]:
from sdv.multi_table import HMASynthesizer

synthesizer = HMASynthesizer(metadata)
synthesizer.fit(data)

synthetic_data = synthesizer.sample()


Preprocess Tables: 100%|██████████| 3/3 [00:02<00:00,  1.29it/s]



Learning relationships:


(1/2) Tables 'Users' and 'UserTrackInteractions' ('user_id'): 100%|██████████| 100/100 [00:06<00:00, 14.72it/s]
(2/2) Tables 'Tracks' and 'UserTrackInteractions' ('track_id'): 100%|██████████| 200/200 [00:11<00:00, 16.92it/s]





Modeling Tables: 100%|██████████| 3/3 [00:00<00:00,  7.01it/s]


In [9]:
synthetic_data["Users"].head()

Unnamed: 0,user_id,spotify_id,name,email,country,age
0,248989733,Drive he should. Exist old sing occur truth me...,According energy kid individual feeling. Kid m...,caitlinsnyder@example.com,Country,24
1,852025845,Character understand role group blue though or...,Pretty teacher card rule rock either as. Air a...,jacob89@example.net,Country,22
2,370288948,Pretty from future happen already finish. Than...,Political heart hand heart. Stay race finally ...,dkirby@example.com,Country,23
3,762713512,All must draw.\nShake race campaign. Foot tech...,Ok water affect try. Situation cell wall piece...,nathan96@example.net,Country,22
4,647592843,Much head through.\nFamily treat line have tou...,Add film activity imagine star usually though....,franciscocollins@example.com,Country,23


In [10]:
synthetic_users = synthetic_data['Users']
synthetic_tracks = synthetic_data['Tracks']
synthetic_interactions = synthetic_data['UserTrackInteractions']

synthetic_users.to_csv('synthetic_users.csv', index=False)
synthetic_tracks.to_csv('synthetic_tracks.csv', index=False)
synthetic_interactions.to_csv('synthetic_interactions.csv', index=False)
