In [1]:
# Essential Imports
from lib import io_ as io
from time import monotonic
from rich.progress import track
from numpy import nan
import pandas as pd
import config
import clean_master_config

# For pretty CLI
# NOTE: this import rebinds the name `print`, shadowing the built-in for every later cell.
from rich import print
from rich.console import Console
# Shared Console instance; later cells call console.log(...) on it for progress messages.
console = Console()
console.clear()

In [2]:
### Getting started ###
# Set up output folders, timers, log containers and path constants used by the rest of the notebook.

io.generate_folders()   # Generate folders to write all outputs (as specified in config.py)
master_start = monotonic()  # Start a timer for the whole run (monotonic clock, in seconds)

OUTPUT_LOG = {} # Log discrepancies
TIME_LOGS = {}  # Maps step labels (e.g. 'MB_start', 'MB_end') to monotonic() timestamps

# Loading ENV variables
console.log("Loading ENV variables...")
# Disabled loader; the settings are read from the `config` module directly below instead.
# ENV = io.get_config()

# Local aliases for the settings defined in config.py
MLHD_ROOT = config.MLHD_ROOT
WRITE_ROOT = config.WRITE_ROOT
LOG_WRITE_PATH = config.LOG_WRITE_PATH
LOG_EPOCH = config.LOG_EPOCH

# 1 Dimensional list of MLHD file paths
console.log("Generating MLHD Paths...")
MLHD_PATHS = io.generate_paths(MLHD_ROOT)

In [3]:
# %%time
### LOADING MB TABLES ###
# Each MusicBrainz table is read from parquet and immediately indexed by its
# lookup-key column, so no in-place mutation is needed afterwards.

TIME_LOGS['MB_start'] = monotonic()

console.log('loading recording gids...')
MB_rec_gid = pd.read_parquet('warehouse/MB_tables/recording_gid.parquet').set_index('gid')

console.log('loading recording redirects...')
MB_rec_redirects = pd.read_parquet('warehouse/MB_tables/recording_redirects.parquet').set_index('old')

console.log('loading recording canonical MBIDs...')
MB_rec_canonical = pd.read_parquet('warehouse/MB_tables/recording_canonical.parquet').set_index('old')

console.log('loading artist credit gids...')
MB_artist_credit_list = pd.read_parquet('warehouse/MB_tables/artist_credit_gid.parquet').set_index('rec_gid')

# Plain Python set of every recording gid, for fast `in` membership checks
rec_gid_set = set(MB_rec_gid.index)

TIME_LOGS['MB_end'] = monotonic()
console.log(
    "loaded MB tables. Took {} seconds".format(
        round(TIME_LOGS['MB_end'] - TIME_LOGS['MB_start'], 2)
    )
)

# Testing Functions with Sample Data

In [33]:
# %%time
# Load a single MLHD file to experiment on; index 50 is presumably an arbitrary sample — TODO confirm.
df = io.load_path(MLHD_PATHS[50])
df

Unnamed: 0,timestamp,artist_MBID,release_MBID,recording_MBID
0,1242583679,ba85753c-671a-409e-b813-1e3be41e2a2e,0a2d7193-fc4b-418c-8581-08d2695f884a,0d7960f0-ee33-4868-8e9e-7c705558b6e5
1,1242583969,f660d7e2-a3bd-4456-a7be-86ec139c1016,6ecac165-0267-3822-b995-cda1282ea5b5,6afc8617-d545-4629-8151-9ee9d71c4742
2,1242584191,371f152d-1643-4b54-b32b-dd13d4c23442,59c844ca-4c11-452d-8330-c107892319eb,e2477990-d9b1-43b6-b8db-be03e43559c4
3,1242584530,98fb7792-01fa-4ed1-a15d-20077a47210f,,1fdcf214-b4d5-4490-8626-5afeda04b73d
4,1242584820,d50a4b89-ff1f-4659-9fde-f76f8d5b3c89,,657cf27c-8f4a-4758-aaaa-bd5686d7e103
...,...,...,...,...
105041,1362539926,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,724801b2-8540-4619-8c90-2d3ad30275da
105042,1362540203,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,94c5fb28-66e1-49cf-8cdd-3d876683eef6
105043,1362540416,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,2de962ae-b1e0-46d1-80d8-25dfe55f87ff
105044,1362540609,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,


In [40]:

# df['recording_MBID_cleaned'] = df.recording_MBID.map(
#         lambda x: io.replace(x, MB_rec_redirects, 'new')
#         if x not in rec_gid_set else x)

# Flag every row whose recording_MBID is present in rec_gid_set (vectorised membership test).
# NOTE(review): despite the column name, True here means the MBID WAS found in
# rec_gid_set — confirm whether the name or the condition is the intended one.
df['recording_MBID_uncleaned'] = df.recording_MBID.isin(rec_gid_set)

# Show the rows whose recording_MBID is NOT in rec_gid_set.
# `~` is the element-wise boolean negation operator for a mask.
df[~df['recording_MBID_uncleaned']]
# df

array(['0d7960f0-ee33-4868-8e9e-7c705558b6e5',
       'e2477990-d9b1-43b6-b8db-be03e43559c4',
       '5d73a81f-51fe-459a-bdcc-398fb8b54df8', ...,
       'aa604d7c-48ab-43d7-9d80-3a66f717d393',
       '3118dc17-6c96-4bfa-97b9-938402793e2f',
       '2c4e4d4b-b610-4434-a239-6f6f40aff3d2'], dtype=object)

In [32]:
# Display the sample frame.
# NOTE(review): this cell's execution count (32) precedes the cells above it, and the output
# below shows a recording_MBID_cleaned column that only the commented-out mapping would
# create — the output is from an earlier run; re-run top to bottom to refresh it.
df


Unnamed: 0,timestamp,artist_MBID,release_MBID,recording_MBID,recording_MBID_cleaned
0,1242583679,ba85753c-671a-409e-b813-1e3be41e2a2e,0a2d7193-fc4b-418c-8581-08d2695f884a,0d7960f0-ee33-4868-8e9e-7c705558b6e5,a0456176-7a99-4773-adba-f61d6793785d
1,1242583969,f660d7e2-a3bd-4456-a7be-86ec139c1016,6ecac165-0267-3822-b995-cda1282ea5b5,6afc8617-d545-4629-8151-9ee9d71c4742,6afc8617-d545-4629-8151-9ee9d71c4742
2,1242584191,371f152d-1643-4b54-b32b-dd13d4c23442,59c844ca-4c11-452d-8330-c107892319eb,e2477990-d9b1-43b6-b8db-be03e43559c4,
3,1242584530,98fb7792-01fa-4ed1-a15d-20077a47210f,,1fdcf214-b4d5-4490-8626-5afeda04b73d,1fdcf214-b4d5-4490-8626-5afeda04b73d
4,1242584820,d50a4b89-ff1f-4659-9fde-f76f8d5b3c89,,657cf27c-8f4a-4758-aaaa-bd5686d7e103,657cf27c-8f4a-4758-aaaa-bd5686d7e103
...,...,...,...,...,...
105041,1362539926,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,724801b2-8540-4619-8c90-2d3ad30275da,724801b2-8540-4619-8c90-2d3ad30275da
105042,1362540203,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,94c5fb28-66e1-49cf-8cdd-3d876683eef6,94c5fb28-66e1-49cf-8cdd-3d876683eef6
105043,1362540416,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,2de962ae-b1e0-46d1-80d8-25dfe55f87ff,2de962ae-b1e0-46d1-80d8-25dfe55f87ff
105044,1362540609,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,,


In [5]:
def process_df(df_input, keep_missing = clean_master_config.KEEP_MISSING, turn_blank = clean_master_config.TURN_BLANK):
    """Resolve the recording MBIDs of an input dataframe against the MusicBrainz tables.

    A ``recording_MBID_cleaned`` column is added to ``df_input`` in place, and the
    same dataframe is returned. The function is still work in progress: the artist
    and release steps are not written back to the frame yet.

    Relies on the notebook-level globals ``rec_gid_set``, ``MB_rec_redirects``,
    ``MB_rec_canonical`` and ``MB_artist_credit_list``.

    Args:
        df_input (pandas.DataFrame): input dataframe with columns: <timestamp, artist_MBID, release_MBID, recording_MBID>
        keep_missing (bool, optional): If True, keep rows with missing, unknown MBIDs to maintain the structure of the original data.
            Currently unused by the implementation.
        turn_blank (bool, optional): If True, replace blank MBIDs with None.
            Currently unused by the implementation.

    Returns:
        pandas.DataFrame: ``df_input`` itself, with the extra ``recording_MBID_cleaned`` column.
    """

    def _follow_redirect(mbid):
        # Steps 1 + 2: an MBID already present in rec_gid_set is kept as is;
        # anything else is looked up in the redirect table.
        if mbid in rec_gid_set:
            return mbid
        return io.replace(mbid, MB_rec_redirects, 'new')

    def _to_canonical(mbid):
        # Step 3: look the MBID up once in the canonical table and fall back to
        # the MBID itself when the lookup yields nan.
        # NOTE(review): the identity check against numpy's nan is kept from the
        # original; it presumes io.replace returns that exact object on a miss — confirm.
        canonical = io.replace(mbid, MB_rec_canonical, 'new')
        return mbid if canonical is nan else canonical

    df_input['recording_MBID_cleaned'] = df_input.recording_MBID.map(_follow_redirect)
    df_input['recording_MBID_cleaned'] = df_input['recording_MBID_cleaned'].map(_to_canonical)

    # 4. Find artist MBIDs for all cleaned/uncleaned recording_MBIDs -> (Find Artist MBIDs for all uncleaned mbids for statistics)
    # Operates on df_input (not the notebook-global `df`).
    # TODO: the lookup result is not written back to the frame yet.
    artist_credit_gids = df_input.recording_MBID.map(
        lambda x: io.replace(x, MB_artist_credit_list, 'artist_credit_gid'))

    # 5. Find release MBIDs for respective artist MBIDs (TODO: not implemented)

    return df_input

In [8]:
%%time

print(df.recording_MBID[df.recording_MBID.apply(lambda x: x in rec_gid_set)].map(lambda x: io.replace(x, MB_artist_credit_list, 'artist_credit_gid')).isna().value_counts())
print(df.recording_MBID[-df.recording_MBID.apply(lambda x: x in rec_gid_set)].map(lambda x: io.replace(x, MB_artist_credit_list, 'artist_credit_gid')).isna().value_counts())

CPU times: user 546 ms, sys: 569 µs, total: 547 ms
Wall time: 555 ms


In [6]:
# Inspect the redirect table: indexed by the 'old' MBID, with the replacement in column 'new'.
MB_rec_redirects

Unnamed: 0_level_0,new
old,Unnamed: 1_level_1
c4793e5b-5825-4221-893d-8b3776289127,dfbefafa-a7dc-4024-8a28-be15537591e9
74324461-d321-4197-82f8-fdb41471ea8b,69102b65-98cd-46aa-a757-b24890c0e030
d6f71677-ad41-46a1-87bc-5f00c702eb8a,c876dbfd-159a-4d83-bfbb-fd5438c9db65
b1b7370c-f7d8-4614-b41e-89d011e0108c,2457840d-75d6-4b2b-87de-ea6a0d02e9cf
80f99633-339d-4ffd-9668-a81831dda379,ee141eff-5b22-4e8b-b01a-754b4d0e5222
...,...
3d532f99-20fd-476c-9f9e-757b9d3153d7,b988fddd-beca-4815-9c86-9207c4e3ab41
563382da-a6f8-4c78-956a-39bf6b2cd507,a2e7c584-7c59-40c8-99ac-ea684e615f10
76b0db25-612a-496e-b800-608ff0a8c4de,33e506c2-f79d-493d-a7e0-619d58bff8c0
0f3a284b-72ef-4dfc-8a9b-6cb926c46efe,f6744c92-5fb2-4b1e-9f3d-07568ffa3514
