In [1]:
# Essential Imports
from lib import io_ as io
from time import monotonic
from rich.progress import track
from numpy import nan
import pandas as pd
import config
import clean_master_config as cmc

# For pretty CLI
from rich import print
from rich.console import Console
console = Console()
console.clear()

# Getting started

In [4]:
io.generate_folders()   # Generate folders to write all outputs (as specified in config.py)
master_start = monotonic()  # Start a timer

OUTPUT_LOG = {} # Log discrepancies
TIME_LOGS = {}  # Logs time taken for each step

# Loading ENV variables
console.log("Loading ENV variables...")
# ENV = io.get_config()

MLHD_ROOT = config.MLHD_ROOT
WRITE_ROOT = config.WRITE_ROOT
LOG_WRITE_PATH = config.LOG_WRITE_PATH
LOG_EPOCH = config.LOG_EPOCH

# 1 Dimensional list of MLHD file paths
console.log("Generating MLHD Paths...")
MLHD_PATHS = io.generate_paths(MLHD_ROOT)

In [5]:
# %%time
### LOADING MB TABLES ###

TIME_LOGS['MB_start'] = monotonic()

console.log('loading recording gids...')
MB_rec_gid = pd.read_parquet('warehouse/MB_tables/recording_gid.parquet')
MB_rec_gid.set_index('gid', inplace=True)

console.log('loading recording redirects...')
MB_rec_redirects = pd.read_parquet('warehouse/MB_tables/recording_redirects.parquet')
MB_rec_redirects.set_index('old', inplace=True)

console.log('loading recording canonical MBIDs...')
MB_rec_canonical = pd.read_parquet('warehouse/MB_tables/recording_canonical.parquet')
MB_rec_canonical.set_index('old', inplace=True)

console.log('loading artist credit gids...')
MB_artist_credit_list = pd.read_parquet('warehouse/MB_tables/artist_credit_release_gid.parquet')
MB_artist_credit_list.set_index('recording_mbid', inplace=True)
MB_artist_credit_list['artist_mbids'] = MB_artist_credit_list.artist_mbids.map(lambda x: x.strip('{}'))

# Converting MB_rec_gid to set for faster lookup
rec_gid_set = set(MB_rec_gid.index)

TIME_LOGS['MB_end'] = monotonic()
console.log("loaded MB tables. Took {} seconds".format(round(TIME_LOGS['MB_end'] - TIME_LOGS['MB_start'], 2)))

# Testing Functions with Sample Data

In [7]:
# %%time
df = io.load_path(MLHD_PATHS[50])

In [8]:
# lower level functions for process_df
def check_in_rec(mbid):
    if mbid in rec_gid_set:
        return mbid
    else:
        return nan

def find_redirect(mbid):
    # return io.replace(mbid, MB_rec_redirects, 'new')
    try:
        return MB_rec_redirects.at[mbid, 'new']
    except KeyError:
        return nan

def find_canonical(mbid):
    try:
        return MB_rec_canonical.at[mbid, 'new']
    except KeyError:
        return nan

def find_artist_release(mbid, input_df):
    try:
        return tuple(io.replace(mbid, MB_artist_credit_list, col_name) for col_name in input_df.columns)
    except KeyError:
        return tuple(None for col_name in input_df.columns)

def process_df(df_input, keep_missing = cmc.KEEP_MISSING, turn_blank = cmc.TURN_BLANK):
    """Take an input df and process it into a cleaned df

    Args:
        df_input (pandas.DataFrame): input dataframe with columns: <timestamp, artist_MBID, release_MBID, recording_MBID>
        keep_missing (bool, optional): If True, keep rows with missing, unknown MBIDs to maintain the structure of the original data.
        turn_blank (bool, optional): If True, replace blank MBIDs with None

    Returns:
        pandas.DataFrame: Cleaned dataframe with columns: <timestamp, artist_MBID, release_MBID, recording_MBID>
    """

    # # 1. Get redirects for MBIDs that aren't present in rec_gid_set using MB_rec_redirects.
    # df_input['recording_MBID'] = df_input.recording_MBID.map(
    #     lambda x: io.replace(x, MB_rec_redirects, 'new') 
    #     if x not in rec_gid_set else x)
    
    # # 2. Find canonical recordings for all cleaned/uncleaned recording_MBIDs
    # df_input['recording_MBID'] = df_input['recording_MBID'].map(
    #     lambda x: io.replace(x, MB_rec_canonical, 'new')
    #     if io.replace(x, MB_rec_canonical, 'new') is not nan else x)

    # # 3. Fetch artist, release_MBIDs for all recording_MBIDs
    # artist_release_mbids = df_input['recording_MBID'].map(
    #     lambda x: io.replace_multi(x, MB_artist_credit_list))
    
    # df_input[['artist_MBID', 'release_MBID']] = pd.DataFrame(
    #     artist_release_mbids.tolist(), 
    #     columns = ['artist_MBID', 'release_MBID'], 
    #     index=df_input.index)

    

    return df_input

TIME_LOGS['process_test_start'] = monotonic()
processed = process_df(df)
TIME_LOGS['process_test_end'] = monotonic()

console.log(f"Processed df. Took {round(TIME_LOGS['process_test_end'] - TIME_LOGS['process_test_start'], 2)} s")

In [16]:
processed

Unnamed: 0,timestamp,artist_MBID,release_MBID,recording_MBID
0,1242583679,ba85753c-671a-409e-b813-1e3be41e2a2e,f0b4ab69-d604-4fb6-a667-cb9d5c37fc07,a0456176-7a99-4773-adba-f61d6793785d
1,1242583969,f660d7e2-a3bd-4456-a7be-86ec139c1016,98628c8e-366e-4f56-8b4b-804383ea0ec1,6afc8617-d545-4629-8151-9ee9d71c4742
2,1242584191,,,
3,1242584530,98fb7792-01fa-4ed1-a15d-20077a47210f,9eff9026-18df-4f3c-bccb-a6933109d38e,1fdcf214-b4d5-4490-8626-5afeda04b73d
4,1242584820,d50a4b89-ff1f-4659-9fde-f76f8d5b3c89,a87d64df-77d5-452e-bb7a-8186a733c302,657cf27c-8f4a-4758-aaaa-bd5686d7e103
...,...,...,...,...
105041,1362539926,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,c8bd591d-f2e4-443b-9ecb-cac8cda277ce
105042,1362540203,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,e38b7e10-a263-49d2-a854-b0f263fad744
105043,1362540416,e61ce7e2-f78d-4fee-ac56-ef1a784c87fa,331c66e6-c230-455b-b84c-3a81d24b55fb,2de962ae-b1e0-46d1-80d8-25dfe55f87ff
105044,1362540609,,,


In [6]:
df = io.load_path(MLHD_PATHS[50]) # Loading the table again, since the previous one was modified by process_df

In [7]:
# Some Stats/tests

print(
    "% Coverage of artist MBIDs: w/ cleaned recording_MBID:",
    round(processed.artist_MBID.isna().value_counts()[0]/processed.artist_MBID.isna().value_counts().sum(), 2),
    "\n"
    "% Coverage of artist MBIDs:",
    round(df.artist_MBID.isna().value_counts()[0]/df.artist_MBID.isna().value_counts().sum(), 2)
    )

print(
    "% Coverage of release MBIDs: cleaned recording_MBID:",
    round(processed.release_MBID.isna().value_counts()[0]/processed.release_MBID.isna().value_counts().sum(), 2),
    "\n"
    "% Coverage of release MBIDs:",
    round(df.release_MBID.isna().value_counts()[0]/df.release_MBID.isna().value_counts().sum(), 2)
    )

In [8]:
print("% valid artist_MBIDs", (processed.artist_MBID.isna().value_counts()[0]))
print("% valid release_MBIDs", (processed.release_MBID.isna().value_counts()[0]))
print("% valid recording_MBIDs", (processed.recording_MBID.isna().value_counts()[0]))

In [86]:
# Processing time for unique rows.

TIME_LOGS['process_unique_start'] = monotonic()

df_unique = df.drop_duplicates(subset = ['recording_MBID', 'artist_MBID', 'release_MBID'])
processed_unique = process_df(df_unique)

TIME_LOGS['process_unique_end'] = monotonic()

console.log(f"Processed df. Took {round(TIME_LOGS['process_unique_end'] - TIME_LOGS['process_unique_start'], 2)} s")
processed_unique

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
# Some Stats/tests for UNIQUE recording_MBIDs

print(
    "% Coverage of artist MBIDs: w/ UNIQUE cleaned recording_MBID:",
    round(processed_unique.artist_MBID.isna().value_counts()[0]/processed_unique.artist_MBID.isna().value_counts().sum(), 6),
    "\n"
    "% Coverage of artist MBIDs:",
    round(df_unique.artist_MBID.isna().value_counts()[0]/df_unique.artist_MBID.isna().value_counts().sum(), 6)
    )

print(
    "% Coverage of release MBIDs: w/ UNIQUE cleaned recording_MBID:",
    round(processed_unique.release_MBID.isna().value_counts()[0]/processed_unique.release_MBID.isna().value_counts().sum(), 6),
    "\n"
    "% Coverage of release MBIDs:",
    round(df_unique.release_MBID.isna().value_counts()[0]/df_unique.release_MBID.isna().value_counts().sum(), 6)
    )

print(
    "% Coverage of recording MBIDs: w/ UNIQUE cleaned recording_MBID:",
    round(processed_unique.recording_MBID.isna().value_counts()[0]/processed_unique.recording_MBID.isna().value_counts().sum(), 6),
    "\n"
    "% Coverage of release MBIDs:",
    round(df_unique.recording_MBID.isna().value_counts()[0]/df_unique.recording_MBID.isna().value_counts().sum(), 6)
    )

## Vectorization

In [73]:
from numba import njit, jit
from numba import vectorize as vectorize_numba

from numpy import vectorize as vectorize_numpy 
# From NumPy documentation: The vectorize function is provided primarily for convenience, not for performance. The implementation is essentially a for loop.

In [74]:
def check_in_rec(mbid):
    if mbid in rec_gid_set:
        return mbid
    else:
        return nan

check_in_rec_vectorized = vectorize_numpy(check_in_rec, otypes = [str])

In [59]:
%%time
check_in_rec_vectorized(df['recording_MBID'])

CPU times: user 54.2 ms, sys: 471 µs, total: 54.6 ms
Wall time: 52.6 ms


array(['nan', '6afc8617-d545-4629-8151-9ee9d71c4742', 'nan', ...,
       '2de962ae-b1e0-46d1-80d8-25dfe55f87ff', 'nan', 'nan'], dtype='<U36')

In [60]:
%%time
df['recording_MBID'].map(check_in_rec)

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 29.7 ms


0                                          NaN
1         6afc8617-d545-4629-8151-9ee9d71c4742
2                                          NaN
3         1fdcf214-b4d5-4490-8626-5afeda04b73d
4         657cf27c-8f4a-4758-aaaa-bd5686d7e103
                          ...                 
105041    724801b2-8540-4619-8c90-2d3ad30275da
105042    94c5fb28-66e1-49cf-8cdd-3d876683eef6
105043    2de962ae-b1e0-46d1-80d8-25dfe55f87ff
105044                                     NaN
105045                                     NaN
Name: recording_MBID, Length: 105046, dtype: object

Apparently numpy.vectorize() function is actually slower in this case. :|

Let's try with Numba this time.

In [75]:
@jit
def check_in_rec_vectorized_numba(mbid):
    if mbid in rec_gid_set:
        return mbid
    else:
        return nan

In [82]:
%%time
try
    check_in_rec_vectorized_numba(df['recording_MBID'].values)
except:
    print("Some damn error that I can't seem to figure out.")

CPU times: user 5.18 ms, sys: 272 µs, total: 5.45 ms
Wall time: 4.29 ms


Probably the best option here is to intelligently use faster inbuild pandas methods and avoid the map/apply functions altogether.

OR let's try other libraries like Modin or Dask, etc.

In [85]:
import modin.pandas as mpd

In [None]:
def check_in_rec(mbid):
    if mbid in rec_gid_set:
        return mbid
    else:
        return nan

# def get_redir():

# Implementation

In [None]:
def driver(
    path_list, 
    keep_missing = clean_master_config.KEEP_MISSING, 
    turn_blank = clean_master_config.TURN_BLANK, 
    write_epochs = clean_master_config.WRITE_EPOCHS):

    """Driver function to clean all the tables in the path_list

    Args:
        path_list (list): List of paths to the tables to be cleaned
        keep_missing (bool, optional): If True, keep rows with missing, unknown MBIDs to maintain the structure of the original data.
        turn_blank (bool, optional): If True, replace blank MBIDs with None

    Returns:
        list: List of cleaned dataframes
    """

    cleaned_dfs = []
    for path in path_list:
        df = io.load_path(path)
        cleaned_dfs.append(process_df(df, keep_missing, turn_blank))

    return cleaned_dfs