In [1]:
import lib.io_ as io
import lib.mapper_helper as mapper_helper
import pandas as pd
from time import monotonic

from rich import print
from rich.console import Console

In [2]:
console = Console()

time_logs = {}
flag_cache = False

In [3]:
time_logs['load_start'] = monotonic()
console.log('loading MLHD files...')

### LOADING MB TABLES ###

time_logs['MB_start'] = monotonic()

console.log('loading recording gids...')
MB_rec_gid = pd.read_parquet('warehouse/MB_tables/recording_gid.parquet')
MB_rec_gid.set_index('gid', inplace=True)

console.log('loading recording redirects...')
MB_rec_redirects = pd.read_parquet('warehouse/MB_tables/recording_redirects.parquet')
MB_rec_redirects.set_index('old', inplace=True)

console.log('loading recording canonical MBIDs...')
MB_rec_canonical = pd.read_parquet('warehouse/MB_tables/recording_canonical.parquet')
MB_rec_canonical.set_index('old', inplace=True)

console.log('loading artist credit list...')
MB_artist_credit_list = pd.read_parquet('warehouse/MB_tables/artist_credit.parquet')
MB_artist_credit_list.set_index('rec_gid', inplace=True)

# Converting MB_rec_gid to set for faster lookup
rec_gid_set = set(MB_rec_gid.index)

time_logs['MB_end'] = monotonic()
console.log("loaded MB tables. Took {} seconds".format(round(time_logs['MB_end'] - time_logs['MB_start'], 2)))

In [4]:
### Loading MLHD ###
time_logs['load_start'] = monotonic()

df = io.load_path_file('warehouse/samples/random_file_paths.txt', drop_subset = ['recording_MBID', 'artist_MBID'])
df.drop(['release_MBID'], axis = 1, inplace = True)
df.drop_duplicates(['artist_MBID', 'recording_MBID'], inplace = True)
df.rename({'artist_MBID': 'mlhd_artist_mbid', 'recording_MBID': 'mlhd_recording_mbid'}, inplace=True, axis=1)
df.reset_index(inplace=True, drop=True)

time_logs['load_end'] = monotonic()
console.log("loaded MLHD files with {} rows. Took {} seconds".format(df.shape[0], round(time_logs['load_end'] - time_logs['load_start'], 2)))

### Clean Up ###
console.log('Starting Cleanup...')
time_logs['clean_start'] = monotonic()

shape_before = df.shape[0]
df1 = mapper_helper.clean_rec(df, rec_gid_set, MB_rec_redirects, MB_rec_canonical, MB_artist_credit_list)
shape_after = df.shape[0]


df.to_csv('warehouse/mapper_outputs/mlhd_artist_credits.csv', index=False)
console.log(f"Dropped {shape_before - shape_after} rows")


time_logs['clean_end'] = monotonic()

console.log("Cleaned {} rows. Took {} seconds".format(
    df.shape[0], 
    round(time_logs['clean_end'] - time_logs['clean_start'], 2))
)

In [5]:
time_logs['mapper_start'] = monotonic()
num_rows = df.shape[0]

console.log("Mapping MBIDs...")

console.log('loading mbc_table...')
mbc_table = pd.read_parquet('warehouse/MB_tables/mbc_combined.parquet')
mbc_table.set_index('combined_lookup', inplace=True)

mapped_output = mapper_helper.mapper_mbc(df.iloc[:num_rows, :], mbc_table)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input['received_rec_mbid'] = cleaned.map(lambda value: io.replace(value, mbc_table, 'recording_mbid'))


In [6]:
console.print("\nRows with different mlhd_canonical_mbids and received_recording_mbids:")
console.print(mapped_output.received_rec_mbid.compare(mapped_output.mlhd_canonical_mbid))

In [7]:
subset = df.iloc[:num_rows, :]

In [8]:
from unidecode import unidecode
import re
def lookup(row):
    return unidecode(re.sub(r'[^\w]+', '', row["artist_credit"] + row["rec_name"]).lower())

df['combined_lookup'] = df.apply(lookup, axis=1)

In [9]:
subset

Unnamed: 0,timestamp,mlhd_artist_mbid,mlhd_recording_mbid,mlhd_canonical_mbid,rec_name,artist_credit
0,1108412731,f4a31f0a-51dd-4fa7-986d-3095c40c5ed9,1deb956c-5439-4fbb-b026-5adb4330a934,1deb956c-5439-4fbb-b026-5adb4330a934,Anywhere,Evanescence
1,1108422818,db999c3f-f243-4a5f-88d6-0c25243b6661,14e9eb4e-155d-46ff-9a83-a8d5e1936c81,b0e80c50-6bca-415c-9515-1b4983329b80,Lady Lady,Mark Joseph
2,1108423325,ce58d854-7430-4231-aa44-97f0144b3372,eced9a9b-cd59-40f8-a580-f27094bd8a89,6ae8e64a-d208-4f4e-8d88-155ed0568344,Building a Mystery,Sarah McLachlan
3,1108594566,e6e879c0-3d56-4f12-b3c5-3ce459661a8e,0a8e9fce-b54c-45dd-8081-4aaa654ef4ec,da1308d1-6037-4ba1-b6fc-a643a4201140,Hallelujah,Jeff Buckley
4,1108679759,3c0a0074-4f26-4d3b-b723-a66bf6cc3753,06a5b648-0950-4892-a975-715291a5de6f,a2a800b4-4969-4bc6-88e9-6928b973890c,Penny & Me,Hanson
...,...,...,...,...,...,...
376032,1346409472,32f59126-2a1c-47c5-9076-64826c83a393,11c72720-977a-4cdb-ae6f-16d921023289,4c52692f-eebc-4d00-9d5a-bc69c4eafbfe,The Waves Crashing Silently Through the Domina...,Calexico
376033,1346409883,32f59126-2a1c-47c5-9076-64826c83a393,065212d3-88db-480c-bee2-05dc2653928c,59b1aade-c2d4-45c5-97ee-2dc34718872b,Fine Patina,Calexico
376034,1346500026,f7e7acc2-b61c-4e8c-80fd-ab354bf856e8,35092b28-1349-4c9a-9539-26d82e58f57d,f9f2601a-298a-4cfd-9b8a-b9e1c8118d5f,Somebody to Love,Jim Carrey
376035,1350727034,11ae9fbb-f3d7-4a47-936f-4c0a04d3b3b5,9e952c1c-7748-4899-bf1a-8ebe7b9f97b0,b1600489-05c3-4d71-80b8-b5558563df10,You’re Pretty Good Looking,The White Stripes


In [10]:
mbc_table

Unnamed: 0_level_0,recording_mbid
combined_lookup,Unnamed: 1_level_1
variousartistsimpersonations,1f104e8a-16b4-4e0d-bce0-16c869d6e739
variousartistspotpourrisega,7b1f193b-c9ba-48d6-bd6f-8afd02d489a2
variousartistspannoniadicserete,5b717dfc-365b-431b-83b2-97c31220c69e
variousartiststhemetimeradiohourfriendsneighbors,fc0b75c2-0e15-4348-8be6-94892d291031
variousartistseintollertagplaybackversion,e9fc070c-9c20-4ce6-adae-f23530c1487f
...,...
mermaidchunkyfriends,a67d8682-c957-43e8-8fd3-01280971ad3b
mermaidchunkythesegirls,bf9897b0-eeea-493e-a32e-47c6bb8799d7
mermaidchunkykingoftheherbs,e85e82cf-69e9-45fc-be84-86b940b91e14
psikedeliahbrainjamsiliconfield,ddb60b5b-4aeb-47fa-8032-63adac1d6926


In [11]:
df = df.join(mbc_table, on='combined_lookup')

In [12]:
mismatch = df.loc[df.mlhd_canonical_mbid != df.recording_mbid]

In [13]:
from lib.mb import get_table
standalone = get_table("""
select r.gid
from recording r
left join track t
on r.id = t.recording
where t.recording IS NULL;
""")
standalone

Unnamed: 0,gid
0,e2523c75-df45-46cb-8c6e-a9b546f81fcd
1,ed1b18cd-94e2-4d49-a182-2ff4a48bc4af
2,237cc410-5462-45af-aaad-96b65ef3b7bf
3,1b9361c5-98d0-4f57-8b17-e19da3d63849
4,8b99d972-8e74-4835-9200-e0ac6b47cbd0
...,...
93359,51a1673b-b0e4-49e5-a534-d5a17a7690c0
93360,d60b7cf4-f33a-4e10-bfc9-2f1d98881835
93361,489332a3-340e-491f-8d52-d982b5e4823b
93362,e4d9c463-e17e-4305-ac2b-94857432ecd5


In [14]:
filtered = mismatch.merge(standalone, how='left', indicator=True, left_on='mlhd_canonical_mbid', right_on='gid')

In [15]:
df_test = filtered.loc[filtered._merge != 'both']
df_test.head()

Unnamed: 0,timestamp,mlhd_artist_mbid,mlhd_recording_mbid,mlhd_canonical_mbid,rec_name,artist_credit,combined_lookup,recording_mbid,gid,_merge
1,1177279852,9a04dc8c-82f1-457a-a25a-3ff19e1b471e,3d316228-aeb1-4e13-9d86-104585ce7a15,7de95846-a918-4536-b453-d2ea41cc5ac0,Rose rouge,St. Germain,stgermainroserouge,57072609-4bbc-4d0f-b1f4-64fb54382451,,left_only
10,1272794676,859d4c71-8baf-4987-91ac-a138c9bba81f,d9bceb2e-773b-4c23-82ed-8b7a9e912207,bce05509-abd0-4ac7-8074-1fa55ed3a872,Higher Than You,Gonzales,gonzaleshigherthanyou,36cda267-8280-48a1-93d6-eaa1ee0cb275,,left_only
16,1221745477,b9472588-93f3-4922-a1a2-74082cdf9ce8,6c52335a-ca8e-4776-85a8-017143383d2c,dd8acb2a-158e-4e6b-befc-b349e01d3c28,Mad as Rabbits,Panic! at the Disco,panicatthediscomadasrabbits,a5f6645d-65ec-4b6f-ba13-615f6a8c1d21,,left_only
24,1197725149,b392b9e8-f96b-4b7b-9e02-6dc0e7dff6fc,843ad572-6135-4976-9038-7c331f11a5d8,0390c46e-6af0-41ba-95b4-9ea51778c941,Анти герой,Azis,azisantigeroi,820a0710-f2da-4354-aacf-1241bb59f0c2,,left_only
25,1199640212,98775867-3fef-4d2c-aea7-115c7e58326a,4a62aeae-cc88-4f2a-87ed-56cc08973479,4bad8718-13f9-4a4c-a97b-c8177a8d3bbc,Malevolent Landscape,Patrick O’Hearn,patrickohearnmalevolentlandscape,44383d44-23bf-44cd-987d-332f0b441bbd,,left_only


In [16]:
df_test['received_rec_mbid'] = [() for i in range(df_test.shape[0])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['received_rec_mbid'] = [() for i in range(df_test.shape[0])]


In [17]:
import lib.mapper_helper as mapper_helper

In [22]:
mapper_helper.write_html(base_path="/home/snaek/public_html" , df=df_test, suffix = 'test')

'https://wolf.metabrainz.org/~snaek/mlhd-lookup-22-09-17-test.html'

In [20]:
import config