In [3]:
import psycopg2
import pandas as pd
import numpy as np

In [4]:
# Open the PostgreSQL connection used by every query in this notebook.
# Equivalent shell command: psql -h localhost -p 25432 -U musicbrainz musicbrainz_db
# NOTE(review): the psql example above uses port 25432 while connect() below uses 5432 — confirm which is intended.

db_params = dict(
    host="localhost",
    database="musicbrainz_db",
    user="musicbrainz",
    port=5432,
)
conn = psycopg2.connect(**db_params)

cursor = conn.cursor()

# Smoke test: fetch one row from `recording` and touch its first field,
# so a broken connection or an empty table fails here rather than later.
cursor.execute("select * from recording limit 1;")
cursor.fetchone()[0]
print("connection successful")

connection successful


In [5]:
%%time
# Loading recording table SQL tables

MB_recording = pd.read_sql('SELECT gid FROM recording', con = conn, columns = ['rec-gid'])
# MB_recording.set_index('gid', inplace = True)
MB_recording.head()



CPU times: user 7.06 s, sys: 2.43 s, total: 9.49 s
Wall time: 18.8 s


Unnamed: 0,gid
0,0f42ab32-22cd-4dcf-927b-a8d9a183d68b
1,4dce8f93-45ee-4573-8558-8cd321256233
2,48fabe3f-0fbd-4145-a917-83d164d6386f
3,b30b9943-9100-4d84-9ad2-69859ea88fbb
4,b55f1db3-c6d2-4645-b908-03e1017a99c2


In [6]:
%%time
# Loading redirect table SQL table

MB_redirects = pd.read_sql('select r.gid, rgr.gid from recording r join recording_gid_redirect rgr on rgr.new_id = r.id', con = conn)
MB_redirects.columns = ['old-rec-gid', 'new-rec-gid']
MB_redirects.set_index('old-rec-gid', inplace=True)
MB_redirects.head()



CPU times: user 1.32 s, sys: 150 ms, total: 1.47 s
Wall time: 6.93 s


Unnamed: 0_level_0,new-rec-gid
old-rec-gid,Unnamed: 1_level_1
ddda2877-0fbd-495e-a19b-6e9f4e97d711,8edeb408-437d-4e97-93f8-2fb982927fb0
690e5d24-c3ad-4c1f-b8bf-a6e4723d570f,7cc3640b-4939-4b42-9bac-637f90715cf2
9988c354-37ad-4325-bbeb-2a84c3e9f7de,f941f456-33ba-4f58-8584-07475981fc0b
6db9ab2f-1bb7-42bb-a1cf-62935e906c2a,f2b7197a-d85b-4d26-ae51-2f0826ac69c7
3fe033bd-f856-448d-a0dc-cdbf84bcc04d,bf1c5a09-32b9-4e2c-be0f-0cc4b22515ad


In [7]:
%%time
# Loading track table

MB_track = pd.read_sql('SELECT gid FROM track', con = conn)
MB_track.columns = ['track-gid']
MB_track.set_index('track-gid', inplace=True)
MB_track.head()



CPU times: user 9.59 s, sys: 3.23 s, total: 12.8 s
Wall time: 25.6 s


9b02977e-a03b-4a6b-a9a9-06e722bdcd7a
43da7544-6283-3159-84f9-537fe823a1a7
0b6b6283-a5a8-4560-9fa8-f68a430d86ea
fa124f9a-d8ea-36a3-bed3-c817fdbe13e2
e56c6d3c-09cf-33a0-81c5-ceade77c35dc


In [8]:

%%time
# Loading canonical recording mbid table

MB_canonical = pd.read_sql('SELECT recording_mbid, canonical_recording_mbid FROM mapping.canonical_recording_redirect', con = conn)
MB_canonical.columns = ['recording-gid', 'canonical-recording-gid']
MB_canonical.set_index('recording-gid', inplace=True)
MB_canonical.head()



CPU times: user 2.29 s, sys: 746 ms, total: 3.03 s
Wall time: 6 s


Unnamed: 0_level_0,canonical-recording-gid
recording-gid,Unnamed: 1_level_1
6ac02452-ee12-4f86-b389-bd20ba2fefcf,3e8eebfd-7613-4b3d-acbe-41709be76618
b4c26989-1b9e-4d50-8cde-56d6472e4bc3,3e8eebfd-7613-4b3d-acbe-41709be76618
601e1cf3-ad6c-4e38-9128-ba4d0d4b010f,b1050d12-b8af-409c-9cff-22759d93e240
35c4d840-e51f-4c07-9418-af9335b29642,f4680747-bf28-417a-ab33-af00577d8ac2
9ba7a9b9-a21c-4b12-8771-4c108b08b3e2,13b3875a-c89a-4be5-a6e4-0ca9164bc41d


# Loading Data

In [1]:
# Reads a list of file paths and reads + compiles data into a single pd.DataFrame
def read_files(file_path_repo):
    """Load every MLHD file listed in ``file_path_repo`` into one DataFrame.

    Parameters
    ----------
    file_path_repo : str or path-like
        Text file with one data-file path per line. Surrounding whitespace is
        stripped and blank lines are skipped. Each data file is read as
        tab-separated text without a header row, with the columns
        ``timestamp``, ``artist-MBID``, ``release-MBID``, ``recording-MBID``.

    Returns
    -------
    pd.DataFrame
        Rows of all files in listing order, excluding rows whose
        ``recording-MBID`` is missing. Each file's own row labels are kept
        (the index is not reset, so labels can repeat across files). If no
        file is listed, an empty frame with the four columns is returned.
    """
    columns = ['timestamp', 'artist-MBID', 'release-MBID', 'recording-MBID']

    # Open a file with MLHD file paths to process
    with open(file_path_repo, 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]

    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies all earlier rows each time.
    frames = []
    for pth in file_paths:
        temp = pd.read_csv(pth, sep='\t', names=columns)
        frames.append(temp[temp['recording-MBID'].notna()])

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames)

## Architecture:

1. Take a chunk of MBIDs (Test optimal chunk sizes too. Current optimal > 253k rows)
2. "Squish" series
    - i.e. Only take unique values from the series.
3. Pass squished series through the following:
    - Get mbids, and check if they exist in the recording table.
    - Get mbids that don't exist in redirect, and pass it through MB_track
    - Get mbids that don't exist in recording, and pass it through MB_redirect
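    - Get mbids that don't exist in track.... So far no MBIDs have been found that belong to track (this result should be re-checked against the track gids stored in `MB_track.index`).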
4. "Unsquish" the series.
    - i.e. Take processed output for squished values, and apply them to unsquished values.
    - This process ensures processing only on unique values.
    The output for this processing is then applied to duplicate values as well.

In [34]:
def query_in(series_to_query, series_to_query_in):
    """Flag which values of one series occur in another.

    Both arguments are used through ``.isin`` and boolean-mask indexing; the
    callers in this notebook pass a pandas Series as ``series_to_query`` and a
    Series or Index as ``series_to_query_in``.

    Returns the result of ``series_to_query.isin(...)``: one boolean per
    element of ``series_to_query``, True where that value is present in
    ``series_to_query_in``.
    """
    # First narrow the lookup collection to the values that were asked about ...
    hit_mask = series_to_query_in.isin(series_to_query)
    matched_values = series_to_query_in[hit_mask]

    # ... then mark every queried value that survived the narrowing.
    return series_to_query.isin(matched_values)

In [54]:
'''Squish function: 
1. Takes in input series with index number and recording-MBID.
2. Makes a mapping table with recording-MBIDs as the index, 
and a series of row-indices with that MBID as the values.'''

# def squish(input_series):

'''
1. take inp_series
2. Generate empty mapping_df where:
    - index = MBID
    - value = series of indices from inp_series
2. start traversing
3. if new ID: 
    - Add ID to mapping_df.index
    - Set the value in mapping_df as a list of newly updated indices for inp_series
4. if not new ID:
    - Add ID to list of indices for inp_series.
    - Update this ID in mapping_df.index
'''

# inp = df['recording-MBID'].reset_index(drop=True)
# inp_unique = inp.unique()

# mapping_df = pd.DataFrame(index=inp_unique)
# mapping_df

# TODO: implement squish/unsquish later. Focus on the basic unoptimized code first!

In [56]:
%%time

# Just to prove that absolutely none of the of the 253k unique mbids are track-mbids disguised as rec-mbid.

# in_for_tack = in_for_rec[-out_for_rec]
in_for_track = pd.Series(df['recording-MBID'].unique())
out_for_track = query_in(in_for_track, MB_track)

out_for_track.value_counts()

CPU times: user 6.38 s, sys: 1.4 ms, total: 6.38 s
Wall time: 6.36 s


False    253218
dtype: int64

In [57]:
%%time
# Testing query_in() with recording MBID

in_for_rec = pd.Series(df['recording-MBID'].unique())
out_for_rec = query_in(in_for_rec, MB_recording.gid)

out_for_rec.value_counts()

CPU times: user 4.06 s, sys: 6.45 ms, total: 4.07 s
Wall time: 4.05 s


True     197481
False     55737
dtype: int64

In [33]:
%%time

in_for_redir = in_for_rec[out_for_rec]
# in_for_redir = in_for_rec

out_for_redir = query_in(in_for_redir, MB_redirects.index)
out_for_redir.value_counts()

CPU times: user 399 ms, sys: 0 ns, total: 399 ms
Wall time: 395 ms


False    1091721
True      335678
dtype: int64

In [None]:
# Replacing MBIDs with their redirected versions. Testing these "sure shot rec MBIDs" for canonicality

# final_rec_mbids = 