In [49]:
import psycopg2
import pandas as pd
import numpy as np

In [2]:
# Open the PostgreSQL connection that the pd.read_sql cells pass as `con`.
# Equivalent CLI: psql -h localhost -p 25432 -U musicbrainz musicbrainz_db
# NOTE(review): the CLI hint above uses port 25432 but the code connects on
# 5432 -- confirm which one is current.
conn = psycopg2.connect(host="localhost", port=5432, user="musicbrainz", database="musicbrainz_db")
cursor = conn.cursor()

# Smoke test: fetch one row from `recording`; indexing the fetched row raises
# if nothing came back, so the success message only prints on a working setup.
cursor.execute("select * from recording limit 1;")
cursor.fetchone()[0]
print("connection successful")

connection successful


In [3]:
def read_files(file_path_repo):
    """Load every MLHD file listed in ``file_path_repo`` into one DataFrame.

    Parameters
    ----------
    file_path_repo : str or path-like
        Text file holding one data-file path per line. Surrounding whitespace
        is stripped and blank lines are skipped.

    Returns
    -------
    pd.DataFrame
        Columns ``timestamp``, ``artist-MBID``, ``release-MBID`` and
        ``recording-MBID``. Rows without a ``recording-MBID`` are dropped.
        Row labels are kept exactly as read from each file, so index values
        repeat across files. If no paths are listed, an empty frame with the
        same columns is returned.
    """
    columns = ['timestamp', 'artist-MBID', 'release-MBID', 'recording-MBID']

    # Collect the paths, ignoring blank lines (e.g. a trailing newline block)
    # that would otherwise be handed to read_csv as an empty path.
    with open(file_path_repo, 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]

    # Gather the per-file frames and concatenate once at the end: calling
    # pd.concat inside the loop re-copies all accumulated rows on every
    # iteration, and seeding it with an empty object-dtype frame can turn the
    # integer timestamp column into object dtype.
    frames = []
    for pth in file_paths:
        temp = pd.read_csv(pth, sep='\t', names=columns)
        frames.append(temp[temp['recording-MBID'].notna()])

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames)

In [4]:
%%time
df = read_files('random_file_paths.txt')
df.head()

CPU times: user 638 ms, sys: 75.7 ms, total: 713 ms
Wall time: 712 ms


Unnamed: 0,timestamp,artist-MBID,release-MBID,recording-MBID
0,1166994688,a49b5d41-7399-49ef-bcb8-a5779a30d2e9,af158d94-eb6f-4f1f-a8a2-f685b60b132e,317bdd0e-8956-4547-b8f0-fd233d10d378
1,1166995444,bbc5b66b-d037-4f26-aecf-0b129e7f876a,075cef57-9350-41a9-9a53-4391f013e164,279f9f44-abfc-406e-a686-92961577bee9
2,1167000369,a49b5d41-7399-49ef-bcb8-a5779a30d2e9,58c90879-d279-41c3-9ecc-b686ae75bc78,73eadf15-ebc0-40cd-a71c-30dce9d82d6b
3,1167000642,34e10b51-b5c6-4bc1-b70e-f05f141eda1e,2988a7b9-0bb1-4eaf-98ee-34eddf2e5740,08c272e6-02fe-4fb0-930c-2f342dd7d092
4,1167001189,b665b768-0d83-4363-950c-31ed39317c15,0d14ea49-7f2d-4c38-9986-cee6846a5bb5,254a6a58-d097-4307-ba1e-1fc15c8ac70e


In [112]:
%%time
# Loading Relevant SQL tables

MB_recording = pd.read_sql('SELECT gid FROM recording', con = conn, columns = ['rec-gid'])
# MB_recording.set_index('gid', inplace = True)
MB_recording.head()



CPU times: user 6.86 s, sys: 2.32 s, total: 9.18 s
Wall time: 17.4 s


Unnamed: 0,gid
0,0f42ab32-22cd-4dcf-927b-a8d9a183d68b
1,4dce8f93-45ee-4573-8558-8cd321256233
2,48fabe3f-0fbd-4145-a917-83d164d6386f
3,b30b9943-9100-4d84-9ad2-69859ea88fbb
4,b55f1db3-c6d2-4645-b908-03e1017a99c2


In [6]:
%%time
# Loading Relevant SQL tables

MB_redirects = pd.read_sql('select r.gid, rgr.gid from recording r join recording_gid_redirect rgr on rgr.new_id = r.id', con = conn)
MB_redirects.columns = ['old-rec-gid', 'new-rec-gid']
MB_redirects.set_index('old-rec-gid', inplace=True)
MB_redirects.head()



CPU times: user 1.3 s, sys: 116 ms, total: 1.42 s
Wall time: 7.9 s


Unnamed: 0_level_0,new-rec-gid
old-rec-gid,Unnamed: 1_level_1
dfbefafa-a7dc-4024-8a28-be15537591e9,c4793e5b-5825-4221-893d-8b3776289127
69102b65-98cd-46aa-a757-b24890c0e030,74324461-d321-4197-82f8-fdb41471ea8b
c876dbfd-159a-4d83-bfbb-fd5438c9db65,d6f71677-ad41-46a1-87bc-5f00c702eb8a
2457840d-75d6-4b2b-87de-ea6a0d02e9cf,b1b7370c-f7d8-4614-b41e-89d011e0108c
49a097f4-3fa5-48b5-ac49-dc3902af0ffe,3532d38e-7853-430f-b9b5-8501378729ce


In [7]:
%%time
# Loading Relevant SQL tables

MB_track = pd.read_sql('SELECT gid FROM track', con = conn)
MB_track.columns = ['track-gid']
MB_track.set_index('track-gid', inplace=True)
MB_track.head()



CPU times: user 9.62 s, sys: 2.89 s, total: 12.5 s
Wall time: 25.1 s


9b02977e-a03b-4a6b-a9a9-06e722bdcd7a
43da7544-6283-3159-84f9-537fe823a1a7
0b6b6283-a5a8-4560-9fa8-f68a430d86ea
fa124f9a-d8ea-36a3-bed3-c817fdbe13e2
e56c6d3c-09cf-33a0-81c5-ceade77c35dc


In [22]:

%%time
# Loading Relevant SQL tables

MB_canonical = pd.read_sql('SELECT recording_mbid, canonical_recording_mbid FROM mapping.canonical_recording_redirect', con = conn)
MB_canonical.columns = ['recording-gid', 'canonical-recording-gid']
MB_canonical.set_index('recording-gid', inplace=True)
MB_canonical.head()



CPU times: user 2.38 s, sys: 708 ms, total: 3.09 s
Wall time: 6.26 s


Unnamed: 0_level_0,canonical-recording-gid
recording-gid,Unnamed: 1_level_1
6ac02452-ee12-4f86-b389-bd20ba2fefcf,3e8eebfd-7613-4b3d-acbe-41709be76618
b4c26989-1b9e-4d50-8cde-56d6472e4bc3,3e8eebfd-7613-4b3d-acbe-41709be76618
601e1cf3-ad6c-4e38-9128-ba4d0d4b010f,b1050d12-b8af-409c-9cff-22759d93e240
35c4d840-e51f-4c07-9418-af9335b29642,f4680747-bf28-417a-ab33-af00577d8ac2
9ba7a9b9-a21c-4b12-8771-4c108b08b3e2,13b3875a-c89a-4be5-a6e4-0ca9164bc41d


In [150]:
'''
# Problem Statement:
1. Take in a series of recording-MBIDs
2. Mass query the series items in MB_recording table
3. Get a boolean map for series_of_mbids to check if each MBID exists in the MB table.
'''
def check_recording_existance(series_of_mbids):
    """Return the gids from ``MB_recording`` that occur in ``series_of_mbids``.

    The result is a Series holding the matching ``MB_recording.gid`` values
    (with their ``MB_recording`` row labels), not a boolean mask over the
    input. Callers derive the mask themselves with ``.isin(result)``.
    """
    known_gids = MB_recording['gid']
    is_requested = known_gids.isin(series_of_mbids)
    return known_gids.loc[is_requested]


# Try the lookup on every recording MBID that was loaded: first collect the
# gids that exist in MB_recording, then flag each input MBID by membership.
test_chunk = df.loc[:, 'recording-MBID']
test_chunk.isin(check_recording_existance(test_chunk))


'''
1. Takes in all MBIDs
2. Checks if all MBIDs are in MB_recording
3. Checks if '''

64019