In [1]:
import psycopg2
import pandas as pd
import numpy as np

In [2]:
# Create Connection to postgresql
# psql -h localhost -p 25432 -U musicbrainz musicbrainz_db
# NOTE(review): the psql example above uses port 25432 while the connect()
# call below uses port 5432 -- confirm which port is intended.

# `conn` is reused by the pd.read_sql cells further down the notebook.
conn = psycopg2.connect(
    host="localhost",
    database="musicbrainz_db",
    user="musicbrainz",
    port=5432)

cursor = conn.cursor()

# Smoke test: fetch a single row from the recording table. The indexing on the
# next line raises if no row comes back, so the print only runs on success.
cursor.execute("select * from recording limit 1;")
cursor.fetchone()[0]
print("connection successful")

connection successful


In [3]:
# Reads a list of file paths and reads + compiles data into a single pd.DataFrame
def read_files(file_path_repo):
    """Load every tab-separated file listed in ``file_path_repo`` into one DataFrame.

    Parameters
    ----------
    file_path_repo : str or path-like
        Text file holding one data-file path per line. Surrounding whitespace
        is stripped and blank lines are skipped.

    Returns
    -------
    pd.DataFrame
        Columns ``timestamp``, ``artist-MBID``, ``release-MBID`` and
        ``recording-MBID``. Rows with a missing ``recording-MBID`` are dropped.
        Each file keeps its own row labels, so the index is not unique across
        files. When no paths are listed, an empty frame with these columns is
        returned.
    """
    column_names = ['timestamp', 'artist-MBID', 'release-MBID', 'recording-MBID']

    # Open a file with MLHD file paths to process; blank lines would otherwise
    # become empty paths and make read_csv fail.
    with open(file_path_repo, 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]

    # Collect the per-file frames and concatenate once at the end: calling
    # pd.concat inside the loop re-copies all accumulated rows on every file.
    frames = []
    for pth in file_paths:
        temp = pd.read_csv(pth, sep='\t', names=column_names)
        frames.append(temp[temp['recording-MBID'].notna()])

    if not frames:
        return pd.DataFrame(columns=column_names)

    return pd.concat(frames)

In [4]:
%%time
# Compile every file listed in random_file_paths.txt into one DataFrame.
df = read_files('random_file_paths.txt')
# Preview the first rows of the compiled frame.
df.head()

CPU times: user 621 ms, sys: 104 ms, total: 725 ms
Wall time: 725 ms


Unnamed: 0,timestamp,artist-MBID,release-MBID,recording-MBID
0,1166994688,a49b5d41-7399-49ef-bcb8-a5779a30d2e9,af158d94-eb6f-4f1f-a8a2-f685b60b132e,317bdd0e-8956-4547-b8f0-fd233d10d378
1,1166995444,bbc5b66b-d037-4f26-aecf-0b129e7f876a,075cef57-9350-41a9-9a53-4391f013e164,279f9f44-abfc-406e-a686-92961577bee9
2,1167000369,a49b5d41-7399-49ef-bcb8-a5779a30d2e9,58c90879-d279-41c3-9ecc-b686ae75bc78,73eadf15-ebc0-40cd-a71c-30dce9d82d6b
3,1167000642,34e10b51-b5c6-4bc1-b70e-f05f141eda1e,2988a7b9-0bb1-4eaf-98ee-34eddf2e5740,08c272e6-02fe-4fb0-930c-2f342dd7d092
4,1167001189,b665b768-0d83-4363-950c-31ed39317c15,0d14ea49-7f2d-4c38-9986-cee6846a5bb5,254a6a58-d097-4307-ba1e-1fc15c8ac70e


In [5]:
%%time
# Loading recording table SQL tables

MB_recording = pd.read_sql('SELECT gid FROM recording', con = conn, columns = ['rec-gid'])
# MB_recording.set_index('gid', inplace = True)
MB_recording.head()



CPU times: user 7.13 s, sys: 2.34 s, total: 9.47 s
Wall time: 18.6 s


Unnamed: 0,gid
0,0f42ab32-22cd-4dcf-927b-a8d9a183d68b
1,4dce8f93-45ee-4573-8558-8cd321256233
2,48fabe3f-0fbd-4145-a917-83d164d6386f
3,b30b9943-9100-4d84-9ad2-69859ea88fbb
4,b55f1db3-c6d2-4645-b908-03e1017a99c2


In [6]:
%%time
# Loading redirect table SQL table

MB_redirects = pd.read_sql('select r.gid, rgr.gid from recording r join recording_gid_redirect rgr on rgr.new_id = r.id', con = conn)
MB_redirects.columns = ['old-rec-gid', 'new-rec-gid']
MB_redirects.set_index('old-rec-gid', inplace=True)
MB_redirects.head()



CPU times: user 1.3 s, sys: 161 ms, total: 1.46 s
Wall time: 7.01 s


Unnamed: 0_level_0,new-rec-gid
old-rec-gid,Unnamed: 1_level_1
4672ac68-27ac-4843-9c19-d283018a7c17,cd24bef4-d513-4e42-8cd4-0b8be1e654b7
6db9ab2f-1bb7-42bb-a1cf-62935e906c2a,f2b7197a-d85b-4d26-ae51-2f0826ac69c7
7068ff7e-5dc2-49a7-b918-45d35c24cec4,d404340f-87a2-4626-ac4a-2451bfa04387
14005a4e-54f3-4547-8f78-e264344efe6f,d07f465a-883a-4574-ac4c-fc51ba8679ab
a5c5ede0-b38e-468c-a2f9-874392f72080,272c5b16-c08c-4de8-863f-45e2784d77f1


In [7]:
%%time
# Loading track table

MB_track = pd.read_sql('SELECT gid FROM track', con = conn)
MB_track.columns = ['track-gid']
MB_track.set_index('track-gid', inplace=True)
MB_track.head()



CPU times: user 9.39 s, sys: 3.19 s, total: 12.6 s
Wall time: 24.9 s


9b02977e-a03b-4a6b-a9a9-06e722bdcd7a
43da7544-6283-3159-84f9-537fe823a1a7
0b6b6283-a5a8-4560-9fa8-f68a430d86ea
fa124f9a-d8ea-36a3-bed3-c817fdbe13e2
e56c6d3c-09cf-33a0-81c5-ceade77c35dc


In [8]:

%%time
# Loading canonical recording mbid table

MB_canonical = pd.read_sql('SELECT recording_mbid, canonical_recording_mbid FROM mapping.canonical_recording_redirect', con = conn)
MB_canonical.columns = ['recording-gid', 'canonical-recording-gid']
MB_canonical.set_index('recording-gid', inplace=True)
MB_canonical.head()



CPU times: user 2.28 s, sys: 337 ms, total: 2.62 s
Wall time: 5.99 s


Unnamed: 0_level_0,canonical-recording-gid
recording-gid,Unnamed: 1_level_1
6ac02452-ee12-4f86-b389-bd20ba2fefcf,3e8eebfd-7613-4b3d-acbe-41709be76618
b4c26989-1b9e-4d50-8cde-56d6472e4bc3,3e8eebfd-7613-4b3d-acbe-41709be76618
601e1cf3-ad6c-4e38-9128-ba4d0d4b010f,b1050d12-b8af-409c-9cff-22759d93e240
35c4d840-e51f-4c07-9418-af9335b29642,f4680747-bf28-417a-ab33-af00577d8ac2
9ba7a9b9-a21c-4b12-8771-4c108b08b3e2,13b3875a-c89a-4be5-a6e4-0ca9164bc41d


## Architecture:

1. Take a chunk of MBIDs (Test optimal chunk sizes too. Current optimal > 300k rows)
2. Only get unique values from this series
3. Pass it through the following:
    - check_in_recording()
    - Get mbids that don't exist in recording, and pass it through check_in_redirect()
    - Get mbids that don't exist in redirect, and pass it through check_in_track()

In [50]:
'''
# Problem Statement:
1. Take in a series of recording-MBIDs
2. Mass query the series items in MB_recording table
3. Get a boolean map for series_of_mbids to check if each MBID exists in the MB table.
'''

# To Do: Mod the function to only take in & return unique mbids.

def check_in_recording(series_of_mbids, recording_gids=None):
    """Flag which MBIDs in ``series_of_mbids`` are present among the recording gids.

    Parameters
    ----------
    series_of_mbids : pd.Series
        MBIDs to look up; duplicates are allowed.
    recording_gids : pd.Series, optional
        Known recording gids to check against. Defaults to the notebook-level
        ``MB_recording.gid`` column.

    Returns
    -------
    pd.Series
        Boolean Series with the same index and length as ``series_of_mbids``;
        True where the MBID occurs in ``recording_gids``.
    """
    if recording_gids is None:
        recording_gids = MB_recording.gid

    # Narrow the gid column down to the gids that were actually asked for,
    # using the de-duplicated query values.
    unique_mbids = series_of_mbids.unique()
    mbids_in_recording = recording_gids[recording_gids.isin(unique_mbids)]

    # Boolean map over the full-sized (non-unique) input series.
    return series_of_mbids.isin(mbids_in_recording)

In [54]:
%%time
# Testing check_recording_existance()

input = pd.Series(df['recording-MBID'].unique())
output = check_in_recording(input)

output.head()
print("Queried 382674 rows in ~2s")

Queried 382674 rows in ~2s
CPU times: user 2.13 s, sys: 911 µs, total: 2.13 s
Wall time: 2.13 s
