In [1]:
import psycopg2
import pandas as pd
import numpy as np

In [2]:
# Create connection to PostgreSQL.
# Equivalent CLI: psql -h localhost -p 25432 -U musicbrainz musicbrainz_db
# NOTE(review): the CLI example above uses port 25432 while the code below
# connects on 5432 -- confirm which one is correct for this environment.

conn = psycopg2.connect(
    host="localhost",
    database="musicbrainz_db",
    user="musicbrainz",
    port=5432)

cursor = conn.cursor()

# Smoke test: a failed connection or a missing `recording` table raises here.
# The fetched row is not needed, so it is not subscripted -- fetchone() returns
# None on an empty table and indexing it would raise a misleading TypeError.
cursor.execute("select * from recording limit 1;")
cursor.fetchone()
print("connection successful")

connection successful


In [3]:
def read_files(file_path_repo):
    """Read every MLHD file listed in ``file_path_repo`` into one DataFrame.

    Parameters
    ----------
    file_path_repo : str or path-like
        Text file containing one data-file path per line. Surrounding
        whitespace is stripped and blank lines are skipped.

    Returns
    -------
    pd.DataFrame
        Columns ``timestamp``, ``artist-MBID``, ``release-MBID`` and
        ``recording-MBID``. Rows without a ``recording-MBID`` are dropped.
        Each file keeps its own row numbers as index, so index values can
        repeat across files. An empty path list yields an empty frame with
        the same four columns.
    """
    columns = ['timestamp', 'artist-MBID', 'release-MBID', 'recording-MBID']

    # Read the list of data files to process, ignoring blank lines.
    with open(file_path_repo, 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]

    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all previous rows each time.
    frames = []
    for pth in file_paths:
        listens = pd.read_csv(pth, sep='\t', names=columns)
        frames.append(listens[listens['recording-MBID'].notna()])

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames)

In [5]:
%%time
# Compile every file listed in random_file_paths.txt into one DataFrame,
# then report its (rows, columns) shape and preview the first rows.
df = read_files('random_file_paths.txt')
print(df.shape)
df.head()

(1985787, 4)
CPU times: user 3.78 s, sys: 371 ms, total: 4.15 s
Wall time: 4.15 s


Unnamed: 0,timestamp,artist-MBID,release-MBID,recording-MBID
1,1348268938,275d1fca-22e8-46b9-85e6-c3523098a599,,b2fa9a1a-1d71-4256-83b1-8705be6387ce
4,1348268136,,,c857137c-b09b-4ed5-8006-961624e27854
6,1348247430,09887aa7-226e-4ecc-9a0c-02d2ae5777e1,4df061b7-db29-4be1-ae26-bdb8c3866925,7460e38b-f28e-4919-a17b-a4f4c19caea2
7,1348247205,0a832a68-88ec-455f-8b53-d9e14838202f,5157b818-6106-3d12-bc9a-4c2b15e80764,c9f20069-38c3-4988-9ee1-213c0c67b1d4
10,1348246437,8559df00-6489-4e23-83dd-d43e3ec4745c,20480e5c-369a-442f-b39f-9c14a83a4cdf,150cdcfd-c63f-4aa4-995a-2b3687abf4f8


In [6]:
%%time
# Loading recording table SQL tables

MB_recording = pd.read_sql('SELECT gid FROM recording', con = conn, columns = ['rec-gid'])
# MB_recording.set_index('gid', inplace = True)
MB_recording.head()



CPU times: user 6.91 s, sys: 2.54 s, total: 9.46 s
Wall time: 18.5 s


Unnamed: 0,gid
0,0f42ab32-22cd-4dcf-927b-a8d9a183d68b
1,4dce8f93-45ee-4573-8558-8cd321256233
2,48fabe3f-0fbd-4145-a917-83d164d6386f
3,b30b9943-9100-4d84-9ad2-69859ea88fbb
4,b55f1db3-c6d2-4645-b908-03e1017a99c2


In [7]:
%%time
# Loading redirect table SQL table

MB_redirects = pd.read_sql('select r.gid, rgr.gid from recording r join recording_gid_redirect rgr on rgr.new_id = r.id', con = conn)
MB_redirects.columns = ['old-rec-gid', 'new-rec-gid']
MB_redirects.set_index('old-rec-gid', inplace=True)
MB_redirects.head()



CPU times: user 1.26 s, sys: 149 ms, total: 1.41 s
Wall time: 9.81 s


Unnamed: 0_level_0,new-rec-gid
old-rec-gid,Unnamed: 1_level_1
dfbefafa-a7dc-4024-8a28-be15537591e9,c4793e5b-5825-4221-893d-8b3776289127
6fdc7ef3-19ea-421d-9246-243a3c9b193a,041199ca-09bd-4dfc-90a5-ec12fee79cba
aceb2a1c-c696-4948-8f98-e1690212f2c3,3455438b-8105-4cff-a08d-a13da4df20f1
2251ff3f-05ac-4dff-b90f-fefb309026d5,3c79aeca-5747-4c62-b108-059369e0078c
ea2ac89a-d33a-41a6-982b-9da7eff77147,673a0c86-5af1-4311-b6d6-42d37afd95e5


In [8]:
%%time
# Loading track table

MB_track = pd.read_sql('SELECT gid FROM track', con = conn)
MB_track.columns = ['track-gid']
MB_track.set_index('track-gid', inplace=True)
MB_track.head()



CPU times: user 9.52 s, sys: 2.84 s, total: 12.4 s
Wall time: 25.2 s


9b02977e-a03b-4a6b-a9a9-06e722bdcd7a
43da7544-6283-3159-84f9-537fe823a1a7
0b6b6283-a5a8-4560-9fa8-f68a430d86ea
fa124f9a-d8ea-36a3-bed3-c817fdbe13e2
e56c6d3c-09cf-33a0-81c5-ceade77c35dc


In [9]:

%%time
# Loading canonical recording mbid table

MB_canonical = pd.read_sql('SELECT recording_mbid, canonical_recording_mbid FROM mapping.canonical_recording_redirect', con = conn)
MB_canonical.columns = ['recording-gid', 'canonical-recording-gid']
MB_canonical.set_index('recording-gid', inplace=True)
MB_canonical.head()



CPU times: user 2.22 s, sys: 329 ms, total: 2.55 s
Wall time: 5.82 s


Unnamed: 0_level_0,canonical-recording-gid
recording-gid,Unnamed: 1_level_1
6ac02452-ee12-4f86-b389-bd20ba2fefcf,3e8eebfd-7613-4b3d-acbe-41709be76618
b4c26989-1b9e-4d50-8cde-56d6472e4bc3,3e8eebfd-7613-4b3d-acbe-41709be76618
601e1cf3-ad6c-4e38-9128-ba4d0d4b010f,b1050d12-b8af-409c-9cff-22759d93e240
35c4d840-e51f-4c07-9418-af9335b29642,f4680747-bf28-417a-ab33-af00577d8ac2
9ba7a9b9-a21c-4b12-8771-4c108b08b3e2,13b3875a-c89a-4be5-a6e4-0ca9164bc41d


## Architecture:

1. Take a chunk of MBIDs (Test optimal chunk sizes too. Current optimal > 300k rows)
2. Only get unique values from this series
3. Pass it through the following:
    - check_in_recording()
    - Get mbids that don't exist in recording, and pass it through check_in_redirect()
    - Get mbids that don't exist in redirect, and pass it through check_in_track()
    - get mbids that don't exist in track, and add them to a series of unknown mbids.

In [10]:
'''
# Problem Statement:
1. Take in a series of recording-MBIDs
2. Mass query the series items in MB_recording table
3. Get a boolean map for series_of_mbids to check if each MBID exists in the MB table.
'''

def check_in_recording(series_of_mbids):
    """Flag which MBIDs of ``series_of_mbids`` occur in ``MB_recording.gid``.

    Parameters
    ----------
    series_of_mbids : pd.Series
        Recording MBIDs to look up.

    Returns
    -------
    pd.Series of bool
        Same index as ``series_of_mbids``; True where the MBID is present in
        the recording table.
    """
    recording_gids = MB_recording.gid

    # First narrow the recording table down to the gids that were asked for,
    # so the final membership test runs against that subset only.
    present_gids = recording_gids[recording_gids.isin(series_of_mbids)]

    return series_of_mbids.isin(present_gids)

In [11]:
%%time
# Testing check_in_recording()

input = pd.Series(df['recording-MBID'].unique())
output = check_in_recording(input)

output.head()

CPU times: user 2.87 s, sys: 0 ns, total: 2.87 s
Wall time: 2.86 s


0     True
1    False
2     True
3     True
4     True
dtype: bool

In [10]:
def query_in(series_to_query, series_to_query_in):
    """Generic membership lookup of one collection of values in another.

    Parameters
    ----------
    series_to_query : pd.Series
        Values to look up.
    series_to_query_in : pd.Series or pd.Index
        Values to search in; it must support ``.isin`` and boolean-mask
        indexing.

    Returns
    -------
    pd.Series of bool
        Same index as ``series_to_query``; True where the value occurs in
        ``series_to_query_in``.
    """
    # Narrow the searched collection to the requested values first, so the
    # final membership test runs against that subset only.
    hit_mask = series_to_query_in.isin(series_to_query)
    matched_values = series_to_query_in[hit_mask]

    return series_to_query.isin(matched_values)

In [12]:
%%time
# Testing query_in() with recording MBID

in_for_rec = pd.Series(df['recording-MBID'].unique())
out_for_rec = query_in(in_for_rec, MB_recording.gid)

out_for_rec.head()

CPU times: user 3.69 s, sys: 0 ns, total: 3.69 s
Wall time: 3.69 s


0     True
1     True
2    False
3    False
4     True
dtype: bool

In [13]:
%%time

in_for_tack = in_for_rec[-out_for_rec]
out_for_track = query_in(in_for_rec, MB_track)

out_for_track.value_counts()

CPU times: user 7.49 s, sys: 0 ns, total: 7.49 s
Wall time: 7.5 s


False    253218
dtype: int64

In [14]:
%%time

in_for_redir = in_for_rec[out_for_rec]
# in_for_redir = in_for_rec

out_for_redir = query_in(in_for_redir, MB_redirects.index)
out_for_redir.value_counts()

CPU times: user 799 ms, sys: 0 ns, total: 799 ms
Wall time: 794 ms


False    157985
True      39496
dtype: int64