# rec_track_checker
- **Objectives**:
    - Loop through ALL MLHD files.
    - Check recording_mbid column for presence of any track_gid or track_gid_redirect
    - Re-write recently read chunk in CSV+ZSTD with compression lvl 10

In [44]:
import os
import time
import pandas as pd
import lib.mb as mb
import lib.load as load
import json

### Loading Files

In [2]:
# Fetching file locations
MLHD_ROOT = "/data/mlhd"
mlhd_files = []

for root, dirs, files in os.walk(MLHD_ROOT):
    for file in files:
        if file.endswith(".gz"):
            mlhd_files.append(os.path.join(root, file))
    
total_files = len(mlhd_files)

In [3]:
%%time
# Fetching relevant tables

MB_track = mb.get_tracks()
print("MB_track loaded")
MB_track.head()

MB_track loaded
CPU times: user 2min 19s, sys: 5.69 s, total: 2min 25s
Wall time: 2min 38s


Unnamed: 0,gid
0,9b02977e-a03b-4a6b-a9a9-06e722bdcd7a
1,43da7544-6283-3159-84f9-537fe823a1a7
2,0b6b6283-a5a8-4560-9fa8-f68a430d86ea
3,fa124f9a-d8ea-36a3-bed3-c817fdbe13e2
4,e56c6d3c-09cf-33a0-81c5-ceade77c35dc


In [4]:
%%time
# Fetching relevant tables

MB_track_redir = mb.get_track_redirects_old()
print("MB_track_redirects_old loaded")
MB_track_redir.head()

MB_track_redirects_old loaded
CPU times: user 1.12 s, sys: 48.1 ms, total: 1.16 s
Wall time: 1.34 s


Unnamed: 0,gid
0,d8abbf14-5945-3639-a0c9-3e9c70b1c0a4
1,446b27ef-92fd-3ea1-ad0a-fe1055196406
2,403b9e19-a135-3acc-ae69-58fb0d735036
3,b61dde50-de25-3802-b706-dd0d490879ae
4,13bee970-0129-320d-94c8-dfbc7748c692


In [5]:
def load_path(file_path):
    """Function to load a file and return a dataframe"""

    
    df = pd.read_csv(
        file_path, sep='\t',
        header=None,
        names = ['timestamp', 'artist_MBID', 'release_MBID', 'recording_MBID'],
        dtype={'artist_MBID': str, 'release_MBID': str, 'recording_MBID': str}
        )
    df.drop(df[df['recording_MBID'].isna()].index, inplace=True)

    return df

In [6]:
%%time
df_test = load_path(mlhd_files[50])
df_test

CPU times: user 88.7 ms, sys: 4.21 ms, total: 92.9 ms
Wall time: 122 ms


Unnamed: 0,timestamp,artist_MBID,release_MBID,recording_MBID
0,1169940836,18870405-17e8-42a7-8a4c-7c79b432019c,df0f8865-3770-4b65-b3f8-007c4d624e8c,c010d736-58c4-4860-be74-b2a231ce4830
1,1169941232,3132dae9-9eff-4805-ae2e-fada1af87b76,755f05f7-b994-4d15-8aa7-46c2461fcae8,250762b8-9037-41e4-8371-15002da794f1
2,1169941456,0a3dda11-e89e-42f1-8356-b7a6e5f68424,b0dd105a-e38f-366a-91e2-9b9b776d8c13,772e9d19-dd5c-4594-9c65-5588fe40223a
3,1169941664,20883363-1ea4-4d72-ad72-c0e767038f3e,a6c76014-bf81-3b15-adf5-c4175ea42c0b,2611cee8-7ebb-4bb4-8771-5172e88fa87e
4,1169942224,20883363-1ea4-4d72-ad72-c0e767038f3e,a6c76014-bf81-3b15-adf5-c4175ea42c0b,0af519f0-8326-490c-a57d-96114ed7a0fc
...,...,...,...,...
66115,1367236434,850a90fe-ea6a-4527-be57-5f440c88c1ef,c040773a-9768-4328-8095-e5c08bec91d2,82c98140-7d13-4787-b90b-71d08aac7792
66116,1367236527,850a90fe-ea6a-4527-be57-5f440c88c1ef,c040773a-9768-4328-8095-e5c08bec91d2,c3777740-483f-4e55-9562-9a191d55f30a
66119,1367242923,c1e98e4a-4628-4c89-a7a6-0e0171600b05,f0056c90-c5d7-488d-8377-dfb00a30591f,afbfad99-dd2e-40e2-84f6-205db42b8fae
66122,1367622661,429f9fbf-0a0b-4a9e-a88c-94d48aa466ed,c8207f15-25b4-4b6e-b4b6-2912725a1fb1,2c33a77a-86df-4f24-b9de-a4b127edd375


In [7]:
# df_test_positive = pd.DataFrame({
#     'timestamp': [None, None, None, None, None],
#     'artist_MBID': [None, None, None, None, None],
#     'release_MBID': [None, None, None, None, None], 
#     'recording_MBID': [None, None, None, None, None]})

# Taking a slice from original dataframe
df_test_positive = df_test.dropna().iloc[:6, :]
print("Before:\n", df_test_positive.recording_MBID.values)

df_test_positive.iloc[:4, 3] = pd.concat([MB_track.gid.iloc[:2 ], MB_track_redir.gid.iloc[:2]])

# In this data:
# First 2 rows are from MB_track
# Next 2 rows are from MB_track_redirects_old
# Last 2 rows are from MB_track

df_test_positive

Before:
 ['c010d736-58c4-4860-be74-b2a231ce4830'
 '250762b8-9037-41e4-8371-15002da794f1'
 '772e9d19-dd5c-4594-9c65-5588fe40223a'
 '2611cee8-7ebb-4bb4-8771-5172e88fa87e'
 '0af519f0-8326-490c-a57d-96114ed7a0fc'
 '42b91da5-1508-4d64-a40a-83597177c7e2']


Unnamed: 0,timestamp,artist_MBID,release_MBID,recording_MBID
0,1169940836,18870405-17e8-42a7-8a4c-7c79b432019c,df0f8865-3770-4b65-b3f8-007c4d624e8c,9b02977e-a03b-4a6b-a9a9-06e722bdcd7a
1,1169941232,3132dae9-9eff-4805-ae2e-fada1af87b76,755f05f7-b994-4d15-8aa7-46c2461fcae8,43da7544-6283-3159-84f9-537fe823a1a7
2,1169941456,0a3dda11-e89e-42f1-8356-b7a6e5f68424,b0dd105a-e38f-366a-91e2-9b9b776d8c13,d8abbf14-5945-3639-a0c9-3e9c70b1c0a4
3,1169941664,20883363-1ea4-4d72-ad72-c0e767038f3e,a6c76014-bf81-3b15-adf5-c4175ea42c0b,446b27ef-92fd-3ea1-ad0a-fe1055196406
4,1169942224,20883363-1ea4-4d72-ad72-c0e767038f3e,a6c76014-bf81-3b15-adf5-c4175ea42c0b,0af519f0-8326-490c-a57d-96114ed7a0fc
5,1169942482,20883363-1ea4-4d72-ad72-c0e767038f3e,a6c76014-bf81-3b15-adf5-c4175ea42c0b,42b91da5-1508-4d64-a40a-83597177c7e2


## Testing Files

In [43]:
%%time
def check_rec(df):
    """
    Function to check if a track is in the recording table
    Returns a dataframe with tracks that are in recording table.
    """    
    
    in_track = df.recording_MBID.isin(MB_track.gid)
    in_redir = df.recording_MBID.isin(MB_track_redir.gid)
    
    if True in in_track.values or True in in_redir.values:
        return df[in_track | in_redir]
    else:
        return None

    # return (in_track | in_redir)

test_output = check_rec(df_test_positive)
test_output

CPU times: user 18.2 s, sys: 176 ms, total: 18.3 s
Wall time: 18.3 s


Unnamed: 0,timestamp,artist_MBID,release_MBID,recording_MBID
0,1169940836,18870405-17e8-42a7-8a4c-7c79b432019c,df0f8865-3770-4b65-b3f8-007c4d624e8c,9b02977e-a03b-4a6b-a9a9-06e722bdcd7a
1,1169941232,3132dae9-9eff-4805-ae2e-fada1af87b76,755f05f7-b994-4d15-8aa7-46c2461fcae8,43da7544-6283-3159-84f9-537fe823a1a7
2,1169941456,0a3dda11-e89e-42f1-8356-b7a6e5f68424,b0dd105a-e38f-366a-91e2-9b9b776d8c13,d8abbf14-5945-3639-a0c9-3e9c70b1c0a4
3,1169941664,20883363-1ea4-4d72-ad72-c0e767038f3e,a6c76014-bf81-3b15-adf5-c4175ea42c0b,446b27ef-92fd-3ea1-ad0a-fe1055196406


In [41]:
# output_log = {
#     'path': [], 
#     'logs': []
#     }

def log_output(output_df, path, master_log_dict):
    """
    Function to log output dataframe and path
    """
    
    # Converts df to dictionary for ease of access.
    output_df = output_df.to_dict()
    try:
        master_log_dict['path'].append(path)
        master_log_dict['logs'].append(output_df)
    except:
        raise Exception("Provide a valid dictionary for logging")
    
    return master_log_dict

In [40]:
# Test for log_output
output_log = {
    'path': [], 
    'logs': []
    }

log_output(test_output, mlhd_files[50], output_log)

{'path': ['/data/mlhd/8e/8e8c1165-2767-4b4e-864b-2410ea136897.txt.gz'],
 'logs': [{'timestamp': {0: 1169940836,
    1: 1169941232,
    2: 1169941456,
    3: 1169941664},
   'artist_MBID': {0: '18870405-17e8-42a7-8a4c-7c79b432019c',
    1: '3132dae9-9eff-4805-ae2e-fada1af87b76',
    2: '0a3dda11-e89e-42f1-8356-b7a6e5f68424',
    3: '20883363-1ea4-4d72-ad72-c0e767038f3e'},
   'release_MBID': {0: 'df0f8865-3770-4b65-b3f8-007c4d624e8c',
    1: '755f05f7-b994-4d15-8aa7-46c2461fcae8',
    2: 'b0dd105a-e38f-366a-91e2-9b9b776d8c13',
    3: 'a6c76014-bf81-3b15-adf5-c4175ea42c0b'},
   'recording_MBID': {0: UUID('9b02977e-a03b-4a6b-a9a9-06e722bdcd7a'),
    1: UUID('43da7544-6283-3159-84f9-537fe823a1a7'),
    2: UUID('d8abbf14-5945-3639-a0c9-3e9c70b1c0a4'),
    3: UUID('446b27ef-92fd-3ea1-ad0a-fe1055196406')}}]}

## Re-Writing

In [13]:
%%time
df_test.to_csv('unk_ids/test.csv.zstd', index=False, compression={'method': 'zstd', 'level': 10})

CPU times: user 313 ms, sys: 23.9 ms, total: 337 ms
Wall time: 335 ms


# Trying in loop

### Algorithm

define output_log_dict

for path in mlhd_files:
    1) Start time log
    1) Read into dataframe df
    2) output = check_rec(df)
    3) log_output(output, path, output_log_dict)
    4) Write original df to txt+zstd
    5) Stop time log
    6) 


### To-Do
Define Global variables for storage paths, etc.

In [14]:
check_rec

In [15]:
mlhd_files[0]
# .split('/')[-1]

'/data/mlhd/8e/8eddc7ca-2be9-4096-9efd-153497990e21.txt.gz'