# MusicBrainz 2nd Mismatches

In [1]:
# All imports needed by this notebook
import ast
import json
import sys
import urllib
import requests
import csv
import re
import time

import numpy as np
import pandas as pd

PATH_TO_UTILS = "."
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [1]:
def extract_data(file_path):
    """Read the first three columns of a CSV file into three parallel lists.

    The first line of the file is treated as a header and discarded.
    Completely blank lines are skipped: ``csv.reader`` yields an empty list
    for them, which would otherwise fail on ``row[0]``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    tuple of (list, list, list)
        Values of column 0, column 1 and column 2, in file order. All three
        lists are empty when the file is empty or only contains a header.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer than three columns.
    """
    column1 = []
    column2 = []
    column3 = []
    # Explicit encoding so the file is read the same way on every platform.
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)
        # Default of None keeps an empty file from raising StopIteration.
        next(csvreader, None)
        for row in csvreader:
            if not row:  # blank line in the file
                continue
            column1.append(row[0])
            column2.append(row[1])
            column3.append(row[2])
    return column1, column2, column3
    
def fetch_entity_data(entity_id, timeout=30):
    """Download the JSON document of a Wikidata entity via Special:EntityData.

    Parameters
    ----------
    entity_id : str
        Wikidata identifier that is inserted into the URL, e.g. ``"Q42"``.
    timeout : float, optional
        Seconds to wait for the server before giving up. ``requests`` waits
        indefinitely when no timeout is given, which would stall a long
        batch run on a single dead connection.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200. ``None`` (after printing a
        message) on any other status code, on a transport error, or when the
        body is not valid JSON.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as error:
        # ValueError covers JSON decoding failures in older requests versions.
        print("Error fetching data:", error)
        return None
    print("Error fetching data:", response.status_code)
    return None

def fetch_artist_data(artist_id, timeout=30):
    """Download the JSON record of an artist from the MusicBrainz web service.

    Parameters
    ----------
    artist_id : str
        MusicBrainz artist identifier that is inserted into the URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. ``requests`` waits
        indefinitely when no timeout is given, which would stall a long
        batch run on a single dead connection.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200. ``None`` (after printing a
        message) on any other status code, on a transport error, or when the
        body is not valid JSON.
    """
    url = f"https://musicbrainz.org/ws/2/artist/{artist_id}?fmt=json"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as error:
        # ValueError covers JSON decoding failures in older requests versions.
        print("Error fetching data:", error)
        return None
    print("Error fetching data:", response.status_code)
    return None

def extract_date_from_wikidata(entity_id):
    """Collect every P569 value of a Wikidata entity as ``YYYY-MM-DD`` text.

    Parameters
    ----------
    entity_id : str
        Wikidata identifier passed on to ``fetch_entity_data``.

    Returns
    -------
    list of str or None
        One formatted string per P569 claim that carries a ``datavalue``
        (claims without one are ignored, so the list may be empty).
        ``None`` when the entity could not be fetched, is not present under
        ``entity_id`` in the response, or has no P569 claims.
    """
    payload = fetch_entity_data(entity_id)
    if not payload or "entities" not in payload or entity_id not in payload["entities"]:
        return None

    record = payload["entities"][entity_id]
    if "claims" not in record or "P569" not in record["claims"]:
        return None

    raw_times = [
        statement["mainsnak"]["datavalue"]["value"]["time"]
        for statement in record["claims"]["P569"]
        if "mainsnak" in statement and "datavalue" in statement["mainsnak"]
    ]
    # Fixed-position slices: characters 1-4 are used as the year, 6-7 as the
    # month and 9-10 as the day (assumes a leading sign character followed by
    # a four-digit year -- TODO confirm for all inputs).
    return [f"{stamp[1:5]}-{stamp[6:8]}-{stamp[9:11]}" for stamp in raw_times]

def get_guid_for_property(q_value, property_id, timeout=30):
    """Return the statement id of an entity's first statement for a property.

    Parameters
    ----------
    q_value : str
        Wikidata identifier of the item, inserted into the request URL.
    property_id : str
        Property whose first statement is looked up, e.g. ``"P569"``.
    timeout : float, optional
        Seconds to wait for the server before giving up. ``requests`` waits
        indefinitely when no timeout is given.

    Returns
    -------
    str or None
        The ``"id"`` field of the first statement listed for ``property_id``.
        ``None`` when the request fails, the status is not 200, the body is
        not valid JSON, the entity is not present under ``q_value``, or the
        property has no statements.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{q_value}.json"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding failures in older requests versions.
        return None

    if "entities" not in data or q_value not in data["entities"]:
        return None
    entity_data = data["entities"][q_value]
    if "claims" not in entity_data or property_id not in entity_data["claims"]:
        return None

    statements = entity_data["claims"][property_id]
    if not statements:  # an empty list would make [0] raise IndexError
        return None
    return statements[0]["id"]

In [3]:
# Compare the birth date (P569) stored in Wikidata with the "life-span begin"
# date reported by MusicBrainz for every row of the input CSV, and collect one
# Mismatch Finder entry per item whose dates disagree.
mismatches = []  # Used to store all mismatches found
q_values, artist_ids, wikidata_times = extract_data("mb_italy.csv")

for q_value, artist_id, wikidata_time in tqdm(
    zip(q_values, artist_ids, wikidata_times),
    desc="Processing",
    total=len(q_values),  # derived from the data instead of a hard-coded count
):
    musicbrainz_artist = fetch_artist_data(artist_id)
    time.sleep(0.25)  # Needed to meet the MB API limits
    if musicbrainz_artist is None:  # Skips over non-existent MB artists
        continue

    # "life-span" or its "begin" field may be missing or null; treat all of
    # these (and an empty string) as "MusicBrainz has no birth date".
    life_span = musicbrainz_artist.get("life-span") or {}
    begin = life_span.get("begin")
    if not begin:
        continue

    # Full dates are compared on year-month-day (10 characters); anything
    # shorter is compared on the year only (4 characters).
    precision = 10 if len(begin) >= 10 else 4
    musicbrainz_date = begin[:precision]

    wikidata_dates = extract_date_from_wikidata(q_value)
    # None: entity or P569 unavailable. Empty list: no P569 claim carries a
    # concrete value, so there is nothing to compare against.
    if not wikidata_dates:
        continue

    # Not a mismatch if any one of the Wikidata dates matches MusicBrainz.
    if any(wikidata_date[:precision] == musicbrainz_date for wikidata_date in wikidata_dates):
        continue

    mismatches.append({
        "item_id": q_value,
        "statement_guid": get_guid_for_property(q_value, "P569"),
        "property_id": "P569",
        "wikidata_value": wikidata_time[:precision],
        "meta_wikidata_value": "Q1985727",
        "external_value": musicbrainz_date,
        "external_url": "https://musicbrainz.org/artist/" + artist_id,
        "type": "statement"
    })

# Persist every collected mismatch as one CSV row, preceded by a header line
file_name = 'mb_italy_mm.csv'
header = [
    "item_id",
    "statement_guid",
    "property_id",
    "wikidata_value",
    "meta_wikidata_value",
    "external_value",
    "external_url",
    "type",
]
with open(file_name, 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=header)
    writer.writeheader()
    writer.writerows(mismatches)

Processing:   2%|▏         | 106/5409 [01:31<1:12:34,  1.22it/s]

Error fetching data: 404


Processing:   9%|▉         | 494/5409 [07:01<1:14:28,  1.10it/s]

Error fetching data: 404


Processing:  24%|██▍       | 1314/5409 [19:29<56:09,  1.22it/s]  

Error fetching data: 404


Processing:  25%|██▌       | 1359/5409 [20:07<56:08,  1.20it/s]  

Error fetching data: 404


Processing:  58%|█████▊    | 3142/5409 [46:15<35:44,  1.06it/s]  

Error fetching data: 404


Processing:  60%|██████    | 3247/5409 [47:48<33:30,  1.08it/s]

Error fetching data: 404


Processing:  64%|██████▍   | 3460/5409 [50:52<27:07,  1.20it/s]

Error fetching data: 404


Processing:  72%|███████▏  | 3902/5409 [57:12<21:04,  1.19it/s]

Error fetching data: 404


Processing:  80%|████████  | 4332/5409 [1:03:23<16:53,  1.06it/s]

Error fetching data: 404


Processing:  80%|████████  | 4348/5409 [1:03:37<15:30,  1.14it/s]

Error fetching data: 404


Processing:  94%|█████████▍| 5109/5409 [1:14:37<05:10,  1.04s/it]

Error fetching data: 404


Processing: 100%|██████████| 5409/5409 [1:18:52<00:00,  1.14it/s]


In [4]:
# Reload the mismatch CSV from disk and pass it to check_mf_formatting
# (imported from utils) to check that the file is formatted correctly.
# NOTE(review): the variable says "first" although the notebook title says
# "2nd Mismatches" -- presumably a leftover name; confirm before renaming.
musicbrainz_first_mismatches = pd.read_csv("mb_italy_mm.csv")
check_mf_formatting(musicbrainz_first_mismatches)

All checks have passed! The data is ready to be uploaded to Mismatch Finder.
