# MusicBrainz 1st Mismatches

In [1]:
import ast
import json
import sys
import urllib
import requests
import csv
import re

import numpy as np
import pandas as pd

PATH_TO_UTILS = "."
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [2]:
def extract_q_values_from_csv(file_path):
    column1 = []
    with open(file_path, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)
        for row in csvreader:
            column1.append(row[0])
    column1 = column1[1:]
    return column1

def get_guid_for_property(q_value, property_id):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{q_value}.json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if "entities" in data and q_value in data["entities"]:
            entity_data = data["entities"][q_value]
            if "claims" in entity_data and property_id in entity_data["claims"]:
                statement = entity_data["claims"][property_id][0]
                return statement["id"]
    return None

def get_artist_id(wikidata_id):
    wikidata_url = "https://www.wikidata.org/w/api.php"
    params = {"action": "wbgetentities", "ids": wikidata_id, "format": "json"}
    response = requests.get(wikidata_url, params=params)
    wikidata_data = response.json()
    try:
        artist_id = wikidata_data["entities"][wikidata_id]["claims"]["P434"][0]["mainsnak"]["datavalue"]["value"]
        return artist_id
    except KeyError:
        return None
    
def fetch_entity_data(entity_id):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print("Error fetching data:", response.status_code)
        return None

def fetch_artist_data(artist_id):
    url = f"https://musicbrainz.org/ws/2/artist/{artist_id}?fmt=json"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print("Error fetching data:", response.status_code)
        return None

def extract_date_from_wikidata(entity_id):
    wikidata_entity = fetch_entity_data(entity_id)
    if wikidata_entity and "entities" in wikidata_entity and entity_id in wikidata_entity["entities"]:
        entity_info = wikidata_entity["entities"][entity_id]
        if "claims" in entity_info and "P570" in entity_info["claims"]:
            inception_date_claim = entity_info["claims"]["P570"][0]
            if "mainsnak" in inception_date_claim and "datavalue" in inception_date_claim["mainsnak"]:
                inception_date = inception_date_claim["mainsnak"]["datavalue"]["value"]["time"]
                year = inception_date[1:5]
                month = inception_date[6:8]
                day = inception_date[9:11]
                return f"{year}-{month}-{day}"
    return None

In [3]:
mismatches = []
q_values = extract_q_values_from_csv("mb_germany.csv")
for q_value in tqdm(q_values, desc="Processing", total=5041):
    artist_id = get_artist_id(q_value)
    musicbrainz_artist = fetch_artist_data(artist_id)
    wikidata_date = extract_date_from_wikidata(q_value)
    if wikidata_date is None or musicbrainz_artist is None:
        continue
    if "life-span" in musicbrainz_artist and musicbrainz_artist["life-span"]["end"] is not None and len(musicbrainz_artist["life-span"]["end"]) >= 10:
        musicbrainz_date = musicbrainz_artist["life-span"]["end"][:10]
        if wikidata_date != musicbrainz_date:
            link = "https://musicbrainz.org/artist/" + artist_id
            mismatch_finder_entry = {
                "item_id": q_value,
                "statement_guid": get_guid_for_property(q_value, "P570"),
                "property_id": "P570",
                "wikidata_value": wikidata_date,
                "meta_wikidata_value": "Q1985727",
                "external_value": musicbrainz_date,
                "external_url": link,
                "type": "statement"
            }
            mismatches.append(mismatch_finder_entry)
    elif "life-span" in musicbrainz_artist and musicbrainz_artist["life-span"]["end"] is not None:
        musicbrainz_date = musicbrainz_artist["life-span"]["end"][:4]
        wikidata_date = wikidata_date[:4]
        if wikidata_date != musicbrainz_date:
            link = "https://musicbrainz.org/artist/" + artist_id
            mismatch_finder_entry = {
                "item_id": q_value,
                "statement_guid": get_guid_for_property(q_value, "P570"),
                "property_id": "P570",
                "wikidata_value": wikidata_date,
                "meta_wikidata_value": "Q1985727",
                "external_value": musicbrainz_date,
                "external_url": link,
                "type": "statement"
            }
            mismatches.append(mismatch_finder_entry)
    else:
        continue

file_name = 'mb_germany_mm.csv'
header = ["item_id", "statement_guid", "property_id", "wikidata_value", "meta_wikidata_value", "external_value", "external_url", "type"]
with open(file_name, 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=header)
    writer.writeheader()
    for row in mismatches:
        writer.writerow(row)

Processing:  22%|██▏       | 1112/5041 [14:14<1:01:10,  1.07it/s]

Error fetching data: 503


Processing: 100%|█████████▉| 5039/5041 [1:05:21<00:01,  1.37it/s]

Error fetching data: 503


Processing: 100%|█████████▉| 5040/5041 [1:05:22<00:00,  1.29it/s]


In [4]:
musicbrainz_first_mismatches = pd.read_csv("mb_germany_mm.csv")
check_mf_formatting(musicbrainz_first_mismatches)

All checks have passed! The data is ready to be uploaded to Mismatch Finder.
