# MusicBrainz 2nd Mismatches

In [1]:
import ast
import json
import sys
import urllib
import requests
import csv
import re

import numpy as np
import pandas as pd

PATH_TO_UTILS = "."
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm
from time import sleep

In [2]:
def extract_q_values_from_csv(file_path):
    column1 = []
    with open(file_path, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)
        for row in csvreader:
            column1.append(row[0])
    column1 = column1[1:]
    return column1

def get_guid_for_property(q_value, property_id):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{q_value}.json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if "entities" in data and q_value in data["entities"]:
            entity_data = data["entities"][q_value]
            if "claims" in entity_data and property_id in entity_data["claims"]:
                statement = entity_data["claims"][property_id][0]
                return statement["id"]
    return None

def get_artist_id(wikidata_id):
    wikidata_url = "https://www.wikidata.org/w/api.php"
    params = {"action": "wbgetentities", "ids": wikidata_id, "format": "json"}
    response = requests.get(wikidata_url, params=params)
    wikidata_data = response.json()
    try:
        artist_id = wikidata_data["entities"][wikidata_id]["claims"]["P434"][0]["mainsnak"]["datavalue"]["value"]
        return artist_id
    except KeyError:
        return None
    
def fetch_entity_data(entity_id):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print("Error fetching data:", response.status_code)
        return None

def fetch_artist_data(artist_id):
    url = f"https://musicbrainz.org/ws/2/artist/{artist_id}?fmt=json"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print("Error fetching data:", response.status_code)
        return None

def extract_date_from_wikidata(entity_id):
    wikidata_entity = fetch_entity_data(entity_id)
    if wikidata_entity and "entities" in wikidata_entity and entity_id in wikidata_entity["entities"]:
        entity_info = wikidata_entity["entities"][entity_id]
        if "claims" in entity_info and "P569" in entity_info["claims"]:
            inception_date_claim = entity_info["claims"]["P569"][0]
            if "mainsnak" in inception_date_claim and "datavalue" in inception_date_claim["mainsnak"]:
                inception_date = inception_date_claim["mainsnak"]["datavalue"]["value"]["time"]
                year = inception_date[1:5]
                month = inception_date[6:8]
                day = inception_date[9:11]
                return f"{year}-{month}-{day}"
    return None

In [4]:
mismatches = []
q_values = extract_q_values_from_csv("mb_italy.csv")
for q_value in tqdm(q_values, desc="Processing", total=5409):
    artist_id = get_artist_id(q_value)
    musicbrainz_artist = fetch_artist_data(artist_id)
    wikidata_date = extract_date_from_wikidata(q_value)
    if wikidata_date is None or musicbrainz_artist is None:
        continue
    if "life-span" in musicbrainz_artist and musicbrainz_artist["life-span"]["begin"] is not None and len(musicbrainz_artist["life-span"]["begin"]) >= 10:
        musicbrainz_date = musicbrainz_artist["life-span"]["begin"][:10]
        if wikidata_date != musicbrainz_date:
            link = "https://musicbrainz.org/artist/" + artist_id
            mismatch_finder_entry = {
                "item_id": q_value,
                "statement_guid": get_guid_for_property(q_value, "P569"),
                "property_id": "P569",
                "wikidata_value": wikidata_date,
                "meta_wikidata_value": "Q1985727",
                "external_value": musicbrainz_date,
                "external_url": link,
                "type": "statement"
            }
            mismatches.append(mismatch_finder_entry)
    elif "life-span" in musicbrainz_artist and musicbrainz_artist["life-span"]["begin"] is not None:
        musicbrainz_date = musicbrainz_artist["life-span"]["begin"][:4]
        wikidata_date = wikidata_date[:4]
        if wikidata_date != musicbrainz_date:
            link = "https://musicbrainz.org/artist/" + artist_id
            mismatch_finder_entry = {
                "item_id": q_value,
                "statement_guid": get_guid_for_property(q_value, "P569"),
                "property_id": "P569",
                "wikidata_value": wikidata_date,
                "meta_wikidata_value": "Q1985727",
                "external_value": musicbrainz_date,
                "external_url": link,
                "type": "statement"
            }
            mismatches.append(mismatch_finder_entry)
    else:
        continue

file_name = 'mb_italy_mm.csv'
header = ["item_id", "statement_guid", "property_id", "wikidata_value", "meta_wikidata_value", "external_value", "external_url", "type"]
with open(file_name, 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=header)
    writer.writeheader()
    for row in mismatches:
        writer.writerow(row)

Processing:   2%|▏         | 106/5409 [01:24<1:07:56,  1.30it/s]

Error fetching data: 404


Processing:  24%|██▍       | 1314/5409 [17:16<56:29,  1.21it/s]  

Error fetching data: 404


Processing:  25%|██▌       | 1359/5409 [17:51<54:41,  1.23it/s]

Error fetching data: 404


Processing:  49%|████▉     | 2668/5409 [34:44<33:34,  1.36it/s]  

Error fetching data: 503


Processing:  49%|████▉     | 2669/5409 [34:45<33:14,  1.37it/s]

Error fetching data: 503


Processing:  58%|█████▊    | 3142/5409 [40:43<28:21,  1.33it/s]

Error fetching data: 404


Processing:  58%|█████▊    | 3152/5409 [40:51<28:34,  1.32it/s]

Error fetching data: 503


Processing:  60%|██████    | 3247/5409 [42:04<27:15,  1.32it/s]

Error fetching data: 404


Processing:  64%|██████▍   | 3460/5409 [44:43<23:53,  1.36it/s]

Error fetching data: 404


Processing:  72%|███████▏  | 3902/5409 [50:12<19:15,  1.30it/s]

Error fetching data: 404


Processing:  73%|███████▎  | 3940/5409 [50:40<17:57,  1.36it/s]

Error fetching data: 503


Processing:  80%|████████  | 4332/5409 [55:43<14:02,  1.28it/s]

Error fetching data: 404


Processing:  80%|████████  | 4348/5409 [55:56<13:30,  1.31it/s]

Error fetching data: 404


Processing:  96%|█████████▋| 5217/5409 [1:07:09<02:17,  1.39it/s]

Error fetching data: 503


Processing: 100%|█████████▉| 5408/5409 [1:09:35<00:00,  1.30it/s]


In [5]:
musicbrainz_second_mismatches = pd.read_csv("mb_italy_mm.csv")
check_mf_formatting(musicbrainz_second_mismatches)

All checks have passed! The data is ready to be uploaded to Mismatch Finder.
