# Mix'n'match Mismatch Generation

This notebook is used to genrate mismatches for [Mismatch Finder](https://www.wikidata.org/wiki/Wikidata:Mismatch_Finder) via a request to [Mix'n'match](https://meta.wikimedia.org/wiki/Mix%27n%27match) data stores. Data will be formatted for upload given the [directions for creating a mismatch file](https://github.com/wmde/wikidata-mismatch-finder/blob/main/docs/UserGuide.md#creating-a-mismatches-import-file).

In [1]:
#!pip install jupyter-black
#!pip install tensorflow
#!pip install aiohttp

In [2]:
# %load_ext jupyter_black

In [3]:
import asyncio
import json
import ssl
import sys
import urllib

import aiohttp
import numpy as np
import pandas as pd
from tqdm import tqdm

import nest_asyncio
nest_asyncio.apply()

PATH_TO_UTILS = "../"  # change based on your directory structure
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting

## Get data

In [4]:
mnm_mismatch_request_url = (
    "https://mix-n-match.toolforge.org/api.php?query=all_issues&mode=time_mismatch"
)

In [5]:
with urllib.request.urlopen(mnm_mismatch_request_url) as url:
    mnm_mismatch_data = json.load(url)

In [6]:
mnm_mismatch_data_expanded = []
for d in mnm_mismatch_data["data"]:
    d["source"] = f"https://mix-n-match.toolforge.org/#/entry/{d['entry_id']}"
    d.pop("issue_id", None)
    d["time_mismatch"]["pid"] = d["time_mismatch"].pop("prop")
    d["time_mismatch"]["qid"] = d["time_mismatch"].pop("q")
    d["item_id"] = d["time_mismatch"]["qid"]

    mnm_mismatch_data_expanded.append(d)

## Explore data

In [7]:
print(f"{len(mnm_mismatch_data['data']):,}")

82,996


In [8]:
mnm_mismatch_data["data"][:2]

[{'entry_id': '44032422',
  'time_mismatch': {'wd_time': '+1925-01-01T00:00:00Z',
   'mnm_time': '+1926-07-04T00:00:00Z',
   'pid': 'P569',
   'qid': 'Q329124'},
  'source': 'https://mix-n-match.toolforge.org/#/entry/44032422',
  'item_id': 'Q329124'},
 {'entry_id': '115714460',
  'time_mismatch': {'wd_time': '+1998-09-19T00:00:00Z',
   'mnm_time': '+1987-04-17T00:00:00Z',
   'pid': 'P569',
   'qid': 'Q107654539'},
  'source': 'https://mix-n-match.toolforge.org/#/entry/115714460',
  'item_id': 'Q107654539'}]

In [9]:
mnm_mismatch_data_expanded[:2]

[{'entry_id': '44032422',
  'time_mismatch': {'wd_time': '+1925-01-01T00:00:00Z',
   'mnm_time': '+1926-07-04T00:00:00Z',
   'pid': 'P569',
   'qid': 'Q329124'},
  'source': 'https://mix-n-match.toolforge.org/#/entry/44032422',
  'item_id': 'Q329124'},
 {'entry_id': '115714460',
  'time_mismatch': {'wd_time': '+1998-09-19T00:00:00Z',
   'mnm_time': '+1987-04-17T00:00:00Z',
   'pid': 'P569',
   'qid': 'Q107654539'},
  'source': 'https://mix-n-match.toolforge.org/#/entry/115714460',
  'item_id': 'Q107654539'}]

In [10]:
mnm_mismatch_data_expanded = list(filter(lambda d: d["time_mismatch"]["wd_time"] != d["time_mismatch"]["mnm_time"], mnm_mismatch_data_expanded))
len(mnm_mismatch_data_expanded)

82996

## Sync implementation

In [12]:
acc = []
blank_entry = {"id": np.NAN, "value": {"type": "value", "content": {"time": np.NAN}}}
i = -1
for entry in tqdm(mnm_mismatch_data_expanded):
    i += 1
    data = entry["time_mismatch"]
    req = f"https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/{entry['item_id']}?_fields=statements"
    try:
        with urllib.request.urlopen(req) as url:
            wd_props = json.load(url)["statements"]
    except urllib.request.HTTPError as e:
        # Fixed in newer version https://stackoverflow.com/questions/67723860/python-urllib-request-urlopen-http-error-308-permanent-redirect.
        print("Skipped", req)
        print(e)
        continue

    with urllib.request.urlopen(f"https://mix-n-match.toolforge.org/api.php?query=get_entry&entry={entry['entry_id']}") as url:
        try:
            ext_url = json.load(url)["data"]["entries"][entry["entry_id"]]["ext_url"]
        except TypeError:
            # Sometimes API can return json.load(url)["data"]["entries"] == [].
            print("Skipping malformatted external URL:", entry["source"], "idx", i)
            continue

    nonnull_wd_vals = wd_props[data["pid"]] if data["pid"] in wd_props else [blank_entry]
    # Sometimes, wikidata has multiple incorrect values, so fix them all.
    for wd_val in nonnull_wd_vals:
        guid = wd_val["id"]

        # Eg: Q62900754 has a death date range, which doesn't play nice, so ignore it.
        if wd_val["value"]["type"] != "value":
            print(f"Skipping GUID {guid} on {entry['item_id']} {data['pid']} because it doesn't have a concrete value")
            continue

        wikidata_value = wd_val["value"]["content"]["time"]

        # Isn't actually a mismatch.
        if (wikidata_value == data["mnm_time"]):
            continue

        acc.append({
            "item_id": entry["item_id"],
            "statement_guid": guid,
            "property_id": data["pid"],
            "wikidata_value": wikidata_value,
            "meta_wikidata_value": np.NAN,
            "external_value": data["mnm_time"],
            "external_url": ext_url,
            "type": "statement",
        })

100%|██████████| 1/1 [00:00<00:00,  3.89it/s]


## Async implementation
(currently too fast)

In [None]:
acc = []

async def fetch(session, entry):
    data = entry["time_mismatch"]
    req = f"https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/{entry['item_id']}?_fields=statements"
    async with session.get(req, ssl=ssl.SSLContext()) as response:
        try:
            wd_props = (await response.json())["statements"]

        except:
            print(await response.text())

        pbar.update(0.5)

        url = f"https://mix-n-match.toolforge.org/api.php?query=get_entry&entry={entry['entry_id']}"
        async with session.get(url, ssl=ssl.SSLContext()) as response:
            ext_url = (await response.json())["data"]["entries"][entry["entry_id"]]["ext_url"]
            pbar.update(0.5)

            acc.append({
                "item_id": entry["item_id"],
                "statement_guid": wd_props[data["pid"]][0]["id"],
                "property_id": data["pid"],
                "wikidata_value": data["wd_time"],
                "meta_wikidata_value": np.NAN,
                "external_value": data["mnm_time"],
                "external_url": ext_url,
                "type": "statement",
            })

            return True


async def fetch_all(urls, loop):
    async with aiohttp.ClientSession(loop=loop) as session:  # read_timeout=None
        results = await asyncio.gather(*[fetch(session, url) for url in urls], return_exceptions=True)
        return results


if __name__ == "__main__":
    pbar = tqdm(total=len(mnm_mismatch_data_expanded))
    loop = asyncio.get_event_loop()
    status = loop.run_until_complete(fetch_all(mnm_mismatch_data_expanded, loop))
    pbar.close()

In [None]:
nn = list(filter(lambda x: x is not None, acc))
nn[2]

## Saving

In [14]:
mismatchDF = pd.DataFrame(acc)
mismatchDF

Unnamed: 0,item_id,statement_guid,property_id,wikidata_value,meta_wikidata_value,external_value,external_url,type
0,Q3081659,,P569,,,+1923-08-20T00:00:00Z,https://catalogue.bnf.fr/ark:/12148/cb135956276,statement


In [None]:
mismatchDF.to_csv("mismatches.csv", index=False)

## Review check of data

The original csv was saved without `index=False`, so we need to reload and resave it as a column for the index was created that's invalid for Mismatch Finder uploads.

In [None]:
mismatchDF = pd.read_csv("mismatches.csv", index_col=0)
print(len(mismatchDF))
mismatchDF.head()

In [None]:
check_mf_formatting(mismatchDF)