# Clean Fide Mismatches

Notebook to check the generated FIDE mismatches. What we're looking for is:
- Those that are for Wikidata items that have a profession other than chess player ([Q10873124](https://www.wikidata.org/wiki/Q10873124))
- For the others, check for entries that have a date of birth that is not the one on FIDE

In [1]:
# pip install jupyter-black

In [2]:
%load_ext jupyter_black

## Imports and Load Data

In [2]:
import requests

import pandas as pd
from tqdm.auto import tqdm

In [12]:
def rest_api_get_request(wd_qid: str, term: str = ""):
    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
    request_string = api_endpoint + "/entities/items/" + f"{wd_qid}"
    if term != "":
        request_string += f"/{term}"

    request = requests.get(request_string)

    return request.json()

In [7]:
df_fide_mismatches = pd.read_csv("validated_players_data.csv")
len(df_fide_mismatches)

120

In [9]:
df_fide_mismatches.head()

Unnamed: 0,item_id,statement_guid,property_id,wikidata_value,meta_wikidata_value,external_value,external_url,type
0,Q77168,Q77168-37C631A8-7BE2-414A-A806-585C1BEE1EFB,P569,1971-05-01T00:00:00Z,Q1985727,1979,https://ratings.fide.com/profile/1004816,statement
1,Q103301,Q103301-1342BD85-9F9B-4795-B4D3-D5E9BEFE1B23,P569,1943-04-17T00:00:00Z,Q1985727,1945,https://ratings.fide.com/profile/4600185,statement
2,Q278844,Q278844-13624996-30D9-43C7-80D7-98294FB512D3,P569,1949-08-24T00:00:00Z,Q1985727,1946,https://ratings.fide.com/profile/900095,statement
3,Q326562,Q326562-16F44782-9AD3-4774-8E60-97C8B6F84F7F,P569,1942-01-01T00:00:00Z,Q1985727,1943,https://ratings.fide.com/profile/800139,statement
4,Q446773,Q446773-34830D20-DCDE-45D6-B865-38E55604179B,P569,1976-12-05T00:00:00Z,Q1985727,2008,https://ratings.fide.com/profile/15201759,statement


In [36]:
# We have some duplicate IDs.
len(set(df_fide_mismatches["item_id"]))

118

## Derive Occupation and DOB Data

In [56]:
OCCUPATION_PID = "P106"
DATA_OF_BIRTH_PID = "P569"

CHESS_PLAYER_QID = "Q10873124"

In [21]:
test_occupations = [
    r["value"]["content"]
    for r in rest_api_get_request(wd_qid="Q77168", term="statements")[OCCUPATION_PID]
]
test_occupations

['Q10873124']

In [26]:
test_dobs = [
    int(r["value"]["content"]["time"].replace("+", "").split("-")[0])
    for r in rest_api_get_request(wd_qid="Q77168", term="statements")[DATA_OF_BIRTH_PID]
]
test_dobs

[1971, 1979]

In [32]:
qid_to_occupation_dob_dict = {}

for qid in tqdm(df_fide_mismatches["item_id"], desc="QIDs", unit="qids"):
    qid_statements = rest_api_get_request(wd_qid=qid, term="statements")

    if OCCUPATION_PID in qid_statements:
        qid_occupation = [r["value"]["content"] for r in qid_statements[OCCUPATION_PID]]

    qid_years_of_birth = [
        int(r["value"]["content"]["time"].replace("+", "").split("-")[0])
        for r in qid_statements[DATA_OF_BIRTH_PID]
    ]

    qid_to_occupation_dob_dict[qid] = {}

    if OCCUPATION_PID in qid_statements:
        qid_to_occupation_dob_dict[qid]["occupations"] = qid_occupation
    qid_to_occupation_dob_dict[qid]["years_of_birth"] = qid_years_of_birth

QIDs:   0%|          | 0/120 [00:00<?, ?qids/s]

In [35]:
len(qid_to_occupation_dob_dict)

118

## Find QIDs to Upload

In [34]:
first_dict_value = next(iter(qid_to_occupation_dob_dict.values()))
first_dict_value

{'occupations': ['Q10873124'], 'years_of_birth': [1971, 1979]}

In [44]:
qid_to_occupation_dob_dict["Q77168"]

{'occupations': ['Q10873124'], 'years_of_birth': [1971, 1979]}

In [65]:
qids_to_keep = []

for i in df_fide_mismatches.index:
    if (
        "occupations"
        in qid_to_occupation_dob_dict[df_fide_mismatches.loc[i, "item_id"]]
    ):
        occupations = qid_to_occupation_dob_dict[df_fide_mismatches.loc[i, "item_id"]][
            "occupations"
        ]

    years_of_birth = qid_to_occupation_dob_dict[df_fide_mismatches.loc[i, "item_id"]][
        "years_of_birth"
    ]

    # Keep based on occupation if there's more than one or only one and it's not chess player.
    if (
        "occupations"
        in qid_to_occupation_dob_dict[df_fide_mismatches.loc[i, "item_id"]]
    ):
        if len(occupations) > 1 or (
            len(occupations) == 1 and occupations[0] != CHESS_PLAYER_QID
        ):
            qids_to_keep.append(df_fide_mismatches.loc[i, "item_id"])

    # Keep based on year of birth if the value on FIDE doesn't match one of the Wikidata years.
    if df_fide_mismatches.loc[i, "external_value"] not in years_of_birth:
        qids_to_keep.append(df_fide_mismatches.loc[i, "item_id"])

qids_to_keep = list(set(qids_to_keep))

In [66]:
len(qids_to_keep)

68

In [67]:
qids_to_keep[:5]

['Q1657668', 'Q20561970', 'Q4363644', 'Q71312184', 'Q16541128']

In [68]:
df_fide_mismatches[df_fide_mismatches["item_id"].isin(qids_to_keep[:5])]

Unnamed: 0,item_id,statement_guid,property_id,wikidata_value,meta_wikidata_value,external_value,external_url,type
17,Q1657668,Q1657668-EB12A11F-E30B-43E3-BE5A-45DA9E5A2258,P569,1938-05-27T00:00:00Z,Q1985727,1937,https://ratings.fide.com/profile/4103017,statement
20,Q4363644,Q4363644-B40B42BD-F8C4-4491-BA18-7EAE26260F9B,P569,1961-10-12T00:00:00Z,Q1985727,1964,https://ratings.fide.com/profile/4101731,statement
24,Q16541128,Q16541128-BFB14768-AC95-4FD9-881B-71914837D05C,P569,1952-04-06T00:00:00Z,Q1985727,1953,https://ratings.fide.com/profile/408786,statement
34,Q20561970,Q20561970-4d549c05-45d5-f4de-c9ce-077fd2a6087b,P569,1946-01-01T00:00:00Z,Q1985727,1948,https://ratings.fide.com/profile/233862,statement
97,Q71312184,Q71312184-CEF657AC-9558-4849-8440-90A63E993F3A,P569,1980-01-01T00:00:00Z,Q1985727,1987,https://ratings.fide.com/profile/2271192,statement


In [71]:
df_fide_mismatches_to_upload = df_fide_mismatches[
    df_fide_mismatches["item_id"].isin(qids_to_keep)
]

In [74]:
df_fide_mismatches_to_upload.to_csv(
    "validated_players_data_upload.csv", encoding="utf-8", index=False
)

## Fixing GUID Values

In [3]:
df_fide_mismatches_to_upload = pd.read_csv(
    "validated_players_data_upload.csv"
)

In [5]:
df_fide_mismatches_to_upload["statement_guid"] = df_fide_mismatches_to_upload[
    "statement_guid"
].str.replace("-", "$", 1)

In [7]:
df_fide_mismatches_to_upload.to_csv(
    "validated_players_data_upload.csv", encoding="utf-8", index=False
)