## Imports

In [72]:
import sys

In [73]:
PATH_TO_UTILS = "../../"
sys.path.append(PATH_TO_UTILS)

In [74]:
import pandas as pd

In [75]:
import json

In [76]:
from utils import check_mf_formatting

In [77]:
from datetime import datetime

In [78]:
import math

## Wikidata Data Loading

In [79]:
f = open("wikidata_data_dobs.json", "r")
wikidata_players_dobs = json.load(f)
f.close()

In [80]:
f = open("wikidata_titles.json", "r")
wikidata_players_titles = json.load(f)
f.close()

In [81]:
wikidata_players_dobs[0]

{'item': 'http://www.wikidata.org/entity/Q66850',
 'itemLabel': 'David Baramidze',
 'fideID': '4667719',
 'dateOfBirth': '1988-09-27T00:00:00Z',
 'dobStatementGUID': 'q66850-0808D72D-7D0F-4F17-814F-F4D77C633ACE'}

In [82]:
wikidata_players_titles[0]

{'item': 'http://www.wikidata.org/entity/Q92207',
 'itemLabel': 'Jörg Hickl',
 'fide_id': '4600118',
 'titleLabel': 'Grandmaster',
 'title_guid': 'Q92207-676BBD54-C3E3-47DF-89CA-0BDF64F2FE04',
 'title_start_time': '1988-01-01T00:00:00Z'}

## XML Parsing

In [83]:
def xml_to_dict(xml_file):
    # Initialize an empty dictionary to store player data.
    players_data = {}

    # Initialize variables to keep track of current player attributes.
    current_player = {}

    # Parse the XML file incrementally with iterparse.
    context = ET.iterparse(xml_file, events=("start", "end"))

    # Wrap the context with tqdm to create a progress bar.
    context = tqdm(context, desc="Parsing XML", unit="elements", leave=False)

    for event, elem in context:
        if event == "start" and elem.tag == "player":
            # Start of a player element, reset current_player dictionary.
            current_player = {}

        elif event == "end" and elem.tag == "player":
            # End of a player element, add current_player to players_data dictionary.
            player_id = current_player.get("fideid")
            if player_id:
                players_data[player_id] = current_player

        elif event == "end":
            # Process the end of an XML element inside a player element.
            current_player[elem.tag] = (
                elem.text.strip() if elem.text is not None else None
            )
            elem.clear()

    # Close the XML file.
    del context

    return players_data


xml_file = "players_list_xml_foa.xml"
fide_dict = xml_to_dict(xml_file)

                                                           

In [84]:
len(fide_dict)

1390912

In [85]:
fide_dict["1503014"]

{'fideid': '1503014',
 'name': 'Carlsen, Magnus',
 'country': 'NOR',
 'sex': 'M',
 'title': 'GM',
 'w_title': None,
 'o_title': None,
 'foa_title': None,
 'rating': '2830',
 'games': '0',
 'k': '10',
 'rapid_rating': '2823',
 'rapid_games': '0',
 'rapid_k': '10',
 'blitz_rating': '2886',
 'blitz_games': '0',
 'blitz_k': '10',
 'birthday': '1990',
 'flag': None}

In [86]:
wikidata_players_dobs[90]

{'item': 'http://www.wikidata.org/entity/Q98881',
 'itemLabel': 'Reinhart Fuchs',
 'fideID': '4611357',
 'dateOfBirth': '1934-09-28T00:00:00Z',
 'dobStatementGUID': 'Q98881-540E8F90-B4F0-4222-AD2E-958E7394F8B7'}

### Date of Birth Mismatches

In [87]:
# Does Not Take Into Account Deprecated Values

count = 0
f = open("invalid_fide.txt", "w")
f1 = open("incomplete_data.txt", "w")
lst = []
lst2 = []
for e in wikidata_players_dobs:
    try:
        if e["dateOfBirth"][:4] != fide_dict[e["fideID"]]["birthday"]:

            if fide_dict[e["fideID"]]["birthday"] is None:
                f1.write("Birthday not found on FIDE for FIDE id %s\n" % e["fideID"])
                f1.write("WIKIDATA: %s\n" % e["item"])
                continue
            external_url = "https://ratings.fide.com/profile/" + e["fideID"]
            row = {
                "item_id": e["item"][31:],
                "property_id": "P569",
                "statement_guid": e["dobStatementGUID"],
                "wikidata_value": e["dateOfBirth"],
                "meta_wikidata_value": "Q1985727",
                "external_value": fide_dict[e["fideID"]]["birthday"],
                "external_url": external_url,
                "type": "statement",
            }
            # df = df.append(row, ignore_index = True).
            lst.append(row)
            count += 1

    except KeyError as an_error:
        if an_error.args[0] == e["fideID"]:
            f.write("FIDE id not found %s\n" % e["fideID"])
            f.write("WIKIDATA: %s\n" % e["item"])

f.close()
f1.close()

df = pd.DataFrame(
    lst,
    columns=[
        "item_id",
        "statement_guid",
        "property_id",
        "wikidata_value",
        "meta_wikidata_value",
        "external_value",
        "external_url",
        "type",
    ],
)

In [88]:
df

Unnamed: 0,item_id,statement_guid,property_id,wikidata_value,meta_wikidata_value,external_value,external_url,type
0,Q77168,Q77168-37C631A8-7BE2-414A-A806-585C1BEE1EFB,P569,1971-05-01T00:00:00Z,Q1985727,1979,https://ratings.fide.com/profile/1004816,statement
1,Q103301,Q103301-1342BD85-9F9B-4795-B4D3-D5E9BEFE1B23,P569,1943-04-17T00:00:00Z,Q1985727,1945,https://ratings.fide.com/profile/4600185,statement
2,Q278844,Q278844-13624996-30D9-43C7-80D7-98294FB512D3,P569,1949-08-24T00:00:00Z,Q1985727,1946,https://ratings.fide.com/profile/900095,statement
3,Q326562,Q326562-16F44782-9AD3-4774-8E60-97C8B6F84F7F,P569,1942-01-01T00:00:00Z,Q1985727,1943,https://ratings.fide.com/profile/800139,statement
4,Q446773,Q446773-34830D20-DCDE-45D6-B865-38E55604179B,P569,1976-12-05T00:00:00Z,Q1985727,2008,https://ratings.fide.com/profile/15201759,statement
...,...,...,...,...,...,...,...,...
115,Q101530051,Q101530051-1232CC02-8C2E-4C6E-9F30-C2F52A9C84C4,P569,1941-01-01T00:00:00Z,Q1985727,1942,https://ratings.fide.com/profile/34161322,statement
116,Q115464159,Q115464159-2e278e84-4b29-5a27-679a-c76a6686e465,P569,1982-12-04T00:00:00Z,Q1985727,1994,https://ratings.fide.com/profile/3926370,statement
117,Q117225388,Q117225388-444e7389-43ef-3610-43f8-9ec6d50b14b5,P569,2001-01-01T00:00:00Z,Q1985727,2000,https://ratings.fide.com/profile/21873763,statement
118,Q122417673,Q122417673-9F8C8499-4F19-4246-B297-D5B7EF5D55BB,P569,1997-03-28T00:00:00Z,Q1985727,2000,https://ratings.fide.com/profile/13509403,statement


In [89]:
check_mf_formatting(df)

All checks have passed! The data is ready to be uploaded to Mismatch Finder.


In [90]:
df.to_csv("validated_players_data.csv", index=False)

### Chess Title Mismatches

In [91]:
# Only considers mismatches per item

In [92]:
full_forms_dict = {'Grandmaster':'GM', 'Woman Grandmaster': 'WGM', 'International Master':'IM', 'Woman International Master':'WIM', 'FIDE Master':'FM', 'Woman FIDE Master': 'WFM', 'Candidate Master':'CM', 'Woman Candidate Master':'WCM'}

In [93]:
wd_titles_df = pd.DataFrame(wikidata_players_titles)
wd_titles_df.set_index('fide_id', inplace=True)

In [94]:
wd_titles_df.groupby(['titleLabel'])['titleLabel'].value_counts()
#['titleLabel']

titleLabel
Candidate Master               2969
FIDE Master                   10642
Grandmaster                    1828
International Master           5748
Woman Candidate Master         1212
Woman FIDE Master              2368
Woman Grandmaster               506
Woman International Master     1335
Name: count, dtype: int64

In [95]:
wd_titles_df = wd_titles_df.fillna('')

In [96]:
wd_titles_df.loc['3836517']

item                      http://www.wikidata.org/entity/Q125258192
itemLabel                               Laura Sofia Huayhuas Robles
titleLabel                                   Woman Candidate Master
title_guid          Q125258192-6BD7FD5A-D92C-4EED-8CA0-4C1BE3146750
title_start_time                               2024-01-01T00:00:00Z
Name: 3836517, dtype: object

In [97]:
wd_titles_df.loc['1741721']

Unnamed: 0_level_0,item,itemLabel,titleLabel,title_guid,title_start_time
fide_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1741721,http://www.wikidata.org/entity/Q108377878,William Olsson,FIDE Master,Q108377878-C8CFB2A3-7FF9-456D-8981-2F1072B272F3,2021-08-01T00:00:00Z
1741721,http://www.wikidata.org/entity/Q108377878,William Olsson,International Master,Q108377878-be69ba7f-427b-948c-c77c-899037e3c433,2024-01-31T00:00:00Z


In [98]:
count = 0
lst = []
lst2 = []
f = open("invalid_fide2.txt", 'w')
for fid in wd_titles_df.index.unique():
    
    sub_df = wd_titles_df.loc[fid]
    try:
            
        if isinstance(sub_df, pd.DataFrame): # multiple rows
            
            potential_mismatch_indexes = []
            sub_df = sub_df.reset_index()
            for i, s in sub_df.iterrows():
                if (fide_dict[fid]['title'] == None): # FIDE has no title listed
                    if s['title_start_time']: # No title start time on Wikidata
                        if (s['title_start_time'] >= '2024-02-01T00:00:00Z' or # XML was from Feb 2024
                            s['title_start_time'] == '2024-01-01T00:00:00Z'): # Refers to ambiguous "2024" start time
                            continue
    
                    external_url = 'https://ratings.fide.com/profile/' + fid
                    row = {'item_id': s['item'][31:], 
                           'property_id': 'P2962',  
                           'statement_guid': s['title_guid'], 
                           'wikidata_value': s['titleLabel'], 
                           'meta_wikidata_value': None, 
                           'external_value': fide_dict[fid]['title'], 
                           'external_url': external_url, 
                           'type':'statement'}
                    lst.append(row)
                    continue
                    
                    
                if (full_forms_dict[s['titleLabel']] != fide_dict[fid]['title']): # potential mismatch!
                    if s['title_start_time']:
                        if (s['title_start_time'] >= '2024-02-01T00:00:00Z' or 
                            s['title_start_time'] == '2024-01-01T00:00:00Z'): 
                            break
                    potential_mismatch_indexes.append(i)
                else:
                    break # no real mismatch - move on to next fid
            else: # no match found within sub_df
                for i in potential_mismatch_indexes:
                    
                    external_url = 'https://ratings.fide.com/profile/' + fid
                    row = {'item_id': sub_df.iloc[i]['item'][31:], 
                           'property_id': 'P2962', 
                           'statement_guid': sub_df.iloc[i]['title_guid'], 
                           'wikidata_value': sub_df.iloc[i]['titleLabel'], 
                           'meta_wikidata_value': None, 
                           'external_value': fide_dict[fid]['title'], 
                           'external_url': external_url, 
                           'type':'statement'}
                    lst.append(row)
        
        else:
            
            # sub_df is just a series
                
            if (fide_dict[fid]['title'] == None):
                if sub_df['title_start_time']:
                    if (sub_df['title_start_time'] >= '2024-02-01T00:00:00Z' or 
                        sub_df['title_start_time'] == '2024-01-01T00:00:00Z'): 
                        continue
                external_url = 'https://ratings.fide.com/profile/' + fid
                row = {'item_id': sub_df['item'][31:], 
                       'property_id': 'P2962', 
                       'statement_guid': sub_df['title_guid'], 
                       'wikidata_value': sub_df['titleLabel'], 
                       'meta_wikidata_value': None, 
                       'external_value': fide_dict[fid]['title'], 
                       'external_url': external_url, 
                       'type':'statement'}
                lst.append(row)
                continue

            if (full_forms_dict[sub_df['titleLabel']] != fide_dict[fid]['title']): # mismatch
                if sub_df['title_start_time']:
                    if (sub_df['title_start_time'] >= '2024-02-01T00:00:00Z' or 
                        sub_df['title_start_time'] == '2024-01-01T00:00:00Z'): continue

                external_url = 'https://ratings.fide.com/profile/' + fid
                row = {'item_id': sub_df['item'][31:], 
                       'property_id': 'P2962', 
                       'statement_guid': sub_df['title_guid'], 
                       'wikidata_value': sub_df['titleLabel'], 
                       'meta_wikidata_value': None, 
                       'external_value': fide_dict[fid]['title'], 
                       'external_url': external_url, 
                       'type':'statement'}
                lst.append(row)
            
    except KeyError as an_error: 
        if (an_error.args[0] == fid):
            f.write("FIDE id not found %s\n" % fid)
            f.write("WIKIDATA: %s\n" % sub_df['item'])

df = pd.DataFrame(lst, columns=["item_id", "statement_guid", "property_id", "wikidata_value", "meta_wikidata_value", "external_value", "external_url", "type"])

f.close()



In [99]:
df

Unnamed: 0,item_id,statement_guid,property_id,wikidata_value,meta_wikidata_value,external_value,external_url,type
0,Q211194,Q211194-CAE3E2B5-9512-4972-B450-18E9B3810516,P2962,International Master,,,https://ratings.fide.com/profile/1201271,statement
1,Q111461115,Q111461115-18AD690E-38B4-4346-A3E2-0D2C37DD0130,P2962,Candidate Master,,FM,https://ratings.fide.com/profile/44523793,statement
2,Q111461361,Q111461361-586260B3-0A62-4791-90F8-FAC778FA6F3E,P2962,Woman Candidate Master,,WFM,https://ratings.fide.com/profile/2134470,statement
3,Q113393895,Q113393895-FAC6DC01-8863-4AA8-A5F5-C7239FC4C423,P2962,Candidate Master,,FM,https://ratings.fide.com/profile/30940192,statement
4,Q115556931,Q115556931-E2EA9F78-E479-431C-B788-7BFDF3D7FD17,P2962,Candidate Master,,FM,https://ratings.fide.com/profile/35066188,statement
...,...,...,...,...,...,...,...,...
154,Q71318493,Q71318493-4538C3C9-735B-435A-A92F-20EA55B51E1C,P2962,FIDE Master,,IM,https://ratings.fide.com/profile/6367879,statement
155,Q102659968,Q102659968-002a75f4-4049-f7ed-5f6f-0de36cf4b2e5,P2962,FIDE Master,,WFM,https://ratings.fide.com/profile/34104868,statement
156,Q108407169,Q108407169-6EEF76C3-B108-4376-9EC1-4FA3E4395AD0,P2962,FIDE Master,,IM,https://ratings.fide.com/profile/30931975,statement
157,Q108419448,Q108419448-9CC93D97-FBFD-4DDC-A34A-737A806EFD90,P2962,Candidate Master,,FM,https://ratings.fide.com/profile/22272836,statement


In [100]:
df['statement_guid'] = df['statement_guid'].str.replace('-', '$', 1)

In [101]:
df['external_value'] = df['external_value'].fillna('')

In [102]:
check_mf_formatting(df)

All checks have passed! The data is ready to be uploaded to Mismatch Finder.


In [103]:
df.to_csv('validated_titles_data.csv', index=False)