# Kensho Derived Wikimedia Dataset (KDWD) - Wikidata Small Ontology

Let's try to create a small set of classes for Wikidata items (person, place, state, organization (or company?)) so that we can filter Wikidata items by class.

In [18]:
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd


In [16]:
# Presumably the minimum number of statements an item needs in order to be
# kept - TODO confirm against the code that consumes this constant.
MIN_STATEMENTS = 1

In [9]:
# Show the on-disk size of every file in the dataset directory, largest first.
!du -h -d 1 data/archive/* | sort -hr


17G	data/archive/link_annotated_text.jsonl
3.9G	data/archive/item.csv
2.5G	data/archive/statements.csv
210M	data/archive/page.csv
193M	data/archive/item_aliases.csv
688K	data/archive/property.csv
200K	data/archive/property_aliases.csv


In [2]:
# Directory holding the KDWD files, relative to the notebook's working directory.
DATA_DIR = Path("data/archive")

In [3]:
def get_mem_usage(df):
    """Return the memory footprint of ``df`` in mebibytes, rounded to an integer.

    The measurement is "deep" (object columns are inspected rather than
    counted as pointers) and includes the index.
    """
    bytes_per_mib = 1024 ** 2
    total_bytes = df.memory_usage(index=True, deep=True).sum()
    return round(total_bytes / bytes_per_mib)

# Properties



In [7]:
# Load the property table and keep only the two properties used below:
# 31 ("instance of") and 279 ("subclass of").
p_df = pd.read_csv(DATA_DIR / "property.csv").loc[
    lambda props: props["property_id"].isin([31, 279])
]
p_df

Unnamed: 0,property_id,en_label,en_description
15,31,instance of,that class of which this subject is a particul...
170,279,subclass of,all instances of these items are instances of ...


# Statements

In [11]:
# Read the statement triples (source item, property, target item); every
# column is parsed as an unsigned 32-bit integer.
statements_path = DATA_DIR / "statements.csv"
qpq_df = pd.read_csv(statements_path, dtype=np.uint32)
qpq_df

Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,1,279,3695190
2,1,398,497745
3,1,398,1133705
4,1,398,1139177
...,...,...,...
141206848,77257484,59,9286
141206849,77257491,31,318
141206850,77257491,59,9286
141206851,77257493,31,318


In [12]:
# Footprint of the full statements table, in MiB (get_mem_usage divides by 1024**2).
get_mem_usage(qpq_df) # 1616 MB

1616

In [13]:
# Largest value found in each column of the statements table.
qpq_df.max(axis=0)

source_item_id      77257493
edge_property_id        7643
target_item_id      77219312
dtype: uint32

In [14]:
# Keep only the "instance of" (31) and "subclass of" (279) statements,
# then drop rows that are exact duplicates.
is_class_edge = qpq_df["edge_property_id"].isin([31, 279])
qpq_df = qpq_df[is_class_edge].drop_duplicates()
qpq_df


Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,1,279,3695190
37,2,31,3504248
95,3,31,937228
96,3,279,203872
...,...,...,...
141206843,77257472,31,318
141206845,77257483,31,318
141206847,77257484,31,318
141206849,77257491,31,318


In [15]:
# Footprint in MiB once only the property 31 / 279 rows remain.
get_mem_usage(qpq_df)  # 529 MB


529

# Subclass Graphs

In [20]:
%%time
# Directed graph of the "subclass of" (279) statements: one edge per statement,
# pointing from the subclass item to the class it is a subclass of.
is_subclass_of = qpq_df["edge_property_id"] == 279
subclass_edges = qpq_df.loc[
    is_subclass_of, ["source_item_id", "target_item_id"]
].to_numpy()

p279g = nx.DiGraph()
p279g.add_edges_from(subclass_edges)


CPU times: user 9.33 s, sys: 675 ms, total: 10 s
Wall time: 10 s


In [22]:
# Root Wikidata class for each coarse entity label. Values are the numeric part
# of the QID (e.g. 5 for Q5).
root_qids = {
    "per": 5,  # Q5 human: https://www.wikidata.org/wiki/Q5
    "loc": 2221906,  # Q2221906 geographic location: https://www.wikidata.org/wiki/Q2221906
    "org": 43229,  # Q43229 organization (social entity, not necessarily commercial): https://www.wikidata.org/wiki/Q43229
    "state": 7275,  # Q7275 state: https://www.wikidata.org/wiki/Q7275
}


In [23]:
# For each label, collect the root class together with every item that reaches
# it through "subclass of" edges. Edges run subclass -> superclass, so the
# graph "ancestors" of a root are exactly its (transitive) subclasses.
subclass_qids = {}
for lbl, qid in root_qids.items():
    subclass_qids[lbl] = nx.ancestors(p279g, qid) | {qid}

# Subclass Signatures

In [None]:
# `keep_source_item_ids` was never defined in this notebook, so this cell failed
# with a NameError on a fresh kernel. Build it here: every source item that has
# at least MIN_STATEMENTS rows in `qpq_df`.
# NOTE(review): `qpq_df` only holds property 31 / 279 statements at this point,
# so the count is over those statements only. With MIN_STATEMENTS = 1 this
# keeps every item that can receive a label below - confirm the intended
# semantics before raising the threshold.
statement_counts = qpq_df["source_item_id"].value_counts()
keep_source_item_ids = (
    statement_counts[statement_counts >= MIN_STATEMENTS].index.sort_values()
)

# Empty frame indexed by item id; one column per label is added later.
df = pd.DataFrame(index=keep_source_item_ids)
df.index.name = "qid"

In [None]:
# For every label, find the items that are an "instance of" (property 31) some
# class in that label's subclass set, and record the matching class id per item.
qpq_signature_dfs = {}
is_instance_of = qpq_df["edge_property_id"] == 31

for lbl in root_qids:
    in_label_classes = qpq_df["target_item_id"].isin(subclass_qids[lbl])

    # item id -> matching class id, indexed by item id under the name "qid"
    signature = (
        qpq_df.loc[
            is_instance_of & in_label_classes,
            ["source_item_id", "target_item_id"],
        ]
        .set_index("source_item_id")
        .rename_axis("qid")
    )

    # An item can match several classes of the same label; keep its first row.
    signature = signature[~signature.index.duplicated()]
    qpq_signature_dfs[lbl] = signature

    # Index-aligned assignment: items without a match get NaN in this column.
    df[lbl] = signature["target_item_id"]

In [None]:
# Items with no match for a label are NaN after the aligned assignment; encode
# them as 0 and store the matched class ids as integers.
# The builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24
# (it was only ever an alias for the builtin, so the resulting dtype is the same).
df = df.fillna(0).astype(int)
df

## output

In [None]:
# Report the table shape, then the number of rows with a hit in at least one of
# the four label columns.
print(df.shape)
matched_any_label = (
    (df["org"] > 0) | (df["state"] > 0) | (df["loc"] > 0) | (df["per"] > 0)
)
print("old filtering would have given:", df.loc[matched_any_label].shape[0])

In [None]:
# Keep only the items that matched at least one label (any column > 0).
# `axis` is passed by keyword because `DataFrame.any` accepts keyword-only
# arguments from pandas 2.0 on; the positional form `.any(1)` raises a TypeError there.
df = df.loc[(df > 0).any(axis=1)]
print(df.shape)

In [None]:
# Write the label table, including its index, as a gzip-compressed CSV.
df.to_csv(
    "wikidata_ner_entities_v2.csv.gz",
    compression="gzip",
    index=True,
)