Data taken from: [Kensho Derived Wikimedia Dataset](https://www.kaggle.com/datasets/kenshoresearch/kensho-derived-wikimedia-data)

Potential topics:
 - [astronomical object](https://www.wikidata.org/wiki/Q17444909)
 - [astronomy](https://www.wikidata.org/wiki/Q333)
 - [planetary science](https://www.wikidata.org/wiki/Q104499)


In [1]:
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd


In [2]:
!du -h -d 1 data/archive/* | sort -hr


'du' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
DATA_DIR = Path("data/archive")

In [4]:
def get_mem_usage(df):
    return round(df.memory_usage(index=True, deep=True).sum() / 1024 ** 2)

# Properties



In [5]:
p_df = pd.read_csv(DATA_DIR / "property.csv")
p_df = p_df[(p_df["property_id"] == 31) | (p_df["property_id"] == 279)] # instance of, subclass of
p_df

Unnamed: 0,property_id,en_label,en_description
15,31,instance of,that class of which this subject is a particul...
170,279,subclass of,all instances of these items are instances of ...


# Statements

In [6]:
qpq_df = pd.read_csv(DATA_DIR / "statements.csv", dtype=np.uint32)
qpq_df

Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,1,279,3695190
2,1,398,497745
3,1,398,1133705
4,1,398,1139177
...,...,...,...
141206848,77257484,59,9286
141206849,77257491,31,318
141206850,77257491,59,9286
141206851,77257493,31,318


In [7]:
get_mem_usage(qpq_df) # 1616 MB

1616

In [8]:
qpq_df.max(axis=0)

source_item_id      77257493
edge_property_id        7643
target_item_id      77219312
dtype: uint32

In [9]:
## keep cases were it's an instance of / is a sublcass of

qpq_df = qpq_df.loc[
    (qpq_df["edge_property_id"] == 31) | (qpq_df["edge_property_id"] == 279)
].drop_duplicates()
qpq_df


Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,1,279,3695190
37,2,31,3504248
95,3,31,937228
96,3,279,203872
...,...,...,...
141206843,77257472,31,318
141206845,77257483,31,318
141206847,77257484,31,318
141206849,77257491,31,318


In [10]:
# items = pd.read_csv(DATA_DIR / "item.csv")

In [11]:
# items.query("item_id == 1")

In [12]:
# items.query("item_id == 36906466")

In [13]:
# items.query("item_id == 3695190")[["en_label", "en_description"]]


In [14]:
get_mem_usage(qpq_df)  # 529 MB


529

# Subclass Graphs

In [15]:
import pickle

In [16]:
#property_graph = nx.DiGraph()

#property_graph.add_edges_from(
#    qpq_df[["source_item_id", "target_item_id"]].values
#)


In [17]:
#with open('property_graph_all.pickle', 'wb') as handle:
 #   pickle.dump(property_graph, handle)

In [18]:
with open('property_graph_all.pickle', 'rb') as handle:
    property_graph = pickle.load(handle)

In [None]:
# Earth <-- instance of <-- planet <-- instance of <-- astronomical object

In [20]:
root_qids = {
    "astronomical_object_type": 17444909,  # https://www.wikidata.org/wiki/Q17444909 (astronomical object type)
    "astronomy": 333, # https://www.wikidata.org/wiki/Q333 (astronomy)
    "planetary_science": 104499, # https://www.wikidata.org/wiki/Q104499 (planetary science)
} 


In [33]:
qpq_df[qpq_df["target_item_id"] == 333]

Unnamed: 0,source_item_id,edge_property_id,target_item_id
28574,411,279,333
171634,6440,279,333
579453,37547,279,333
844317,58908,279,333
2190193,181505,279,333
2216883,184274,279,333
4132588,430570,31,333
4793398,502038,279,333
5338468,560575,279,333
5771900,622752,279,333


In [21]:
subclass_qids = {
    lbl: set(nx.ancestors(property_graph, qid)).union(set([qid]))
    for lbl, qid in root_qids.items()
}

# Subclass Signatures

In [22]:
df = pd.DataFrame(index=set(qpq_df.index))
df.index.name = 'qid'

In [23]:
df.head()

0
1
67108865
67108866
67108867


In [34]:
qpq_signature_dfs = {}
#mask1 = qpq_df["edge_property_id"] == 31  ### orig
mask1 = qpq_df['edge_property_id']==(31 or 279) ## alt

### P31 : that class of which this subject is a particular example and member - is a
for lbl, qid in root_qids.items():
    mask2 = qpq_df["target_item_id"].isin(subclass_qids[lbl])
    qpq_signature_dfs[lbl] = qpq_df[mask1 & mask2][["source_item_id", "target_item_id"]]

    qpq_signature_dfs[lbl].set_index("source_item_id", drop=True, inplace=True)
    qpq_signature_dfs[lbl].index.name = "qid"

    # de-duplicate index
    qpq_signature_dfs[lbl] = qpq_signature_dfs[lbl][~qpq_signature_dfs[lbl].index.duplicated()]

    # add to dataframe
    df[lbl] = qpq_signature_dfs[lbl]["target_item_id"]


In [36]:
df

Unnamed: 0_level_0,astronomical_object_type,astronomy
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,,
1,,
67108865,,
67108866,,
67108867,,
...,...,...
134217714,,
67108860,,
67108861,,
67108862,,


In [37]:
df = df.fillna(0).astype(np.int)
df

Unnamed: 0_level_0,astronomical_object_type,astronomy
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,0,0
67108865,0,0
67108866,0,0
67108867,0,0
...,...,...
134217714,0,0
67108860,0,0
67108861,0,0
67108862,0,0


In [39]:
df[df["astronomical_object_type"] > 0]

Unnamed: 0_level_0,astronomical_object_type,astronomy
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
193,121750,0
4202,17444909,0
6458,17444909,0
13766,523,0
13768,523,0
...,...,...
66758466,3863,0
66827948,83373,0
66932863,44559,0
66956019,523,0


## output

In [None]:
print(df.shape)
print("old filtering would have given:", df.loc[(df['org'] > 0) |(df['state'] > 0) |(df['loc'] > 0) |(df['per'] > 0)  ].shape[0])

In [None]:
df = df.loc[(df>0).any(1)]  ## is any value over 0 
print(df.shape)

In [None]:
df.to_csv("wikidata_ner_entities_v2.csv.gz",index=True,compression="gzip")