Data taken from: [Kensho Derived Wikimedia Dataset](https://www.kaggle.com/datasets/kenshoresearch/kensho-derived-wikimedia-data)

Potential topics:
 - [astronomical object](https://www.wikidata.org/wiki/Q17444909)
 - [astronomy](https://www.wikidata.org/wiki/Q333)
 - [planetary science](https://www.wikidata.org/wiki/Q104499)


In [2]:
from pathlib import Path
import pickle
import networkx as nx
import numpy as np
import pandas as pd


In [3]:
with open('df_filtered.pickle', 'rb') as handle:
    data_filtered = pickle.load(handle)

In [5]:
data_filtered.head(5)

Unnamed: 0_level_0,Work_of_art
qid,Unnamed: 1_level_1
149,63412991
844,24856
904,21198342
1351,1344
7216,11424


In [2]:
!du -h -d 1 data/archive/* | sort -hr


'du' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
DATA_DIR = Path("data/archive")

In [3]:
def get_mem_usage(df):
    return round(df.memory_usage(index=True, deep=True).sum() / 1024 ** 2)

# Statements

In [4]:
qpq_df = pd.read_csv(DATA_DIR / "statements.csv", dtype=np.uint32)
qpq_df

Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,1,279,3695190
2,1,398,497745
3,1,398,1133705
4,1,398,1139177
...,...,...,...
141206848,77257484,59,9286
141206849,77257491,31,318
141206850,77257491,59,9286
141206851,77257493,31,318


In [5]:
get_mem_usage(qpq_df) # 1616 MB

1616

In [5]:
qpq_df.max(axis=0)

source_item_id      77257493
edge_property_id        7643
target_item_id      77219312
dtype: uint32

In [6]:
## keep cases were it's an instance of 

qpq_df = qpq_df.loc[
    (qpq_df["edge_property_id"] == 31) | (qpq_df["edge_property_id"] == 279)
].drop_duplicates()
qpq_df


Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,1,279,3695190
37,2,31,3504248
95,3,31,937228
96,3,279,203872
...,...,...,...
141206843,77257472,31,318
141206845,77257483,31,318
141206847,77257484,31,318
141206849,77257491,31,318


# Subclass Graphs

In [7]:
import pickle

In [10]:
#property_graph = nx.DiGraph()

#property_graph.add_edges_from(
#    qpq_df[["source_item_id", "target_item_id"]].values
#)


In [11]:
#with open('property_graph_all.pickle', 'wb') as handle:
 #   pickle.dump(property_graph, handle)

In [8]:
with open('property_graph_all.pickle', 'rb') as handle:
    property_graph = pickle.load(handle)

In [None]:
handle.close()

In [None]:
root_qids = {
    "Work_of_art": 838948,  # https://www.wikidata.org/wiki/Q66715801 (musical_profession)
} 

In [None]:
qpq_df[qpq_df["target_item_id"] == 838948]

Unnamed: 0,source_item_id,edge_property_id,target_item_id
436650,23323,31,838948
454077,24856,279,838948
518873,31732,279,838948
610416,40831,279,838948
1614800,118807,279,838948
...,...,...,...
141076766,77113046,31,838948
141076773,77113048,31,838948
141076780,77113053,31,838948
141076787,77113058,31,838948


In [None]:
subclass_qids = {
    lbl: set(nx.ancestors(property_graph, qid)).union(set([qid]))
    for lbl, qid in root_qids.items()
}

# Subclass Signatures

In [None]:
df = pd.DataFrame(index=set(qpq_df.index))
df.index.name = 'qid'

In [None]:
df.head()

0
1
67108865
67108866
67108867


In [None]:
qpq_signature_dfs = {}
#mask1 = qpq_df["edge_property_id"] == 31  ### orig
mask1 = qpq_df['edge_property_id']==(31 or 279) ## alt

### P31 : that class of which this subject is a particular example and member - is a
for lbl, qid in root_qids.items():
    mask2 = qpq_df["target_item_id"].isin(subclass_qids[lbl])
    qpq_signature_dfs[lbl] = qpq_df[mask1 & mask2][["source_item_id", "target_item_id"]]

    qpq_signature_dfs[lbl].set_index("source_item_id", drop=True, inplace=True)
    qpq_signature_dfs[lbl].index.name = "qid"

    # de-duplicate index
    qpq_signature_dfs[lbl] = qpq_signature_dfs[lbl][~qpq_signature_dfs[lbl].index.duplicated()]

    # add to dataframe
    df[lbl] = qpq_signature_dfs[lbl]["target_item_id"]


In [None]:
del mask1
del mask2
del property_graph
del qpq_signature_dfs
del qpq_df

In [None]:
df = df.fillna(0).astype(np.int)
df = df[df["Work_of_art"] > 0]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df = df.fillna(0).astype(np.int)


In [None]:
df

Unnamed: 0_level_0,Work_of_art
qid,Unnamed: 1_level_1
149,63412991
844,24856
904,21198342
1351,1344
7216,11424
...,...
67102008,24862
67102980,371752
67103629,11424
67103743,11424


In [29]:
with open('df_filtered.pickle', 'wb') as handle:
    pickle.dump(df, handle)

In [23]:
items = pd.read_csv("./data/archive/item.csv")

In [24]:
filtered_items = items[items["item_id"].isin(pd.unique(df.index))].copy()

In [25]:
filtered_items_JUREK = items[items["item_id"].isin(pd.unique(df["Work_of_art"]))].copy()

In [26]:
del items

## output

In [27]:
Wojtek = ["CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG", "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART"]

In [28]:
for item in Wojtek:
    print(item, spacy.explain(item))

NameError: name 'spacy' is not defined

In [None]:
import gc
gc.collect()

25533139