Data taken from: [Kensho Derived Wikimedia Dataset](https://www.kaggle.com/datasets/kenshoresearch/kensho-derived-wikimedia-data)

Potential topics:
 - [astronomical object](https://www.wikidata.org/wiki/Q17444909)
 - [astronomy](https://www.wikidata.org/wiki/Q333)
 - [planetary science](https://www.wikidata.org/wiki/Q104499)


In [16]:
from pathlib import Path
import pickle
import networkx as nx
import numpy as np
import pandas as pd


In [17]:
with open('df_filtered.pickle', 'rb') as handle:
    data_filtered = pickle.load(handle)

In [18]:
data_filtered.head(5)

Unnamed: 0_level_0,Work_of_art
qid,Unnamed: 1_level_1
149,63412991
844,24856
904,21198342
1351,1344
7216,11424


In [19]:
!du -h -d 1 data/archive/* | sort -hr


'du' is not recognized as an internal or external command,
operable program or batch file.


In [20]:
DATA_DIR = Path("data/archive")

In [21]:
def get_mem_usage(df):
    return round(df.memory_usage(index=True, deep=True).sum() / 1024 ** 2)

# Statements

In [22]:
qpq_df = pd.read_csv(DATA_DIR / "statements.csv", dtype=np.uint32)
qpq_df

Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,1,279,3695190
2,1,398,497745
3,1,398,1133705
4,1,398,1139177
...,...,...,...
141206848,77257484,59,9286
141206849,77257491,31,318
141206850,77257491,59,9286
141206851,77257493,31,318


In [29]:
items = pd.read_csv(DATA_DIR / "item.csv")
items

Unnamed: 0,item_id,en_label,en_description
0,1,Universe,totality of space and all contents
1,2,Earth,third planet from the Sun in the Solar System
2,3,life,matter capable of extracting energy from the e...
3,4,death,permanent cessation of vital functions
4,5,human,"common name of Homo sapiens, unique extant spe..."
...,...,...,...
51450311,77257472,2dFGRS TGN256Z026,
51450312,77257483,2dFGRS TGS171Z171,
51450313,77257484,2dFGRS TGS373Z078,
51450314,77257491,2dFGRS TGS374Z114,


In [23]:
get_mem_usage(qpq_df) # 1616 MB

1616

In [24]:
qpq_df.max(axis=0)

source_item_id      77257493
edge_property_id        7643
target_item_id      77219312
dtype: uint32

In [26]:
## keep cases were it's an instance of 
# 31 INSTANCE OF 
# 106 OCCUPATION
qpq_df = qpq_df.loc[
    (qpq_df["edge_property_id"] == 31) | (qpq_df["edge_property_id"] == 106)
].drop_duplicates().reset_index(drop=True)
qpq_df


Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,2,31,3504248
2,3,31,937228
3,4,31,2996394
4,5,31,55983715
...,...,...,...
30310507,77257472,31,318
30310508,77257483,31,318
30310509,77257484,31,318
30310510,77257491,31,318


In [31]:
nowedane = qpq_df[(qpq_df["edge_property_id"] == 106) & (qpq_df["target_item_id"].isin([36834, 639669, 486748]))]

In [36]:
filtered_data = nowedane.drop_duplicates(subset=["source_item_id"])["source_item_id"]

480              206
569              254
576              255
672              303
856              392
              ...   
30288228    77197510
30289026    77201716
30292230    77216816
30294268    77224469
30305577    77252336
Name: source_item_id, Length: 134257, dtype: uint32

In [34]:
items[items["item_id"]==77252336]

Unnamed: 0,item_id,en_label,en_description
51445426,77252336,Pietro Formichi,


In [42]:
ppl_filtered = items[items["item_id"].isin(filtered_data.values)].dropna().reset_index(drop=True)
ppl_filtered

Unnamed: 0,item_id,en_label,en_description
0,206,Stephen Harper,22nd Prime Minister of Canada
1,254,Wolfgang Amadeus Mozart,Austrian composer of the Classical period
2,255,Ludwig van Beethoven,German classical and romantic composer
3,303,Elvis Presley,American singer and actor
4,392,Bob Dylan,"American recording artist; singer-songwriter, ..."
...,...,...,...
90014,77197048,Gregg Maizel,"musician, member of the band Vigil"
90015,77197304,Jo Connor,"musician, member of the band Vigil"
90016,77197507,Andy R,"musician, member of the band Vigil"
90017,77197510,X Factor,"musician, member of the band Vigil"


In [45]:
to_pickle = set(ppl_filtered["item_id"].values)

In [47]:
with open('ppl_filtered.pkl', 'wb') as handle:
    pickle.dump(to_pickle, handle)