In [None]:
import json
from pathlib import Path
from collections import Counter
from datetime import datetime
from statistics import mean, median
import matplotlib.pyplot as plt

This notebook visualizes how many property IDs (PIDs) each entity (QID) has.

In [None]:
# Locate the data folder relative to this notebook's directory.
# `__file__` is not defined inside a Jupyter kernel (it raises NameError), so
# fall back to the current working directory, which is assumed to be the
# notebook's own folder -- TODO confirm for your launch setup.
try:
    notebook_dir = Path(__file__).resolve().parent
except NameError:
    notebook_dir = Path.cwd()
BASE = notebook_dir.parent.parent / 'WikiData.nosync'
facts_file = BASE / 'human_facts.json'

with open(facts_file, 'r', encoding='utf-8') as f:
    human_facts = json.load(f)

# count PIDs per QID
# (assumes human_facts maps each QID to a sized collection of its PIDs --
# verify against the file that produced human_facts.json)
pid_counts = [len(props) for props in human_facts.values()]

# frequency distribution
count_freq = Counter(pid_counts)
print('Distribution of PIDs per QID:')
for n, c in sorted(count_freq.items()):
    print(f'{n}: {c}')

# Width (in PIDs) of each histogram bin; must stay even for centred integer ticks.
BIN_WIDTH = 2

if pid_counts:
    # Histogram bins cover the integer counts {0,1}, {2,3}, {4,5}, ...
    max_count = max(pid_counts)
    # The final histogram bin is closed on the right, so the last edge must be
    # strictly greater than max_count; otherwise an even max_count would be
    # merged into the previous bin, making that bin three values wide.
    last_edge = (max_count // BIN_WIDTH + 1) * BIN_WIDTH
    bins = range(0, last_edge + 1, BIN_WIDTH)

    fig, ax = plt.subplots()
    ax.hist(pid_counts, bins=bins, edgecolor='black')
    ax.set_xlabel(f'Number of PIDs per QID (binned by {BIN_WIDTH})')
    ax.set_ylabel('Number of QIDs')
    ax.set_title('Distribution of PIDs linked to QIDs')
    # One tick at the centre of every bin; the last edge starts no bin, so it
    # gets no tick (a tick there would stretch the x-axis past the data).
    ax.set_xticks([b + BIN_WIDTH // 2 for b in bins[:-1]])
    plt.show()
else:
    # max() would raise on an empty list; report the situation explicitly.
    print('No QIDs found in', facts_file, '- nothing to plot.')

The next cell reads `death_dates_clean.json` and computes the mean and median death dates.

In [None]:
# Locate the data folder relative to this notebook's directory.
# `__file__` is not defined inside a Jupyter kernel (it raises NameError), so
# fall back to the current working directory, which is assumed to be the
# notebook's own folder -- TODO confirm for your launch setup.
try:
    notebook_dir = Path(__file__).resolve().parent
except NameError:
    notebook_dir = Path.cwd()
BASE = notebook_dir.parent.parent / 'WikiData.nosync'
death_file = BASE / 'death_dates_clean.json'

with open(death_file, 'r', encoding='utf-8') as f:
    death_dates = json.load(f)

# convert ISO date strings to ordinal numbers for statistics
# (assumes every value is a string datetime.fromisoformat accepts -- the
# "clean" file is presumed to guarantee this; verify against its producer)
ordinals = [
    datetime.fromisoformat(iso_string).toordinal()
    for iso_string in death_dates.values()
]

if ordinals:
    # int() truncates a fractional mean/median down to a whole day before it
    # is converted back into a calendar date.
    mean_date = datetime.fromordinal(int(mean(ordinals)))
    median_date = datetime.fromordinal(int(median(ordinals)))
    print('Mean death date:', mean_date.date())
    print('Median death date:', median_date.date())
else:
    # mean()/median() raise StatisticsError on empty data; say why instead.
    print('No death dates found in', death_file, '- nothing to summarise.')