# Data Visualization

Import necessary libraries and define paths to the data files.

In [14]:
from pathlib import Path
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import json
import numpy as np

BASE = Path().parent.parent.parent / "WikiData.nosync"

## Distribution of PIDs per QID

This block loads the SQLite database, counts how many property IDs (PIDs) are associated with each QID and plots the distribution. PIDs counts are grouped in pairs (1–2, 3–4, etc.).

In [15]:
DB_PATH = BASE / "wikidata_labeled_wo.db"
conn = sqlite3.connect(DB_PATH)
query = "SELECT qid, COUNT(pid) as pid_count FROM properties_labeled GROUP BY qid"
df = pd.read_sql_query(query, conn)
conn.close()

df['bucket'] = ((df['pid_count'] - 1) // 2) * 2 + 1
bucket_counts = df['bucket'].value_counts().sort_index()

plt.figure(figsize=(8,4))
plt.bar([f"{b}-{b+1}" for b in bucket_counts.index], bucket_counts.values)
plt.xlabel('PID count range')
plt.ylabel('Number of QIDs')
plt.title('Distribution of PIDs per QID')
plt.tight_layout()
plt.show()

OperationalError: unable to open database file

## Statistics of Death Dates

This block reads `death_dates_clean.json` and calculates the median and mean of the death dates.

In [12]:
DEATH_PATH = BASE / "death_dates_clean.json"
with open(DEATH_PATH, 'r', encoding='utf-8') as f:
    death_data = json.load(f)

dates = pd.to_datetime(list(death_data.values()), errors='coerce').dropna()

median_date = dates.median()
mean_date = dates.mean()

print(f"Median death date: {median_date.date()}")
print(f"Mean death date: {mean_date.date()}")

FileNotFoundError: [Errno 2] No such file or directory: 'WikiData.nosync/death_dates_clean.json'