In [1]:
import requests
import numpy as np
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
from typing import Optional, List, Tuple, Dict, Any
import time # Import time for potential delays/retries
import calendar, datetime
from datetime import datetime, timezone

# --- Wikidata Interaction Setup ---
from wikibaseintegrator import WikibaseIntegrator, wbi_login
from wikibaseintegrator.datatypes import String
from wikibaseintegrator.wbi_config import config as wbi_config

# Configure the User-Agent and log in with OAuth2.
# SECURITY: the consumer token/secret are read from the environment instead of
# being written into the notebook. Export WIKIDATA_CONSUMER_TOKEN and
# WIKIDATA_CONSUMER_SECRET before starting the kernel. Any credential that was
# previously committed in this notebook should be revoked and re-issued.
import os

wbi_config['USER_AGENT'] = 'MyWikibaseBot/1.0 (https://www.wikidata.org/wiki/User:Wind2375like)' # Please update URL if applicable
try:
    consumer_token = os.environ.get("WIKIDATA_CONSUMER_TOKEN")
    consumer_secret = os.environ.get("WIKIDATA_CONSUMER_SECRET")
    if not consumer_token or not consumer_secret:
        # Raised inside the try so the anonymous fallback below handles it.
        raise RuntimeError(
            "WIKIDATA_CONSUMER_TOKEN / WIKIDATA_CONSUMER_SECRET are not set"
        )
    login = wbi_login.OAuth2(consumer_token=consumer_token, consumer_secret=consumer_secret)
    wbi = WikibaseIntegrator(login=login)
    print("WikibaseIntegrator initialized with OAuth2 login.")
except Exception as e:
    print(f"Warning: Failed to initialize WikibaseIntegrator with OAuth2 login: {e}")
    print("Falling back to anonymous WikibaseIntegrator.")
    # Fallback to anonymous access if OAuth fails or is not needed for reads
    wbi = WikibaseIntegrator()

WikibaseIntegrator initialized with OAuth2 login.


In [None]:
# --- Configuration ---
# Default upper bound on the number of hops when building a chain.
MAX_DEPTH = 5

# Wikidata properties a chain is allowed to follow, in ascending numeric order.
# P39 is "position held". Extend the set with further properties whose values
# may change over time if needed.
ALLOWED_PROPS = {
    "P6", "P19", "P20", "P26", "P27", "P35", "P36", "P37", "P39", "P40",
    "P50", "P69",
    "P106", "P108", "P112", "P136", "P140", "P159", "P169", "P170",
    "P175", "P176", "P178",
    "P286", "P364", "P407", "P413", "P449", "P488", "P495",
    "P641", "P740", "P800", "P937",
    "P1037", "P1308", "P1412",
}

# User-Agent string sent with direct MediaWiki API calls.
UA = "MyWikibaseBot/1.0 (https://www.wikidata.org/wiki/User:Wind2375like)"
# A link-count threshold (THRESHOLD_LINK_COUNT = 1) existed in an earlier
# version; build_chain does not use it, so it is left disabled.

In [12]:
import requests
import numpy as np
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
from datetime import datetime, timezone
from wikibaseintegrator import wbi_helpers, WikibaseIntegrator
from typing import List, Tuple, Set

# Labels fetched so far, keyed by (entity_id, lang). Callers in this notebook
# ask for the same label many times, so caching avoids repeated HTTP requests.
_LABEL_CACHE = {}

def get_wikidata_label(entity_id: str, lang: str = "en") -> str:
    """
    Fetches the label for a Wikidata entity (e.g. 'Q42' or 'P31') in the specified language.

    Successful lookups (including "no label", i.e. an empty string) are cached
    in memory for the lifetime of the kernel; failed requests are not cached.

    :param entity_id: The Wikidata ID (e.g. 'Q42' for Douglas Adams, 'P31' for instance of).
    :param lang:       The language code for the label (default 'en').
    :return:           The label string, or an empty string if not found.
    :raises requests.RequestException: on network failure, timeout or HTTP error status.
    """
    cache_key = (entity_id, lang)
    if cache_key in _LABEL_CACHE:
        return _LABEL_CACHE[cache_key]

    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "props": "labels",
        "languages": lang,
        "format": "json"
    }
    # An explicit timeout keeps a stalled connection from blocking the whole
    # sampling loop; the User-Agent identifies this client to the API.
    response = requests.get(
        url,
        params=params,
        headers={"User-Agent": UA},
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    # Navigate the JSON to extract the label; every level falls back to an
    # empty dict so a missing entity or language yields "".
    label = (
        data.get("entities", {})
        .get(entity_id, {})
        .get("labels", {})
        .get(lang, {})
        .get("value", "")
    )

    _LABEL_CACHE[cache_key] = label
    return label

def recent_items_since_rc(
    since_iso: str,
    limit: int = 50,
    namespaces: str = "0"              # 0 = items only
) -> List[str]:
    """
    Return up to `limit` distinct Q-ids whose *item* was edited after
    `since_iso` (UTC), using only the MediaWiki RecentChanges feed (no WDQS).

    RecentChanges lists one row per edit, so an item edited several times
    appears several times in the raw response; duplicates are removed here
    while keeping the order of first appearance.

    :param since_iso:  ISO 8601 timestamp used as the lower bound (`rcstart`).
    :param limit:      Maximum number of rows requested and of Q-ids returned.
    :param namespaces: Value passed through as `rcnamespace`.
    :return:           List of unique titles that start with "Q".
    """
    data = {
        "action": "query",
        "list":   "recentchanges",
        "rcnamespace": namespaces,
        "rcstart": since_iso,          # with rcdir=newer this is the lower bound
        "rcdir":   "newer",            # enumerate from rcstart forwards in time
        "rctype":  "edit|new",
        "rclimit": limit,
        "rcprop":  "title",
        "format":  "json"
    }
    rc_json = wbi_helpers.mediawiki_api_call_helper(
        data=data,
        allow_anonymous=True,
        user_agent=UA
    )
    titles = (rc["title"] for rc in rc_json["query"]["recentchanges"])
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_qids = dict.fromkeys(t for t in titles if t.startswith("Q"))
    return list(unique_qids)[:limit]

def recent_triples_since_rc(
    since_iso: str,
    allowed_props: Set[str],
    limit: int = 50
) -> List[Tuple[str, str, str]]:
    """
    Collect at most `limit` (subject, property, object) triples taken from
    items edited after `since_iso`.

    A statement qualifies when its property is in `allowed_props`, it is the
    only statement for that property on the item, its main value is an entity
    reference, and both subject and object have a non-empty label. Only the
    Wikidata API is used (no SPARQL / WDQS).
    """
    collected: List[Tuple[str, str, str]] = []
    # Ask for ten times as many items as triples wanted, since most items
    # will not contribute a qualifying statement.
    candidate_qids = recent_items_since_rc(since_iso, limit * 10)
    for subject in candidate_qids:
        entity_json = wbi.item.get(entity_id=subject).get_json()
        for pid, statements in entity_json.get("claims", {}).items():
            if pid in allowed_props and len(statements) == 1:
                mainsnak = statements[0].get("mainsnak", {})
                value = mainsnak.get("datavalue", {}).get("value", {})
                if not isinstance(value, dict) or "id" not in value:
                    continue
                target = value["id"]
                if get_wikidata_label(subject) != "" and get_wikidata_label(target) != "":
                    collected.append((subject, pid, target))
                    if len(collected) >= limit:
                        return collected
    return collected

def get_random_qid() -> str:
    """
    Uses the MediaWiki API to pick a random main-namespace page
    and returns its Q‑ID (always starts with 'Q'). Loops until
    a valid Q‑ID is found.

    :return: A title of the form 'Q<digits>'.
    :raises requests.RequestException: on network failure, timeout or HTTP error status.
    """
    URL = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'query',
        'list': 'random',
        'rnnamespace': 0,
        'rnlimit': 1,
        'format': 'json'
    }
    headers = {"User-Agent": UA}

    # The context manager closes the session's pooled connections on exit, so
    # repeated calls do not accumulate open sockets.
    with requests.Session() as session:
        while True:
            resp = session.get(URL, params=params, headers=headers, timeout=30)
            # Surface HTTP errors directly instead of failing later with an
            # opaque KeyError on an error payload.
            resp.raise_for_status()
            title = resp.json()['query']['random'][0]['title']
            # Only accept titles that look like valid Q‑IDs
            if title.startswith("Q") and title[1:].isdigit():
                return title

def get_link_counts(qid):
    """
    Look up the `wikibase:sitelinks` count of `qid` via the Wikidata SPARQL endpoint.

    Returns the count as an int. When the query yields no bindings, a message
    is printed and 0 is returned.
    """
    query = f"""SELECT ?item ?count WHERE {{
        VALUES ?item {{ wd:{qid} }} .
        ?item wikibase:sitelinks ?count .
    }}"""

    client = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent="getLinkCounts (https://github.com/Wind-2375-like)",
    )
    client.setQuery(query)
    client.setReturnFormat(JSON)

    bindings = client.query().convert()["results"]["bindings"]
    if not bindings:
        print(f"No results found for {qid}")
        return 0

    # The endpoint returns the number as a string literal.
    return int(bindings[0]["count"]["value"])

In [24]:
def _claim_active_at(claim: dict, when: datetime) -> bool:
    """
    Return True iff the claim is valid at the given instant, according to
    its P580 (start time) / P582 (end time) qualifiers.  If neither qualifier
    is present we treat the claim as always-valid.
    """
    qs = claim.get("qualifiers", {})
    fmt = "%Y-%m-%dT%H:%M:%SZ"

    # start time
    if "P580" in qs:
        start_raw = qs["P580"][0]["datavalue"]["value"]["time"]   # "+2021-01-20T00:00:00Z"
        try:
            start = datetime.strptime(start_raw[1:], fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            # Handle the case where the date is not in the expected format
            return False
        if when < start:
            return False

    # end time
    if "P582" in qs:
        end_raw = qs["P582"][0]["datavalue"]["value"]["time"]
        try:
            end = datetime.strptime(end_raw[1:], fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            # Handle the case where the date is not in the expected format
            return False
        if when >= end:
            return False     # ended *before* or *at* this instant

    return True

def _has(claims: dict, prop: str, obj: str, when: datetime) -> bool:
    for claim in claims.get(prop, []):
        dv = claim.get("mainsnak", {}).get("datavalue", {})
        val = dv.get("value", {})
        if isinstance(val, dict) and val.get("id") == obj:
            if _claim_active_at(claim, when):
                return True
    return False

def _to_dt(month_str: str, end: bool) -> datetime:
    """'YYYY-MM' → last (or first) second of that month, in UTC."""
    y, m = map(int, month_str.split('-'))
    d = calendar.monthrange(y, m)[1] if end else 1
    hms = "23:59:59" if end else "00:00:00"
    return datetime.strptime(f"{y:04d}-{m:02d}-{d:02d} {hms}", "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)

def is_updated_triple(triple, before_time, after_time):
    """
    Report whether the truth of `triple` differs between two points in time.

    The subject's current claims are fetched once, and the triple is evaluated
    against their start/end qualifiers at the end of `before_time` and at the
    end of `after_time`.

    :param triple:      (subject Q-id, property P-id, object Q-id).
    :param before_time: Earlier month as 'YYYY-MM'.
    :param after_time:  Later month as 'YYYY-MM'.
    :return:            True when the triple held at exactly one of the two instants.
    """
    subj, prop, obj = triple
    before_dt = _to_dt(before_time, end=True)
    after_dt  = _to_dt(after_time,  end=True)

    claims_now = wbi.item.get(entity_id=subj).get_json().get("claims", {})

    was_true = _has(claims_now, prop, obj, before_dt)
    # Evaluate the later instant too, rather than assuming the triple holds:
    # a triple that is absent from the item, or not yet started at
    # `after_time`, must not be reported as updated.
    is_true  = _has(claims_now, prop, obj, after_dt)

    return was_true != is_true

def build_chain(start_qid: str, max_depth: int = 5):
    """
    Follow outgoing item links from `start_qid` for at most `max_depth` hops,
    never revisiting an entity.

    At each hop the first property (in the item's claim order) is taken that
    is in ALLOWED_PROPS, has exactly one statement, points to an unvisited
    item, and for which subject, property and object all have a non-empty
    label.

    :param start_qid: Q-id of the entity the chain starts from.
    :param max_depth: Maximum number of triples in the chain.
    :return: A list of dicts, one per hop, each with
             "triple" -> (subject id, property id, object id) and
             "triple_label" -> the three corresponding labels.
             The list is shorter than `max_depth` when no further hop exists.
    """
    chain = []
    visited = {start_qid}
    current = start_qid
    # Label of `current`; fetched lazily, only once a candidate hop exists.
    current_label = None
    # Property labels seen so far, so each property is looked up once.
    prop_labels = {}

    for _ in range(max_depth):
        # 1) Fetch the item
        claims = wbi.item.get(entity_id=current).get_json().get('claims', {})

        # 2) Scan for the first outgoing Item link that doesn't revisit
        step = None
        for prop, claim_list in claims.items():
            # Only allowed properties with exactly one statement qualify
            if prop not in ALLOWED_PROPS or len(claim_list) != 1:
                continue

            val = (claim_list[0]
                   .get('mainsnak', {})
                   .get('datavalue', {})
                   .get('value', {}))

            # check that it's an item link
            if not (isinstance(val, dict) and 'id' in val):
                continue

            nxt = val['id']
            if nxt in visited:
                # would form a cycle
                continue

            if current_label is None:
                current_label = get_wikidata_label(current)
            if current_label == "":
                # An unlabeled subject disqualifies every candidate of this hop.
                break

            nxt_label = get_wikidata_label(nxt)
            if nxt_label == "":
                continue

            if prop not in prop_labels:
                prop_labels[prop] = get_wikidata_label(prop)
            if prop_labels[prop] == "":
                continue

            step = (prop, nxt, nxt_label)
            break

        if step is None:
            # no further step possible
            break

        # record the triple and advance the chain
        prop, nxt, nxt_label = step
        chain.append({
            "triple": (current, prop, nxt),
            "triple_label": (current_label, prop_labels[prop], nxt_label),
        })
        visited.add(nxt)
        current = nxt
        # The object of this hop is the subject of the next; reuse its label.
        current_label = nxt_label

    return chain

def sample_chain_exact(depth: int, max_tries: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Sample a chain with exactly length==depth by repeatedly starting from a
    random item, such that:
    1. No entity is revisited (i.e. no cycles)
    2. The properties are in the allowed set
    3. There is only one claim for each property
    4. The labels for the entities and properties are not empty

    Note: an earlier variant also required the first triple to have been
    updated between 2023-12 and 2025-04 (seeding from
    `recent_triples_since_rc`); that constraint is currently NOT enforced.

    :param depth:     Required number of triples in the chain.
    :param max_tries: Optional cap on the number of random starts; None (the
                      default) retries indefinitely.
    :return:          The chain as produced by `build_chain`: a list of dicts
                      with "triple" and "triple_label" entries.
    :raises RuntimeError: if `max_tries` is given and exhausted.
    """
    tries = 0
    while max_tries is None or tries < max_tries:
        tries += 1
        start = get_random_qid()
        chain = build_chain(start, max_depth=depth)
        if len(chain) == depth:
            return chain
    raise RuntimeError(f"No chain of length {depth} found after {tries} tries")

# print(sample_chain_exact(depth=5))

In [25]:
from tqdm import tqdm

N_CHAINS = 100      # number of chains to collect
MAX_FAILURES = 50   # stop after this many failed attempts instead of looping forever

# Keep sampling until N_CHAINS chains have actually been collected. A failed
# attempt (e.g. a network error) is retried rather than consuming one of the
# N_CHAINS slots, so the result is not silently short.
chains = []
failures = 0
attempt = 0
with tqdm(total=N_CHAINS) as progress:
    while len(chains) < N_CHAINS:
        attempt += 1
        try:
            chain = sample_chain_exact(depth=5)
            chains.append(chain)
            progress.update(1)
        except Exception as e:
            failures += 1
            print(f"Error on iteration {attempt}: {e}")
            if failures >= MAX_FAILURES:
                print(f"Stopping after {failures} failures with {len(chains)} chains collected.")
                break
            time.sleep(1)  # Sleep to avoid hitting API limits or causing too many errors

  2%|▏         | 2/100 [03:10<2:43:18, 99.99s/it]

Error on iteration 3: ('Connection aborted.', TimeoutError(60, 'Operation timed out'))


  3%|▎         | 3/100 [10:23<6:47:29, 252.06s/it]Connection error: ('Connection aborted.', OSError(49, "Can't assign requested address")). Sleeping for 60 seconds.
Traceback (most recent call last):
  File "/opt/miniconda3/envs/hallucination/lib/python3.9/site-packages/urllib3/connectionpool.py", line 789, in urlopen
    response = self._make_request(
  File "/opt/miniconda3/envs/hallucination/lib/python3.9/site-packages/urllib3/connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "/opt/miniconda3/envs/hallucination/lib/python3.9/site-packages/urllib3/connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "/opt/miniconda3/envs/hallucination/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/opt/miniconda3/envs/hallucination/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/opt/miniconda3/envs/hallucination/lib/python3.9

In [26]:
# Persist the collected chains to disk (binary pickle, overwriting any
# existing chains.pkl in the working directory).
import pickle

with open("chains.pkl", "wb") as chains_file:
    pickle.dump(chains, chains_file)

- [gave up: took too long] Start from a triple that changed between 2023-12 and 2025-04 (now) (20min)
- [x] Generate 100 chains (immediately)
- [x] Summarize the code (10min)
- [ ] Use the templates to generate probe questions and whole questions (30min)
- [ ] Test script for Llama 3.2 on questions (10min)
- [ ] Test script for Llama 3.2 on triples of failed questions (10min)
- [ ] Script for Llama 3.2 to inject new knowledge (10min)

In [None]:
# 1. The final format for each chain is a dictionary with the following keys:
#    - "triples": a list of dictionaries, each containing:
#       - "triple": a tuple of (subject, property, object) IDs
#       - "triple_label": a tuple of (subject label, property label, object label)
#    - "probe_questions": a list of dictionaries with the same length as triples, each containing:
#        - "question": the question-like probe, e.g., Which continent is [S] located in? (P30)
#        - "cloze": the cloze-style probe, e.g., [S] is located in the continent of ___ (P30)
#        - "answer": the answer to the probe, e.g., Asia (Q48)
#    - "multihop_questions": list of strings (3 paraphrased multi-hop questions asking the tail entity from the head entity)
#    - "multihop_answers": list of strings (aliases of the tail entity)

[[{'triple': ('Q102400597', 'P69', 'Q1190812'),
   'triple_label': ('Xi Ronald Chen',
    'educated at',
    'University of New Mexico')},
  {'triple': ('Q1190812', 'P159', 'Q34804'),
   'triple_label': ('University of New Mexico',
    'headquarters location',
    'Albuquerque')},
  {'triple': ('Q34804', 'P6', 'Q7807370'),
   'triple_label': ('Albuquerque', 'head of government', 'Timothy M. Keller')},
  {'triple': ('Q7807370', 'P27', 'Q30'),
   'triple_label': ('Timothy M. Keller',
    'country of citizenship',
    'United States')},
  {'triple': ('Q30', 'P35', 'Q22686'),
   'triple_label': ('United States', 'head of state', 'Donald Trump')}],
 [{'triple': ('Q17328085', 'P170', 'Q175257'),
   'triple_label': ("Een beer overvalt een houthakker, genaamd 'Strijd op leven en dood'",
    'creator',
    'Charles Verlat')},
  {'triple': ('Q175257', 'P19', 'Q12892'),
   'triple_label': ('Charles Verlat', 'place of birth', 'Antwerp')},
  {'triple': ('Q12892', 'P6', 'Q336599'),
   'triple_label'