# Suche nach Prominenten

In [3]:
import pandas as pd
from pathlib import Path
import requests

In [4]:
# Relative path of the folder that holds the input data
input_data_folder = "../data/"
# Cleaned input table; it is read below as a tab-separated file
input_data_filename = "AAB_cleaned.csv"
# File name of the table that stores the Wikidata lookup results
output_data_filename = "AAB_wikidata.csv"

In [5]:
# Load the cleaned, tab-separated table from the local data folder
data_file = Path(input_data_folder).joinpath(input_data_filename)
df_cars = pd.read_csv(data_file, sep="\t")

# Alternative for Google Colab: read AAB_cleaned.csv directly from GitHub
# df_cars = pd.read_csv( "https://raw.githubusercontent.com/SebastianZug/RoboLabVortraege/refs/heads/main/30_PythonAutomobile/project/data/AAB_cleaned.csv", sep= "\t")  

## Anfrage an den SPARQL-Endpunkt von Wikidata

Beispieldaten (Albert Einstein): Wikidata-Eintrag und deutschsprachiger Wikipedia-Artikel

http://www.wikidata.org/entity/Q937

https://de.wikipedia.org/wiki/Albert_Einstein

In [6]:
def _sparql_string_literal(text):
    """Return ``text`` as a double-quoted SPARQL string literal with special characters escaped."""
    escaped = (
        text.replace("\\", "\\\\")  # backslash first, so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return f'"{escaped}"'


def _clean_search_term(value):
    """Return ``value`` as a stripped string, or None if it is missing (None, NaN or blank)."""
    if value is None:
        return None
    # pandas represents empty CSV cells as float NaN; NaN is the only float unequal to itself
    if isinstance(value, float) and value != value:
        return None
    text = str(value).strip()
    return text or None


def search_person_wikidata(first_name, last_name, residence=None, birth_year_min=1850, birth_year_max=1950, timeout=60):
    """Search Wikidata for humans with the given first and family name.

    Args:
        first_name: Given name; matched exactly against the English label of the
            person's given-name item (P735).
        last_name: Family name; matched exactly against the English label of the
            person's family-name item (P734).
        residence: Optional place name. When given, the person must have a residence
            (P551) with a label that contains this text (case-insensitive).
            None, NaN and blank values disable the filter.
        birth_year_min: First birth year (inclusive) of the accepted range.
        birth_year_max: Last birth year of the accepted range.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A tuple ``(samples, success)``. ``samples`` is a list of dicts with the keys
        person, personLabel, birthDate, deathDate, residenceLabel, occupationLabel and
        article (None where the result has no value); at most 10 result rows are
        requested. ``success`` is 1 if the lookup completed and 0 if the request failed.
        A missing first or last name yields ``([], 1)`` without sending a request.
    """
    endpoint_url = "https://query.wikidata.org/sparql"

    given = _clean_search_term(first_name)
    family = _clean_search_term(last_name)
    place = _clean_search_term(residence)

    # Without both names there is nothing to look up; "nan" must not be sent as a name.
    if given is None or family is None:
        return ([], 1)

    # Escape the user-supplied values so quotes or backslashes cannot break the query.
    given_literal = _sparql_string_literal(given)
    family_literal = _sparql_string_literal(family)

    # Build the base query
    query = f"""
    SELECT ?person ?personLabel ?birthDate ?deathDate ?residenceLabel ?occupationLabel ?article WHERE {{
      ?person wdt:P31 wd:Q5;  # Instance of human
              wdt:P735 ?givenName;  # Given name
              wdt:P734 ?familyName.  # Family name
      ?givenName rdfs:label {given_literal}@en.
      ?familyName rdfs:label {family_literal}@en.
      
      OPTIONAL {{ ?person wdt:P569 ?birthDate. }}
      OPTIONAL {{ ?person wdt:P570 ?deathDate. }}
      OPTIONAL {{ ?person wdt:P551 ?residence. }}  # Residence
      OPTIONAL {{ ?person wdt:P106 ?occupation. }}  # Occupation
      OPTIONAL {{ 
        ?article schema:about ?person;
                 schema:isPartOf <https://de.wikipedia.org/>.
      }}
    """

    # Add the residence filter if a residence was given.
    # NOTE(review): this binds ?residenceLabel to every rdfs:label of the residence,
    # so matching labels in several languages may be returned -- confirm this is intended.
    if place is not None:
        place_literal = _sparql_string_literal(place)
        query += f"""
      ?person wdt:P551 ?residence.
      ?residence rdfs:label ?residenceLabel.
      FILTER(CONTAINS(LCASE(?residenceLabel), LCASE({place_literal})))
    """

    # Add the label service and the birth-year filter, then close the query
    query += f"""
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de,en". }}
      FILTER ((?birthDate >= "{birth_year_min}-01-01"^^xsd:dateTime) && 
              (?birthDate <= "{birth_year_max}-12-31"^^xsd:dateTime))
    }}
    LIMIT 10
    """

    headers = {
        "Accept": "application/sparql-results+json",
    }

    try:
        # The timeout keeps a single stalled request from blocking a long batch run.
        response = requests.get(endpoint_url, params={"query": query}, headers=headers, timeout=timeout)
        response.raise_for_status()  # raises on HTTP error status codes

        data = response.json()
        results = data.get("results", {}).get("bindings", [])
        samples = []

        for result in results:
            sample = {
                "person": result.get("person", {}).get("value"),
                "personLabel": result.get("personLabel", {}).get("value"),
                "birthDate": result.get("birthDate", {}).get("value"),
                "deathDate": result.get("deathDate", {}).get("value"),
                "residenceLabel": result.get("residenceLabel", {}).get("value"),
                "occupationLabel": result.get("occupationLabel", {}).get("value"),
                "article": result.get("article", {}).get("value"),
            }
            samples.append(sample)

        return (samples, 1)

    # ValueError covers a response body that is not valid JSON.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Fehler bei der Anfrage: {str(e)}")
        return ([], 0)

def receive_wikidata_as_pd(vorname="Albert", nachname="Einstein", wohnort="Schaffhausen", geburtsjahr_min=1850, geburtsjahr_max=1900):
    """Run the Wikidata person search and condense the hits to one row per entity.

    The arguments are passed on unchanged to ``search_person_wikidata``.

    Returns:
        A tuple ``(frame, success)``. ``frame`` is indexed by the ``person`` URI; the
        columns personLabel, birthDate, deathDate and article hold the first value of
        each group, residenceLabel and occupationLabel hold the array of distinct
        values. If the search returned no rows, ``frame`` is an empty DataFrame.
        ``success`` is the flag returned by the search.
    """
    hits, status = search_person_wikidata(vorname, nachname, wohnort, geburtsjahr_min, geburtsjahr_max)

    # Nothing to aggregate: hand back an empty frame together with the status flag.
    if len(hits) == 0:
        return (pd.DataFrame(), status)

    # Start with "first" for every column, then switch the multi-valued ones to
    # "unique"; updating existing keys keeps the column order intact.
    how_to_combine = dict.fromkeys(
        ["personLabel", "birthDate", "deathDate", "residenceLabel", "occupationLabel", "article"],
        "first",
    )
    how_to_combine["residenceLabel"] = "unique"
    how_to_combine["occupationLabel"] = "unique"

    per_person = pd.DataFrame(hits).groupby("person").agg(how_to_combine)
    return (per_person, status)


In [7]:
# Example call: the function returns (DataFrame, success flag); display the DataFrame
df = receive_wikidata_as_pd(
    vorname="Albert",
    nachname="Einstein",
    wohnort="Schaffhausen",
    geburtsjahr_min=1800,
    geburtsjahr_max=1880,
)
df[0]

Unnamed: 0_level_0,personLabel,birthDate,deathDate,residenceLabel,occupationLabel,article
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
http://www.wikidata.org/entity/Q937,Albert Einstein,1879-03-14T00:00:00Z,1955-04-18T00:00:00Z,[Schaffhausen],[Physiker],https://de.wikipedia.org/wiki/Albert_Einstein


## ... Und jetzt für unseren Datensatz

In [8]:
# Reuse an existing "AAB_wikidata.csv" as the progress file; otherwise create it from the ids in df_cars
output_data_filename = "AAB_wikidata.csv"
output_data_folder = "../data/"
output_file = Path(output_data_folder).joinpath(output_data_filename)

if output_file.exists():
    df_cars_wikidata = pd.read_csv(output_file, sep="\t")
else:
    # Fresh start: one row per id, every row still marked as unchecked
    df_cars_wikidata = df_cars[["id"]].copy()
    df_cars_wikidata["wikidata"] = "not checked"
    df_cars_wikidata.to_csv(output_file, sep="\t", index=False)

In [15]:
import time
import tqdm
import numpy as np

df_wikidata = pd.DataFrame()  # not read by the visible cells; kept in case another cell uses it

# Result columns, filled from the first Wikidata hit of a record. Creating them up
# front as object columns keeps string assignment valid even if the progress file
# was reloaded with these columns empty (which pandas would read as float NaN).
wiki_columns = ["wiki_person", "wiki_geburtsdatum", "wiki_todesdatum", "wiki_wohnort", "wiki_beruf", "wiki_url"]
for column in wiki_columns:
    if column not in df_cars_wikidata.columns:
        df_cars_wikidata[column] = None
    df_cars_wikidata[column] = df_cars_wikidata[column].astype(object)

# A progress file from an earlier run may lack ids that are now in df_cars;
# add them as unchecked so the status lookup below cannot fail.
missing_ids = df_cars.loc[~df_cars["id"].isin(df_cars_wikidata["id"]), "id"].drop_duplicates()
if not missing_ids.empty:
    df_cars_wikidata = pd.concat(
        [df_cars_wikidata, pd.DataFrame({"id": missing_ids.to_list(), "wikidata": "not checked"})],
        ignore_index=True,
    )

# Collect finished ids once instead of scanning the whole frame for every row.
checked_ids = set(df_cars_wikidata.loc[df_cars_wikidata["wikidata"] == "checked", "id"])

for index, row in tqdm.tqdm(df_cars.iterrows(), total=len(df_cars)):

    first_name = row["Vorname"]
    last_name = row["Familienname"]
    location = row["Wohnort"]
    car_id = row["id"]  # named car_id so the builtin id() is not shadowed

    # Skip records that were already looked up successfully (resume support).
    if car_id in checked_ids:
        continue

    result_df, success = receive_wikidata_as_pd(first_name, last_name, location, 1850, 1900)
    found = success == 1 and not result_df.empty

    # Build the row selector once and reuse it for every assignment below.
    row_mask = df_cars_wikidata["id"] == car_id

    if success == 1:

        if found:

            print(first_name, last_name, location, car_id, result_df["article"].values[0])

            # Only the first matching person is stored.
            df_cars_wikidata.loc[row_mask, "wiki_person"] = result_df["personLabel"].values[0]
            df_cars_wikidata.loc[row_mask, "wiki_geburtsdatum"] = result_df["birthDate"].values[0]
            df_cars_wikidata.loc[row_mask, "wiki_todesdatum"] = result_df["deathDate"].values[0]

            # Multi-valued fields are stored as the string form of their value array.
            locations = result_df["residenceLabel"].iloc[0]
            locations_string = np.array2string(locations, separator=', ')
            df_cars_wikidata.loc[row_mask, "wiki_wohnort"] = locations_string

            occupation = result_df["occupationLabel"].iloc[0]
            occupation_string = np.array2string(occupation, separator=', ')

            df_cars_wikidata.loc[row_mask, "wiki_beruf"] = occupation_string
            df_cars_wikidata.loc[row_mask, "wiki_url"] = result_df["article"].values[0]

        df_cars_wikidata.loc[row_mask, "wikidata"] = "checked"
        checked_ids.add(car_id)
    else:
        # Failed requests stay retryable: only "checked" rows are skipped on a rerun.
        df_cars_wikidata.loc[row_mask, "wikidata"] = "error"

    # Persist after every lookup so an interrupted run can be resumed.
    df_cars_wikidata.to_csv(output_file, sep="\t", index=False)

    if found:
        print(f"Found {len(result_df)} entries for {first_name} {last_name} in Wikidata")

    time.sleep(1)  # wait one second between requests to avoid overloading the Wikidata API

 62%|██████▏   | 28176/45455 [00:05<00:03, 5483.75it/s]

Fehler bei der Anfrage: 400 Client Error: Bad Request for url: https://query.wikidata.org/sparql?query=%0A++++SELECT+%3Fperson+%3FpersonLabel+%3FbirthDate+%3FdeathDate+%3FresidenceLabel+%3FoccupationLabel+%3Farticle+WHERE+%7B%0A++++++%3Fperson+wdt%3AP31+wd%3AQ5%3B++%23+Instance+of+human%0A++++++++++++++wdt%3AP735+%3FgivenName%3B++%23+Given+name%0A++++++++++++++wdt%3AP734+%3FfamilyName.++%23+Family+name%0A++++++%3FgivenName+rdfs%3Alabel+%22Karl%22%40en.%0A++++++%3FfamilyName+rdfs%3Alabel+%22Wider%2C+Dr.+med.%22%40en.%0A++++++%0A++++++OPTIONAL+%7B+%3Fperson+wdt%3AP569+%3FbirthDate.+%7D%0A++++++OPTIONAL+%7B+%3Fperson+wdt%3AP570+%3FdeathDate.+%7D%0A++++++OPTIONAL+%7B+%3Fperson+wdt%3AP551+%3Fresidence.+%7D++%23+Residence%0A++++++OPTIONAL+%7B+%3Fperson+wdt%3AP106+%3Foccupation.+%7D++%23+Occupation%0A++++++OPTIONAL+%7B+%0A++++++++%3Farticle+schema%3Aabout+%3Fperson%3B%0A+++++++++++++++++schema%3AisPartOf+%3Chttps%3A%2F%2Fde.wikipedia.org%2F%3E.%0A++++++%7D%0A++++%0A++++++%3Fperson+wdt%3AP551+

100%|██████████| 45455/45455 [00:09<00:00, 4775.14it/s]


## Auswertung

In [9]:
# Read the data back from the tab-separated output file
df_cars_wikidata = pd.read_csv(output_file, sep="\t")

In [17]:
# Count how many records ended up in each lookup status
df_cars_wikidata["wikidata"].value_counts()

wikidata
checked    45454
error          1
Name: count, dtype: int64

In [16]:
from IPython.display import HTML

# Show the first five records that have a Wikidata match, with URLs rendered as links
matched_preview = df_cars_wikidata.dropna(subset=["wiki_person"]).head(5)
HTML(matched_preview.to_html(render_links=True, escape=False))

Unnamed: 0,id,wikidata,wiki_person,wiki_geburtsdatum,wiki_todesdatum,wiki_wohnort,wiki_beruf,wiki_url
3301,17121190,checked,Benno Arnold,1876-11-21T00:00:00Z,1943-03-03T00:00:00Z,"['Augsburg', 'Augsburgo']","['Augsburg', 'Augsburgo']",https://de.wikipedia.org/wiki/Benno_Arnold_(Industrieller)
3760,17121354,checked,Emil Klein,1865-01-01T00:00:00Z,1943-01-01T00:00:00Z,['Stuttgart'],['Stuttgart'],https://de.wikipedia.org/wiki/Emil_Klein_(Maler)
5358,17097283,checked,Wilfrid Israel,1899-07-11T00:00:00Z,1943-06-01T00:00:00Z,['Berlin-Mitte'],['Berlin-Mitte'],https://de.wikipedia.org/wiki/Wilfrid_Israel
5558,17097036,checked,Friedrich Gutmann,1886-11-15T00:00:00Z,1944-04-19T00:00:00Z,"['Berlin', 'I-Berlin', 'IBerlini']","['Berlin', 'I-Berlin', 'IBerlini']",https://de.wikipedia.org/wiki/Friedrich_Gutmann
5577,17097125,checked,Paul Müller,1881-02-26T00:00:00Z,1940-01-01T00:00:00Z,['Berlin-Kreuzberg'],[None],
