## Scraping Source 1: WikiData

In [2]:
# All current and subsequent imports here

import requests
import pandas as pd
import time
import math

In [3]:
ENDPOINT = "https://query.wikidata.org/sparql"
HEADERS = {
    "User-Agent": "EU-Startup-Analysis/1.0 (contact: your@email)"
}

# AT, DE, FR, IT, NL, ES, SE + UK, CH, NO, DK
countries = [
    "wd:Q40",  # Austria
    "wd:Q183", # Germany
    "wd:Q142", # France
    "wd:Q38",  # Italy
    "wd:Q55",  # Netherlands
    "wd:Q29",  # Spain
    "wd:Q34",  # Sweden
    "wd:Q145", # United Kingdom
    "wd:Q39",  # Switzerland
    "wd:Q20",  # Norway
    "wd:Q35",  # Denmark
]

page_size = 1000   # results per page
max_pages_per_country = 20    # Acceptable limit. ANymore results in http errors
sleep_between_requests = 1.0   # between pages
rows = []   

In [None]:
for country in countries:
    print(f"=== COUNTRY: {country} ===")
    for page in range(max_pages_per_country):
        offset = page * page_size

        # This is the SPARQL query to fetch the data for all the startups in the list of countries with the below mentioned filters.
        sparql = f"""
        SELECT ?item ?itemLabel ?countryLabel ?inception ?dissolved
               ?industryLabel ?hqLabel ?website
               ?ownedByLabel ?memberOfLabel ?competitionLabel
               ?revenue ?revenue_date
        WHERE {{
          ?item wdt:P31/wdt:P279* wd:Q4830453 .
          ?item wdt:P17 {country} .

          OPTIONAL {{ ?item wdt:P571 ?inception. }}
          OPTIONAL {{ ?item wdt:P576 ?dissolved. }}
          OPTIONAL {{ ?item wdt:P452 ?industry. }}
          OPTIONAL {{ ?item wdt:P159 ?hq. }}
          OPTIONAL {{ ?item wdt:P856 ?website. }}
          OPTIONAL {{ ?item wdt:P127 ?ownedBy. }}
          OPTIONAL {{ ?item wdt:P463 ?memberOf. }}
          OPTIONAL {{ ?item wdt:P2410 ?competition. }}

          OPTIONAL {{
            ?item p:P2139 ?rev_stmt .
            ?rev_stmt ps:P2139 ?revenue .
            OPTIONAL {{ ?rev_stmt pq:P585 ?revenue_date . }}
          }}

          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        LIMIT {page_size} OFFSET {offset}
        """

        success = False
        attempt = 0

        # Re-attempts 5 times if the first attempt fails due to an error.
        while not success and attempt < 6:
            attempt += 1
            try:
                r = requests.get(
                    ENDPOINT,
                    params={"query": sparql, "format": "json"},
                    headers=HEADERS,
                    timeout=120
                )
                r.raise_for_status()
                data = r.json()["results"]["bindings"]

                print(f"country {country} page {page} -> rows: {len(data)} (attempt {attempt})")

                def val(x):
                    return x["value"] if isinstance(x, dict) and "value" in x else None

                for row in data:
                    flat = {k: val(v) for k, v in row.items()}
                    flat["_country_q"] = country
                    rows.append(flat)

                success = True
            except requests.exceptions.HTTPError as e:
                status = getattr(e.response, "status_code", None)
                print("HTTPError", status, "on attempt", attempt, "- retrying...")
                time.sleep(2 ** attempt)
            except Exception as e:
                print("Other error:", e, "on attempt", attempt, "- retrying...")
                time.sleep(2 ** attempt)

        # stop paging this country if we either failed or we got less than a page
        if not success:
            print("Failed after retries; moving to NEXT country.")
            break

        if len(data) < page_size:
            print("Last page for this country (got fewer than page_size).")
            break

        time.sleep(sleep_between_requests)

    # 10 minutes cooldown after every country to ensure scraping works as intended
    print(f"Cooldown after {country} ...")
    time.sleep(600)


=== COUNTRY: wd:Q40 ===
country wd:Q40 page 0 -> rows: 1000 (attempt 1)
country wd:Q40 page 1 -> rows: 1000 (attempt 1)
country wd:Q40 page 2 -> rows: 1000 (attempt 1)
country wd:Q40 page 3 -> rows: 1000 (attempt 1)
country wd:Q40 page 4 -> rows: 1000 (attempt 1)
country wd:Q40 page 5 -> rows: 1000 (attempt 1)
country wd:Q40 page 6 -> rows: 1000 (attempt 1)
country wd:Q40 page 7 -> rows: 1000 (attempt 1)
country wd:Q40 page 8 -> rows: 1000 (attempt 1)
country wd:Q40 page 9 -> rows: 1000 (attempt 1)
country wd:Q40 page 10 -> rows: 1000 (attempt 1)
country wd:Q40 page 11 -> rows: 326 (attempt 1)
Last page for this country (got fewer than page_size).
Cooldown after wd:Q40 ...
=== COUNTRY: wd:Q183 ===
country wd:Q183 page 0 -> rows: 1000 (attempt 1)
country wd:Q183 page 1 -> rows: 1000 (attempt 1)
country wd:Q183 page 2 -> rows: 1000 (attempt 1)
country wd:Q183 page 3 -> rows: 1000 (attempt 1)
country wd:Q183 page 4 -> rows: 1000 (attempt 1)
country wd:Q183 page 5 -> rows: 1000 (attempt 1)

In [5]:
df_all = pd.DataFrame(rows)
print("Total raw rows fetched:", len(df_all))
df_all.head(10)

Total raw rows fetched: 174711


Unnamed: 0,item,inception,website,itemLabel,industryLabel,hqLabel,ownedByLabel,memberOfLabel,_country_q,dissolved,revenue,revenue_date
0,http://www.wikidata.org/entity/Q83822,1919-11-21T00:00:00Z,https://konzern.oebb.at/,Austrian Federal Railways,rail freight transport,ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
1,http://www.wikidata.org/entity/Q83822,2004-03-31T00:00:00Z,https://konzern.oebb.at/,Austrian Federal Railways,rail freight transport,ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
2,http://www.wikidata.org/entity/Q83822,1919-11-21T00:00:00Z,https://konzern.oebb.at/,Austrian Federal Railways,rail transport,ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
3,http://www.wikidata.org/entity/Q83822,2004-03-31T00:00:00Z,https://konzern.oebb.at/,Austrian Federal Railways,rail transport,ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
4,http://www.wikidata.org/entity/Q83822,1919-11-21T00:00:00Z,https://konzern.oebb.at/,Austrian Federal Railways,"passenger rail transport, interurban (NACE 49.1)",ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
5,http://www.wikidata.org/entity/Q83822,2004-03-31T00:00:00Z,https://konzern.oebb.at/,Austrian Federal Railways,"passenger rail transport, interurban (NACE 49.1)",ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
6,http://www.wikidata.org/entity/Q83822,1919-11-21T00:00:00Z,https://www.oebb.at/,Austrian Federal Railways,rail freight transport,ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
7,http://www.wikidata.org/entity/Q83822,2004-03-31T00:00:00Z,https://www.oebb.at/,Austrian Federal Railways,rail freight transport,ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
8,http://www.wikidata.org/entity/Q83822,1919-11-21T00:00:00Z,https://www.oebb.at/,Austrian Federal Railways,rail transport,ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,
9,http://www.wikidata.org/entity/Q83822,2004-03-31T00:00:00Z,https://www.oebb.at/,Austrian Federal Railways,rail transport,ÖBB Konzernzentrale Hauptbahnhof Wien,Austria,International Union of Railways,wd:Q40,,,


In [6]:
df_all.to_csv("wikidata_eu_companies_raw.csv", index=False)