### EU STARTUPS web scraping

In [2]:
import ssl
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

### Test URL request

In [None]:
def get_startup_names_with_user_agent(url):

    ssl._create_default_https_context = ssl._create_unverified_context
    
    # request
    req = Request(
        url=url, 
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    )
    
    # Fetch the HTML content
    try:
        with urlopen(req) as response:
            html_content = response.read()
            
    except Exception as e:
        print(f"Error fetching the page using urlopen: {e}")
        return []

    # Parse the HTML content 
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all startup names 
    startup_elements = soup.select('div.listing-title h3 a')
    
    # Extract the text (startup name) from each element
    # This will be used to construct the URLs for teh individual info page of start-ups
    names = [element.get_text(strip=True) for element in startup_elements]
    
    return names

In [None]:
url = "https://www.eu-startups.com/directory/"
startup_names = get_startup_names_with_user_agent(url)

if startup_names:
    print(f"Successfully found {len(startup_names)} startup names.")
    print(startup_names[:10])
else:
    print("Failed to find any startup names.")

Successfully found 11 startup names.
First 10 names:
['wolfo.io', 'Zeabur', 'Guard by OffSeq', 'Gaeilgeoir AI', 'Padelvo', 'LeadWise', 'Kreatli', 'SiWeGO â€“ Smart Eco Transport', 'Lyll', 'AI MADE IN SPAIN']


In [7]:
req = Request(
    url="https://www.eu-startups.com/directory/", 
    headers={'User-Agent': 'Mozilla/5.0'}
)
webpage = urlopen(req).read()

In [8]:
webpage

b'<!--trackwpstats--><!doctype html >\n<html lang="en-US">\n<head>\n<!-- Google Consent Mode: DEFAULT (must be before any other tags) -->\n\n\n    <meta charset="UTF-8" />\n    <title>EU-Startups Database</title>\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <link rel="pingback" href="https://www.eu-startups.com/xmlrpc.php" />\n    <link rel="shortcut icon" href="https://www.eu-startups.com/wp-content/themes/Newspaperimages/favicon.ico" type="image/x-icon">\n    <link rel="icon" href="https://www.eu-startups.com/wp-content/themes/Newspaperimages/favicon.ico" type="image/x-icon">\n    <style id="jetpack-boost-critical-css">@media all{ul{box-sizing:border-box}.aligncenter{clear:both}.screen-reader-text{clip:rect(1px,1px,1px,1px);word-wrap:normal!important;border:0;-webkit-clip-path:inset(50%);clip-path:inset(50%);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}}@media all{.screen-reader-text{border:0;clip:rect(1px,1px,1px,1p

### Loop through the pages to get all company names

In [None]:
import time
from urllib.error import URLError, HTTPError
from tqdm import tqdm 


BASE_URL = "https://www.eu-startups.com/directory"
START_PAGE = 2001
END_PAGE = 3059
SLEEP_TIME = 0.5 



In [None]:
def fetch_page_content(page_url):

    ssl._create_default_https_context = ssl._create_unverified_context
    
    # request
    req = Request(
        url=page_url, 
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    )
    
    try:
        with urlopen(req, timeout=15) as response:
            return response.read()
            
    except HTTPError as e:
        print(f"\n[ERROR] HTTP Error on {page_url}: {e.code}")
    except URLError as e:
        print(f"\n[ERROR] URL Error on {page_url}: {e.reason}")
    except Exception as e:
        print(f"\n[ERROR] General Error on {page_url}: {e}")
        
    return None

def extract_startup_names(html_content):

    soup = BeautifulSoup(html_content, 'html.parser')

    startup_elements = soup.select('div.listing-title h3 a')
    
    names = [element.get_text(strip=True) for element in startup_elements]
    
    return names

def scrape_all_pages(start, end):

    all_startup_names = []
    
    for page_num in tqdm(range(start, end + 1), desc="Scraping Pages"):
        page_url = f"{BASE_URL}/page/{page_num}"
        
        # ontent
        html_content = fetch_page_content(page_url)
        
        if html_content:
            # get names
            names = extract_startup_names(html_content)
            all_startup_names.extend(names)
        
        # sleep
        time.sleep(SLEEP_TIME)
        
    return all_startup_names



In [None]:
if __name__ == "__main__":
    print(f"Starting scrape from page {START_PAGE} to {END_PAGE}...")
    print(f"Base URL: {BASE_URL}")
    print(f"Delay per request: {SLEEP_TIME} seconds.")
    
    final_list = scrape_all_pages(START_PAGE, END_PAGE)
    
    # --- Final Output ---
    print("\n" + "="*50)
    print(f"Scraping complete. Total unique startup names found: {len(set(final_list))}")
    print(f"Total entries collected (including potential duplicates): {len(final_list)}")
    print("="*50)


Starting scrape from page 2001 to 3059...
Base URL: https://www.eu-startups.com/directory
Delay per request: 0.5 seconds.


Scraping Pages: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1059/1059 [50:12<00:00,  2.84s/it]


Scraping complete. Total unique startup names found: 11425
Total entries collected (including potential duplicates): 11649





In [None]:
# save to TXT
with open('startup_names_3.txt', 'w', encoding='utf-8') as f:
    for name in list(set(final_list)):
        f.write(f"{name}\n")


### Process company names

In [None]:
import os
import re
from typing import List

def normalize_company_names(names: List[str]) -> List[str]:  # process the startup names - later use for URLs
   
    normalized_list = []
    for name in names:
        # decapitalize
        name = name.lower()

        # remove special characters
        name = re.sub(r'[^a-z0-9\s]', '', name)

        # remove whitespaces
        name = name.strip()

        # replace whitespaces with a "-"
        name = re.sub(r'\s+', '-', name)

        # append
        if name:
            normalized_list.append(name)

    # eduplicate and sort
    final_unique_names = sorted(list(set(normalized_list)))
    return final_unique_names

def process_startup_files(file_names: List[str]):  # get the anmes back from the txt files
    
    all_names = []
    error_files = []

    for file_name in file_names:
        try:
            with open(file_name, 'r', encoding='utf-8') as f:
                names_from_file = [line.strip() for line in f if line.strip()]
                all_names.extend(names_from_file)
            print(f"Successfully read {len(names_from_file)} names from '{file_name}'.")
        except FileNotFoundError:
            error_files.append(file_name)
            print(f"ERROR: File not found: '{file_name}'. Skipping.")
        except Exception as e:
            error_files.append(file_name)
            print(f"An unexpected error occurred reading '{file_name}': {e}")

    if not all_names:
        print("\nNo names were read from any file. Please check file paths and content.")
        return

    # normalize and deduplicate
    final_normalized_names = normalize_company_names(all_names)

    print(f"Total names read (including duplicates): {len(all_names)}")
    print(f"Total unique normalized names: {len(final_normalized_names)}")

    return final_normalized_names



In [None]:
# get all company names from the txt files and process them and get one list at the end

if __name__ == "__main__":

    startup_files = ['startup_names_1.txt', 'startup_names_2.txt', 'startup_names_3.txt']

    processed_names = process_startup_files(startup_files)

Successfully read 10711 names from 'startup_names_1.txt'.
Successfully read 10697 names from 'startup_names_2.txt'.
Successfully read 11425 names from 'startup_names_3.txt'.
Total names read (including duplicates): 32833
Total unique normalized names: 32454


In [3]:
print(processed_names)

['01-founders', '011h', '0credit', '0mission', '0w1-audio', '0xcert', '1-account', '1-brave-friend', '1-week', '10-days-london', '10-dots-only', '100-words', '1000farmacie', '1000hotelsnet', '1000kelvin', '100asa', '100etiqueta', '1080-coaching', '10ai', '10bn', '10k-riders', '10lift', '10lines', '10mets', '10mg-pharma', '10x-banking', '10xbanking', '10xbeast', '11fs', '11onze', '123-shared-mobility', '123scoop', '123tintaes', '12tree', '14peaks', '14trees', '15talent', '15togo', '16th-ventures', '17tons', '1859-cloud', '18wheels', '1928-diagnostics', '1crew', '1energy', '1id', '1komma5', '1market', '1millionbot', '1nce', '1newhomescom', '1nfluencersmarketing', '1ot', '1rebel', '1roof', '1secondpainting', '1stopvat', '1transcribe', '1way-interview', '1xcom', '2-digits-growth', '20-midtown', '200crowd', '2021ai', '202up', '2030-builders', '2050-materials', '2050cards', '20robots', '20seconds', '20treeai', '2100ws', '21bitcoin', '21buttons', '21factorys', '21risk', '21shares', '21strateg

### Single company data request

In [None]:
import pandas as pd
from typing import Dict, Any

In [None]:


def scrape_eu_startups_profile(url: str) -> pd.DataFrame:   # actual scraping of predefined fields
 
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # data dict
    data: dict = {
        'company name': url.split('/')[-2].replace('-', ' ').title(),
    }
    try:
        # request
        req = Request(
            url=url, 
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        )
        
        # pen URL
        with urlopen(req) as response:
            html_content = response.read()
            soup = BeautifulSoup(html_content, 'html.parser')
     
 
        field_mapping = {
            # Map HTML field class o columns
            'wpbdp-field-category': 'category',
            'wpbdp-field-based_in': 'based in', # Assuming 'location' field is 'based in'
            'wpbdp-field-tags': 'tags',
            'wpbdp-field-total_funding': 'total funding',
            'wpbdp-field-founded': 'founded',
            'wpbdp-field-website': 'website',
            'wpbdp-field-company_status': 'company status',
        }

        for class_name, key_name in field_mapping.items():
            # Find the main div element for the field
            field_div = soup.find('div', class_=lambda c: c and class_name in c.split())
            if field_div:
                # Find the value div (which contains the data)
                value_div = field_div.find('div', class_='value')
                if value_div:
                    if key_name == 'website':
                        # Special handling for website to get the actual link (href)
                        link = value_div.find('a')
                        data[key_name] = link.get('href') if link and link.get('href') else value_div.get_text(strip=True)
                    else:
                        data[key_name] = value_div.get_text(strip=True)
                else:
                    data[key_name] = 'N/A'
            else:
                data[key_name] = 'N/A'

        # convert to DF
        df = pd.DataFrame([data])
        return df

    except Exception as e:
        print(f"An error occurred during scraping: {e}")
        # Return an empty DataFrame on error
        return pd.DataFrame()



In [None]:
def scrape_multiple_profiles(urls: List[str]) -> pd.DataFrame:  # loop through URLs and scrape data

    all_dfs = []
    
    print(f"Starting scrape of {len(urls)} URLs...")
    
    for i, url in enumerate(urls):
        print(f"Processing ({i+1}/{len(urls)}): {url}...")
        
        # Run the scraping function
        company_data_df = scrape_eu_startups_profile(url)
        
        if not company_data_df.empty:
            all_dfs.append(company_data_df)
            print(f"Success: {company_data_df['company name'].iloc[0]}")
        else:
            print("Failed to retrieve data for this URL.")

    # combine to one dataframe
    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
        return final_df
    else:
        print("No data was successfully scraped.")
        return pd.DataFrame()

In [None]:
BASE_URL = "https://www.eu-startups.com/directory/"

# construct all URLs from company names
url_list_full = [f"{BASE_URL}{name}/" for name in processed_names]

In total around 32,000 companies - and URLs

In [8]:
len(url_list_full)

32454

Loop through these URLs - only a few thousand at a time, otherwise it crashes - to get the information for individual companies to predefined columns. Save dataframes (each 1000 request is a new dataframe).

In [None]:

for i in range(30, 32):

    start = i*1000
    end = (i+1)*1000 -1

    url_list_chunk = url_list_full[start:end]

    if __name__ == "__main__":
        # URLs to scrape
        url_list = url_list_chunk

        # ain loop
        final_startup_df = scrape_multiple_profiles(url_list)

        # inal combined df
        if not final_startup_df.empty:
            print("\n" + "="*50)
            print("FINAL COMBINED STARTUP DATA")
            print("="*50)
            
            filename = f'startup_data_{i}.csv'
            final_startup_df.to_csv(filename, index=False)

        time.sleep(10)

Starting scrape of 999 URLs...
Processing (1/999): https://www.eu-startups.com/directory/vantis/...
Success: Vantis
Processing (2/999): https://www.eu-startups.com/directory/vantta/...
Success: Vantta
Processing (3/999): https://www.eu-startups.com/directory/vaonis/...
Success: Vaonis
Processing (4/999): https://www.eu-startups.com/directory/vara/...
Success: Vara
Processing (5/999): https://www.eu-startups.com/directory/varez/...
Success: Varez
Processing (6/999): https://www.eu-startups.com/directory/varia/...
Success: Varia
Processing (7/999): https://www.eu-startups.com/directory/variant/...
Success: Variant
Processing (8/999): https://www.eu-startups.com/directory/varig/...
Success: Varig
Processing (9/999): https://www.eu-startups.com/directory/variled/...
Success: Variled
Processing (10/999): https://www.eu-startups.com/directory/varjo/...
Success: Varjo
Processing (11/999): https://www.eu-startups.com/directory/varm/...
Success: Varm
Processing (12/999): https://www.eu-startups

In [7]:
df = pd.read_csv("startup_data_0.csv")
df

Unnamed: 0,company name,category,based in,tags,total funding,founded,website,company status
0,01 Founders,UK,London,"AI, Blockchain, Cybersecurity, Gaming",No funding announced yet,2021,https://www.01founders.co/,
1,011H,Spain,Barcelona,"Construction, Sustainability, Carbon Footprint",Between â‚¬10 million-â‚¬25 million,2020,http://www.011h.com/,
2,0Credit,Belgium,Brussels,"Finance, FinTech, Property Management, Real Es...",No funding announced yet,2020,www.0credit.be,
3,0W1 Audio,France,Aix les Milles,"Music Tech, IoT, Wireless Speaker, Smart Audio...",,2017,www.0W1audio.com,
4,0Xcert,Slovenia,Ljubljana,"Blockchain, Software,",No funding announced yet,2018,https://0xcert.org/,
...,...,...,...,...,...,...,...,...
927,Airsupply,UK,London,"Education, Human Resources, Marketplace",Between â‚¬100K-â‚¬500K,2016,https://www.airsupply.org.uk/,
928,Airt,Croatia,Zagreb,"Artificial Intelligence, Information Technolog...",No funding announced yet,2019,https://airt.ai/,
929,Airtame,Denmark,"Indre By, Copenhagen","Wireless, Streaming, Hardware, HDMI, Airtame, ...",,2013,www.airtame.com,
930,Airthium,France,Les Loges en Josas,"energy storage, heat pump, deep tech",Between â‚¬1 million-â‚¬ 2.5 million,2016,airthium.com,
