# In [None]:  (Jupyter cell prompt left over from the notebook export)
import requests
from bs4 import BeautifulSoup
import csv
import time
import os

# Request headers passed to the requests.get calls in this script.
headers = {'User-Agent': 'Mozilla/5.0'}

# Default file name used when the scraped rows are written to CSV.
OUTPUT_FILE = "german_data_startups_all_available_with_linkedin.csv"

# Extract detailed company info from the company page
def extract_company_details(company_url, timeout=10):
    """Fetch one company page and extract its detail fields.

    Args:
        company_url: URL of the individual company page.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the whole crawl indefinitely.

    Returns:
        A dict with the keys "Based in", "Website", "Company Status" and
        "LinkedIn". Every value is a string; a field that is not found on
        the page is an empty string. If the page cannot be fetched (request
        error or non-200 status) all four values are empty strings.
    """
    # Single source of truth for the result shape; also the failure result.
    details = {
        "Based in": "",
        "Website": "",
        "Company Status": "",
        "LinkedIn": "",
    }

    # Keep the try body limited to the call that can actually raise.
    try:
        response = requests.get(company_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f" Error fetching company page: {e}")
        return details

    if response.status_code != 200:
        print(f" Failed to fetch company page: {response.status_code}")
        return details

    soup = BeautifulSoup(response.text, 'html.parser')

    def get_field(label):
        # Return the text of the 'value' div that follows the first
        # 'field-label' span whose text contains `label`, or "" if none.
        for span in soup.find_all('span', class_='field-label'):
            if label in span.text:
                val = span.find_next_sibling('div', class_='value')
                return val.get_text(strip=True) if val else ""
        return ""

    for label in ("Based in", "Website", "Company Status"):
        details[label] = get_field(label)

    # The LinkedIn link is located through its icon: an <img alt="linkedin">
    # wrapped in an <a>.
    linkedin_icon = soup.find('img', alt='linkedin')
    parent_link = linkedin_icon.find_parent('a') if linkedin_icon else None
    if parent_link is not None:
        # .get() instead of ['href']: an anchor without href must not raise
        # and cost the caller the whole record.
        href = parent_link.get('href') or ""
        if 'linkedin.com' in href:
            details["LinkedIn"] = href.strip()

    return details

# Scrape all company listings from all pages
def get_startups(max_pages=None, timeout=10):
    """Crawl the directory search results and collect one record per startup.

    Result pages are requested one after another, starting at page 1, with a
    one-second pause between pages. For every listing not seen before (by
    name), the company's own page is fetched via ``extract_company_details``.

    The crawl stops when any of the following happens:
      * a page request fails or returns a non-200 status,
      * a page contains no listings,
      * a page contains only companies that were already collected (guards
        against a site that keeps serving the same page for out-of-range
        page numbers, which would otherwise loop forever),
      * ``max_pages`` pages have been processed.

    Args:
        max_pages: Optional upper bound on the number of result pages to
            fetch. ``None`` (the default) means no explicit bound.
        timeout: Seconds to wait for each listing-page response; passed to
            ``requests.get``.

    Returns:
        A list of dicts with the keys 'Name', 'Link', 'Founded', 'Based in',
        'Website', 'Company Status' and 'LinkedIn'.
    """
    startups = []
    seen_names = set()
    page = 1

    while max_pages is None or page <= max_pages:
        url = (
            f"https://www.eu-startups.com/directory/page/{page}/"
            f"?dosrch=1&q=&wpbdp_view=search"
            f"&listingfields[2]=628"
            f"&listingfields[7]=Data"
        )

        # NOTE(review): the arrow in this message looks mis-encoded (probably
        # meant to be a right arrow); the runtime string is left unchanged.
        print(f" Page {page} â†’ {url}")

        # Only the network call lives inside the try block.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(f" Request failed: {e}. Stopping.")
            break

        if response.status_code != 200:
            print(f" Status code {response.status_code}. Stopping.")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        listings = soup.find_all('div', class_='wpbdp-listing')

        if not listings:
            print(" No listings found on this page. All pages done.")
            break

        added = 0
        duplicates = 0

        for listing in listings:
            # Best-effort per listing: one malformed entry must not abort
            # the rest of the page.
            try:
                h3 = listing.find('h3')
                if not h3 or not h3.a:
                    continue

                name = h3.a.get_text(strip=True)
                link = h3.a['href']
                if name in seen_names:
                    duplicates += 1
                    continue

                # Extract founding year
                founded = ""
                for span in listing.find_all('span', class_='field-label'):
                    if 'Founded' in span.text:
                        val = span.find_next_sibling('div', class_='value')
                        if val:
                            founded = val.text.strip()
                        break

                # Get extra details from the individual company page
                details = extract_company_details(link)

                startups.append({
                    'Name': name,
                    'Link': link,
                    'Founded': founded,
                    'Based in': details['Based in'],
                    'Website': details['Website'],
                    'Company Status': details['Company Status'],
                    'LinkedIn': details['LinkedIn']
                })

                seen_names.add(name)
                added += 1
                print(f" {name} | {link} | Founded: {founded} | Based in: {details['Based in']} | Website: {details['Website']} | Status: {details['Company Status']} | LinkedIn: {details['LinkedIn']}")
            except Exception as e:
                print(f" Error parsing listing: {e}")

        # A page made up solely of already-collected names means the site is
        # repeating itself; continuing would never terminate.
        if added == 0 and duplicates > 0:
            print(" Page contained only already-seen companies. Stopping.")
            break

        page += 1
        time.sleep(1)

    return startups

# Save all scraped data to a CSV
def save_to_csv(startups, filename=OUTPUT_FILE):
    """Write the collected startup records to a CSV file.

    Args:
        startups: Rows to write, each a dict keyed by the column names
            listed below.
        filename: Destination path; defaults to ``OUTPUT_FILE``.

    Any exception raised while writing is caught and reported on stdout;
    nothing is raised to the caller and nothing is returned.
    """
    columns = [
        'Name',
        'Link',
        'Founded',
        'Based in',
        'Website',
        'Company Status',
        'LinkedIn',
    ]

    try:
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='', encoding='utf-8') as out:
            csv_writer = csv.DictWriter(out, fieldnames=columns)
            csv_writer.writeheader()
            csv_writer.writerows(startups)

        count = len(startups)
        full_path = os.path.abspath(filename)
        print(f"\n Saved {count} startups to '{filename}'")
        print(f" File path: {full_path}")
    except Exception as err:
        print(f" Failed to write CSV: {err}")

# Run it
if __name__ == "__main__":
    # Scrape every result page first, then write all collected rows to the
    # default CSV file in a single pass.
    data = get_startups()
    save_to_csv(data)
