In [1]:
import os
import pickle
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Optional: silence urllib3's InsecureRequestWarning. The HTTP requests in this
# notebook are made with verify=False (TLS certificate verification disabled).
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


### **Extracción de empresas**

In [2]:
# Scrape the paginated investment listing: one row per element with class "items".
# NOTE(review): the year list and the 7-page limit are hard-coded; pages beyond
# the last populated one presumably contain no "items" elements - confirm.
LISTING_URL = "https://startupshub.catalonia.com/investments-in-catalan-startups"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever


def _text_or_none(node):
    """Return the stripped text of a BeautifulSoup node, or None when the node is missing."""
    return node.text.strip() if node is not None else None


data = []
for year in [2025, 2024, 2023, 2022]:
    for pa in range(1, 8):
        page_url = f"{LISTING_URL}?pageNumber={pa}&year={year}"
        # NOTE(review): the body-less POST is kept from the original code;
        # confirm the endpoint really requires POST rather than GET.
        resp = requests.post(page_url, verify=False, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(resp.text, "html.parser")

        for item in soup.find_all("div", class_="items"):
            # The relative detail-page path is stored in a custom `url` attribute of the <a> tag.
            link = item.find("a", attrs={"url": True})
            url = link["url"] if link is not None else None

            startup_name = _text_or_none(item.find("h4"))

            # Company name and category are the first two <p> of the "item-text" box.
            # Every lookup is guarded so one malformed item yields None values
            # instead of aborting the whole crawl.
            text_box = item.find("div", class_="item-text")
            paragraphs = text_box.find_all("p") if text_box is not None else []
            company_name = _text_or_none(paragraphs[0]) if len(paragraphs) > 0 else None
            category = _text_or_none(paragraphs[1]) if len(paragraphs) > 1 else None

            # Amount and investors both live in the first "col-md-4" box.
            money_box = item.find("div", class_="col-md-4")
            investment_amount = _text_or_none(money_box.find("p")) if money_box is not None else None

            # The investor names are the text node right after the <strong> label.
            investors_label = money_box.find("strong") if money_box is not None else None
            investors_node = (
                investors_label.find_next_sibling(string=True)
                if investors_label is not None
                else None
            )
            investors = investors_node.strip() if investors_node is not None else None

            date = _text_or_none(item.find("p", class_="date"))

            data.append([startup_name, company_name, category, investment_amount, investors, date, url])

# One row per listed investment.
df = pd.DataFrame(data, columns=["Startup", "Company", "Category", "Investment", "Investors", "Date", "URL"])

In [None]:
# Preview the first rows using the notebook's rich table display instead of
# printing the whole frame as plain text.
df.head()

### **Extracción de info de cada empresa**

In [4]:
# Accumulator for the per-company records appended by the detail-scraping loop.
lista = list()

In [None]:

# Visit the detail page of every listed company and append one record to `lista`.
# Static fields come from the raw HTML (requests + BeautifulSoup); fields that the
# original code read through Selenium are still read from a Chrome session.
BASE_URL = "https://startupshub.catalonia.com/"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever
WAIT_SECONDS = 10     # upper bound for each Selenium wait on a dynamic element


def safe_text(selector, label=''):
    """Return the text of the first <span> whose class is `selector`, with `label` removed.

    Reads the module-level `soup` of the page currently being processed.
    Returns None when no such span exists.
    """
    element = soup.find('span', class_=selector)
    return element.text.replace(label, '').strip() if element else None


def _wait_for(driver, condition, label):
    """Wait up to WAIT_SECONDS for `condition`; return its result, or None on timeout."""
    try:
        return WebDriverWait(driver, WAIT_SECONDS).until(condition)
    except TimeoutException:
        print(f"No s'ha trobat {label}")
        return None


for i in range(0, df.shape[0]):
    path = df.loc[i, "URL"]
    # The listing scrape stores None when an item has no link; skip those rows
    # instead of failing on string concatenation outside the try block.
    if not isinstance(path, str) or not path:
        print(f"Processant [{i}]: no hi ha URL. Es salta.")
        continue

    url = BASE_URL + path
    print(f"Processant [{i}]: {url}")

    # Bound before the try so the finally clause never touches an undefined
    # name or a driver left over from a previous iteration.
    driver = None
    try:
        # Static HTML first: it is cheap, and companies without an address are
        # skipped before a browser is launched for them.
        resp = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(resp.text, "html.parser")

        # The address is the text node right after <strong class="company-name">.
        address_label = soup.find('strong', class_='company-name')
        address_node = (
            address_label.find_next_sibling(string=True)
            if address_label is not None
            else None
        )
        if not address_node:
            print("No hi ha adreça. Es salta.")
            continue
        address = address_node.strip()

        title = soup.find('h1', class_='big_title')
        name = title.text.strip() if title is not None else None
        meta = soup.find('meta', {'name': 'description'})
        description = meta.get('content') if meta is not None else None

        # Optional fields: each is a <span> whose text starts with a fixed label.
        industries = safe_text('industries information-item', 'Industries:')
        technologies = safe_text('technologies information-item', 'Technologies:')
        others_fields = safe_text('otherFields information-item', 'Other fields:')
        financial_founded = safe_text('founded information-item', 'Founded:')
        business_model = safe_text('model information-item', 'Business model:')
        targets = safe_text('target information-item', 'Target:')
        spinoff = safe_text('spinoffs information-item', 'Spinoff participants:')

        # Funding rounds: the <h3> text is kept as the amount, and whatever
        # follows its last "(" (closing parenthesis removed) as the date.
        funding_info = []
        for funding in soup.find_all('div', class_='item punts-servei-content row-fluid'):
            heading = funding.find('h3')
            if heading is None:
                continue  # malformed round: skip it rather than drop the company
            amount = heading.text.strip()
            round_date = amount.split('(')[-1].replace(')', '')
            funding_info.append({'amount': amount, 'date': round_date})

        # Fields read from the browser session.
        driver = webdriver.Chrome()
        driver.get(url)
        time.sleep(2)  # short pause to let the page load

        stage_element = _wait_for(
            driver,
            EC.presence_of_element_located((By.ID, "fundingStageTxt")),
            "fundingStageTxt",
        )
        financial_funding_stage = stage_element.text if stage_element is not None else None

        employees_element = _wait_for(
            driver,
            EC.presence_of_element_located((By.ID, "employeesTxt")),
            "employeesTxt",
        )
        financial_employees = employees_element.text if employees_element is not None else None

        containers = _wait_for(
            driver,
            EC.presence_of_all_elements_located((By.CLASS_NAME, "generic-container")),
            "generic-container",
        )
        # Only the first matching container is used.
        founding = containers[0].text if containers else None

        # Every value is wrapped in a one-element list; later cells unwrap them with [0].
        record = {
            'Name': [name],
            'Description': [description],
            'Address': [address],
            'Industries': [industries],
            'Technologies': [technologies],
            'Other fields': [others_fields],
            'Funding stage': [financial_funding_stage],
            'Founded': [financial_founded],
            'Employees': [financial_employees],
            'Business model': [business_model],
            'Target': [targets],
            'Spinoff participants': [spinoff],
            'Funding': [funding_info],
            'Funding2': [founding]
        }

        lista.append(record)

    except Exception as e:
        # Best effort: report the failure and carry on with the next company.
        print(f"Error inesperat a [{i}]: {e}")

    finally:
        # Also runs on `continue`; quit only a driver that was actually started.
        if driver is not None:
            driver.quit()


In [None]:
# Persist the scraped records so the slow scraping loop does not have to be re-run.
# The directory is created first because open() does not create missing
# directories, and failing here would come only after the whole scrape has finished.
os.makedirs('data', exist_ok=True)
with open('data/lista.pkl', 'wb') as f:
    pickle.dump(lista, f)

In [4]:
# Reload the scraped records from disk and turn them into a DataFrame.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('data/lista.pkl', 'rb') as f:
    lista = pickle.load(f)
df_info = pd.DataFrame(lista)

In [5]:
# Build the detail table and unwrap the one-element lists stored in each cell
# of the scalar columns. "Funding" and "Funding2" are left wrapped.
df_info = pd.DataFrame(lista)

wrapped_columns = (
    "Name",
    "Description",
    "Address",
    "Industries",
    "Technologies",
    "Other fields",
    "Funding stage",
    "Founded",
    "Employees",
    "Business model",
    "Target",
    "Spinoff participants",
)
for column in wrapped_columns:
    df_info[column] = [cell[0] for cell in df_info[column]]

In [6]:
# Inner-join the listing rows with the per-company details, matching the
# listing's "Company" column against the detail table's "Name" column.
datosFinal = pd.merge(df, df_info, how='inner', left_on='Company', right_on='Name')

In [7]:
# Keep the first row per company name, then the first row per funding text,
# and renumber the rows from 0.
# NOTE(review): "Funding2" cells are one-element lists as built by the scraping
# cell; Series.duplicated is kept as in the original - confirm it handles list
# cells in the pandas version in use.
first_per_name = ~datosFinal["Name"].duplicated(keep='first')
dades = datosFinal.loc[first_per_name]

first_per_funding = ~dades["Funding2"].duplicated(keep='first')
dades = dades.loc[first_per_funding].reset_index(drop=True)


In [10]:
# Label prefixes (lower-cased) that mark a line as an equity round; 'cpaital'
# is a misspelling handled on purpose, as in the original code.
CAPITAL_LABEL_PREFIXES = ('capital', 'cpaital', 'minority stake')


def _split_funding_rounds(text):
    """Extract (amounts, investors) from one company's funding text.

    The text is split on newlines and read in steps of three lines starting at
    the second line: amount, label, investors.
    NOTE(review): this fixed 3-line layout is assumed from the original
    indexing - confirm against the page.

    A round is kept when its label starts with one of CAPITAL_LABEL_PREFIXES
    and does not contain 'source'. Both returned lists always have the same
    length: a kept round with no investor line gets None.
    """
    if not text:
        return [], []

    lines = text.split("\n")
    amounts, backers = [], []
    # pos + 1 must be a valid index because the label line is always read.
    for pos in range(1, len(lines) - 1, 3):
        label = lines[pos + 1].lower().strip()
        if not label.startswith(CAPITAL_LABEL_PREFIXES) or 'source' in label:
            continue
        amounts.append(lines[pos])
        if pos + 2 < len(lines):
            backers.append(lines[pos + 2].replace('Investors:', '').strip())
        else:
            # Pad so amounts and investors stay aligned row by row.
            backers.append(None)
    return amounts, backers


conteoRegistros = []  # number of rounds kept per company, in row order
capitales = []        # flat list of round amounts
investors = []        # flat list of round investors, aligned with `capitales`

# Iterate over the column directly: this does not depend on the index labels
# and does not overwrite the global `lista` holding the scraped records.
for raw in dades["Funding2"]:
    # Each cell is a one-element list holding the funding text (or None).
    text = raw[0] if isinstance(raw, list) and raw else None
    capital_vals, investor_vals = _split_funding_rounds(text)

    conteoRegistros.append(len(capital_vals))
    capitales.extend(capital_vals)
    investors.extend(investor_vals)

# The next cell assigns both lists as columns of the repeated frame, so their
# lengths must match the total number of repeated rows.
assert len(capitales) == len(investors) == sum(conteoRegistros)

print(len(capitales))
print(len(investors))


622
622


In [11]:
# Repeat every company row once per extracted funding round, renumber the rows,
# and attach the flattened per-round values as two new columns.
row_labels = dades.index.repeat(conteoRegistros)
df_repetido = (
    dades.loc[row_labels]
    .reset_index(drop=True)
    .assign(capital_prev=capitales, investors=investors)
)

In [12]:
# Drop the raw funding columns; the per-round values now live in
# "capital_prev" and "investors".
raw_funding_columns = ['Funding', 'Funding2']
datosFinal = df_repetido.drop(columns=raw_funding_columns)

In [11]:
# Display the final table: one row per company and extracted funding round.
datosFinal

Unnamed: 0,Startup,Company,Category,Investment,Investors,Date,URL,Name,Description,Address,...,Technologies,Other fields,Funding stage,Founded,Employees,Business model,Target,Spinoff participants,capital_prev,investors
0,TRAVELPERK,TRAVELPERK SL,Traveltech & Leisure,190M €,"Atomico, EQT Growth, Kinnevik",January 2025,startup/barcelona/travelperk/1195,TRAVELPERK SL,Travelperk is a business travel platform pione...,"Avinguda Diagonal, 211 Floor 11 08018 Barcelona",...,"AI & Big Data, \r\n\t\t \t\t\r\...",Scaleup,Series C (20 M EUR - 200 M EUR),2015,More than 50,Saas,Business,None / Not a spinoff,190M € (JANUARY 2025),"Atomico, EQT Growth, Kinnevik"
1,TRAVELPERK,TRAVELPERK SL,Traveltech & Leisure,190M €,"Atomico, EQT Growth, Kinnevik",January 2025,startup/barcelona/travelperk/1195,TRAVELPERK SL,Travelperk is a business travel platform pione...,"Avinguda Diagonal, 211 Floor 11 08018 Barcelona",...,"AI & Big Data, \r\n\t\t \t\t\r\...",Scaleup,Series C (20 M EUR - 200 M EUR),2015,More than 50,Saas,Business,None / Not a spinoff,95M € (JANUARY 2024),"Softbank, Kinnevik, Felix Capital"
2,TRAVELPERK,TRAVELPERK SL,Traveltech & Leisure,190M €,"Atomico, EQT Growth, Kinnevik",January 2025,startup/barcelona/travelperk/1195,TRAVELPERK SL,Travelperk is a business travel platform pione...,"Avinguda Diagonal, 211 Floor 11 08018 Barcelona",...,"AI & Big Data, \r\n\t\t \t\t\r\...",Scaleup,Series C (20 M EUR - 200 M EUR),2015,More than 50,Saas,Business,None / Not a spinoff,241.9M € (JANUARY 2022),"Kinnevik, General Catalyst Partners, business ..."
3,TRAVELPERK,TRAVELPERK SL,Traveltech & Leisure,190M €,"Atomico, EQT Growth, Kinnevik",January 2025,startup/barcelona/travelperk/1195,TRAVELPERK SL,Travelperk is a business travel platform pione...,"Avinguda Diagonal, 211 Floor 11 08018 Barcelona",...,"AI & Big Data, \r\n\t\t \t\t\r\...",Scaleup,Series C (20 M EUR - 200 M EUR),2015,More than 50,Saas,Business,None / Not a spinoff,132M € (APRIL 2021),Greyhound Capital
4,TRAVELPERK,TRAVELPERK SL,Traveltech & Leisure,190M €,"Atomico, EQT Growth, Kinnevik",January 2025,startup/barcelona/travelperk/1195,TRAVELPERK SL,Travelperk is a business travel platform pione...,"Avinguda Diagonal, 211 Floor 11 08018 Barcelona",...,"AI & Big Data, \r\n\t\t \t\t\r\...",Scaleup,Series C (20 M EUR - 200 M EUR),2015,More than 50,Saas,Business,None / Not a spinoff,53M € (JULY 2019),"KINNEVIK, PARTNERS OF DST GLOBAL, TARGET GLOBA..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,ZOUNDREAM,ZOUNDREAM S.L.,HealthtechBusiness Services & Software,,n.a.,May 2022,startup/barcelona/zoundream/5965,ZOUNDREAM S.L.,Zoundream revolutionizes infant healthcare wit...,Carrer de Pallars 108 08018 Barcelona,...,"AI & Big Data, \r\n\t\t \t\t\r\...",Scaleup,Series A (1 M EUR - < 5 M EUR),2019,From 11 to 20,"Saas, \nSoftware licence, \nSubscription","Business, \nConsumer",None / Not a spinoff,(MAY 2022),n.a.
619,ZOUNDREAM,ZOUNDREAM S.L.,HealthtechBusiness Services & Software,,n.a.,May 2022,startup/barcelona/zoundream/5965,ZOUNDREAM S.L.,Zoundream revolutionizes infant healthcare wit...,Carrer de Pallars 108 08018 Barcelona,...,"AI & Big Data, \r\n\t\t \t\t\r\...",Scaleup,Series A (1 M EUR - < 5 M EUR),2019,From 11 to 20,"Saas, \nSoftware licence, \nSubscription","Business, \nConsumer",None / Not a spinoff,(DECEMBER 2020),Mision Gate
620,SILT,SILT DIGITAL ID SL,Business Services & SoftwareLegaltech,,4Founders Capital,March 2022,startup/barcelona/silt/5339,SILT DIGITAL ID SL,User verification through a ai based digital id.,Carrer Escipió 22bis Entlo 4a 08023 Barcelona,...,"AI & Big Data, \r\n\t\t \t\t\r\...","Social Economy, \nDeeptech",Seed (0.25 M EUR - < 1 M EUR),2020,Undisclosed,R+D+I Services,-,None / Not a spinoff,(MARCH 2022),4Founders Capital
621,CONKAU,"CONSTRUMARKET DIGITAL, S.L.",Business Services & Software,,Antai Ventures,January 2022,startup/barcelona/conkau/6045,"CONSTRUMARKET DIGITAL, S.L.",Platform that optimizes purchasing processes i...,"Plaça Pau Vila, 1 P. 1, Sector Ad., Of. 1 Ad 0...",...,Connectivity,-,Pre-seed (< 0.25 M EUR),2023,From 6 to 10,Saas,Business,None / Not a spinoff,(JANUARY 2022),Antai Ventures


In [None]:
# Save the final table twice: as CSV (without the index column) and as a
# pandas pickle.
csv_path = 'data/datosFinal.csv'
pickle_path = "data/datosFinal.pkl"

datosFinal.to_csv(csv_path, index=False)
datosFinal.to_pickle(pickle_path)