In [1]:
import re
import pandas as pd
import numpy as np

# Paquetes llamadas 
import time

# Scraping
import requests

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### **Extracción de empresas**

In [6]:
# Scrape the investment listing: one request per (year, page) pair, one row per card.
LISTING_TIMEOUT_SECONDS = 30  # give up on a stalled request instead of blocking the kernel

data = []
for year in range(2022, 2026):
    for pa in range(1, 8):
        page_url = f"https://startupshub.catalonia.com/investments-in-catalan-startups?pageNumber={pa}&year={year}"
        # NOTE(review): verify=False turns off TLS certificate validation; confirm it is
        # still required for this host before reusing the notebook.
        resp = requests.post(page_url, verify=False, timeout=LISTING_TIMEOUT_SECONDS)
        soup = BeautifulSoup(resp.text, "html.parser")

        # Every investment card is a <div class="items">.
        for item in soup.find_all("div", class_="items"):
            # The link to the detail page is stored in a custom `url` attribute of the <a> tag.
            link = item.find("a", attrs={"url": True})
            detail_url = link["url"] if link is not None else None

            title = item.find("h4")
            startup_name = title.text.strip() if title is not None else None

            # First <p> of the text box is read as the company name, the second as the
            # category; a card missing either one yields None instead of aborting the scrape.
            text_box = item.find("div", class_="item-text")
            paragraphs = text_box.find_all("p") if text_box is not None else []
            company_name = paragraphs[0].text.strip() if len(paragraphs) > 0 else None
            category = paragraphs[1].text.strip() if len(paragraphs) > 1 else None

            amount_box = item.find("div", class_="col-md-4")
            amount_tag = amount_box.find("p") if amount_box is not None else None
            investment_amount = amount_tag.text.strip() if amount_tag is not None else None

            # The investors are the bare text node that follows the <strong> label.
            investors_label = amount_box.find("strong") if amount_box is not None else None
            investors_node = (
                investors_label.find_next_sibling(string=True) if investors_label is not None else None
            )
            investors = investors_node.strip() if investors_node is not None else None

            date_tag = item.find("p", class_="date")
            date = date_tag.text.strip() if date_tag is not None else None

            # Collect the row; the column order must match the DataFrame built below.
            data.append([startup_name, company_name, category, investment_amount, investors, date, detail_url])

# Build the listing DataFrame
df = pd.DataFrame(data, columns=["Startup", "Company", "Category", "Investment", "Investors", "Date", "URL"])



In [7]:
# Show the scraped investment listing (the printed repr elides the middle rows and wraps wide columns).
print(df)

                Startup                                       Company  \
0                PAYFIT                  PAYFIT RECURSOS HUMANOS S.L.   
1               SEEDTAG                       SEEDTAG ADVERTISING SL.   
2                 PAACK                            PAACK LOGISTICS SL   
3               IMPRESS                              SMILE2IMPRESS SL   
4    CELESTIA AEROSPACE                        CELESTIA AEROSPACE SL.   
..                  ...                                           ...   
461        SECRETSVAULT  LIMITLESS TECHNOLOGIES AND APPLICATIONS S.L.   
462               VOILA                                VOILAAA BIO SL   
463       TIRECATHEALTH                             TIRECAT HEALTH SL   
464             NECTIOS                     COPERNIC TECHNOLOGIES, SL   
465           OMNISCOPE                         OMNISCOPE ESPAÑA, SL.   

                                              Category Investment  \
0                         Business Services & Software

### **Extracción de info de cada empresa**

In [5]:
# Accumulator for the per-startup detail records appended by the scraping loop below.
# It lives in its own cell, so re-running the loop keeps appending to the same list.
lista = []

In [12]:
PAGE_LOAD_DELAY_SECONDS = 10   # pause before each browser page load
ELEMENT_WAIT_SECONDS = 10      # per-element limit for the Selenium waits
DETAIL_TIMEOUT_SECONDS = 30    # give up on a stalled HTTP request instead of hanging


def _labelled_span_text(soup, css_class, label):
    """Return the text of the first ``<span>`` matching ``css_class`` without its caption.

    Args:
        soup: Parsed detail page (BeautifulSoup).
        css_class: Value passed to ``class_`` when searching for the span.
        label: Caption (e.g. ``'Industries:'``) removed from the span text.

    Returns:
        The stripped text, or None when the page has no such span.
    """
    span = soup.find('span', class_=css_class)
    if span is None:
        return None
    return span.text.replace(label, '').strip()


# Visit the detail page of every listed investment and append one record to `lista`.
for i in range(0, df.shape[0]):
    relative_url = df.loc[i, "URL"]
    if not isinstance(relative_url, str):
        # The listing stores None when a card has no link, so there is no page to visit.
        print(f"No hi ha URL per a la startup de la fila {i}, es salta.")
        continue
    url = "https://startupshub.catalonia.com/" + relative_url

    # Static part of the page: fetched with requests and parsed with BeautifulSoup.
    # NOTE(review): verify=False turns off TLS certificate validation; confirm it is needed.
    resp = requests.get(url, verify=False, timeout=DETAIL_TIMEOUT_SECONDS)
    soup = BeautifulSoup(resp.text, "html.parser")

    # Pages without an address are skipped before any browser is started.
    address_element = soup.find('strong', class_='company-name')
    address_node = address_element.find_next_sibling(string=True) if address_element is not None else None
    if not address_node:
        print(f"No hi ha adreça per a la startup de la fila {i}, es salta.")
        continue
    address = address_node.strip()

    # Extract the relevant fields; anything missing from the page becomes None.
    title = soup.find('h1', class_='big_title')
    name = title.text.strip() if title is not None else None
    meta_description = soup.find('meta', {'name': 'description'})
    description = meta_description.get('content') if meta_description is not None else None

    industries = _labelled_span_text(soup, 'industries information-item', 'Industries:')
    technologies = _labelled_span_text(soup, 'technologies information-item', 'Technologies:')
    others_fields = _labelled_span_text(soup, 'otherFields information-item', 'Other fields:')
    financial_founded = _labelled_span_text(soup, 'founded information-item', 'Founded:')
    business_model = _labelled_span_text(soup, 'model information-item', 'Business model:')
    targets = _labelled_span_text(soup, 'target information-item', 'Target:')
    spinoff = _labelled_span_text(soup, 'spinoffs information-item', 'Spinoff participants:')

    # One entry per funding block; the date is whatever follows the last '(' of the heading.
    funding_info = []
    for funding in soup.find_all('div', class_='item punts-servei-content row-fluid'):
        heading = funding.find('h3')
        if heading is None:
            continue
        amount = heading.text.strip()
        date = amount.split('(')[-1].replace(')', '')
        funding_info.append({'amount': amount, 'date': date})

    # Rendered part of the page: funding stage, employees and the funding container are
    # read through a real browser (presumably because they are filled in client-side).
    time.sleep(PAGE_LOAD_DELAY_SECONDS)
    driver = webdriver.Chrome()  # use webdriver.Firefox() when running with Firefox
    try:
        driver.get(url)
        wait = WebDriverWait(driver, ELEMENT_WAIT_SECONDS)
        financial_funding_stage = wait.until(
            EC.presence_of_element_located((By.ID, "fundingStageTxt"))
        ).text
        financial_employees = wait.until(
            EC.presence_of_element_located((By.ID, "employeesTxt"))
        ).text
        founding = wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "generic-container"))
        )[0].text
    finally:
        # Always close the browser, even when a wait times out, so no Chrome process leaks.
        driver.quit()

    # Values stay wrapped in one-element lists: the later cells unwrap them with [0].
    record = {
        'Name': [name],
        'Description': [description],
        'Address': [address],
        'Industries': [industries],
        'Technologies': [technologies],
        'Other fields': [others_fields],
        'Funding stage': [financial_funding_stage],
        'Founded': [financial_founded],
        'Employees': [financial_employees],
        'Business model': [business_model],
        'Target': [targets],
        'Spinoff participants': [spinoff],
        'Funding': [funding_info],
        'Funding2': [founding],
    }

    lista.append(record)



No hi ha adreça per a la startup de la fila 4, es salta.




: 

In [9]:
# Build a frame from the scraped records. Every dict value is a one-element list,
# which is why each cell is printed as [value].
df_info = pd.DataFrame(lista)
print(df_info)

                                      Name  \
0             [SATELIO IOT SERVICES, S.L.]   
1  [DEEPULL DIAGNOSTICS SOCIEDAD LIMITADA]   
2                                 [FLANKS]   
3                        [AUTOCAFLER, S.L]   

                                         Description  \
0                   [Iot connectivity via satellite]   
1  [Deepull is a development-stage clinical diagn...   
2  [Flanks is a provider of wealth management sol...   
3  [Cafler is the first online platform that pass...   

                                             Address  \
0  [Rambla De Catalunya, 124, 2n 3ra 08014 Barcel...   
1      [Carrer Baldiri Reixac, 4- 8 08028 Barcelona]   
2  [Carrer De Tarragona, 157, 16a Planta 08014 Ba...   
3          [Carrer Dels Caponata, 8 08034 Barcelona]   

                                          Industries  \
0  [ICT & Mobile, \r\n\t\t                \t\t\r\...   
1                                       [Healthtech]   
2                     [Business Service

In [6]:
# Columns whose scraped value is wrapped in a one-element list and must be unwrapped.
# 'Funding' and 'Funding2' are deliberately absent: they stay wrapped.
wrapped_columns = [
    "Name",
    "Description",
    "Address",
    "Industries",
    "Technologies",
    "Other fields",
    "Funding stage",
    "Founded",
    "Employees",
    "Business model",
    "Target",
    "Spinoff participants",
]

# Rebuild the frame from the scraped records, then replace each wrapped cell by its
# single element, one column at a time.
df_info = pd.DataFrame(lista)
for column in wrapped_columns:
    df_info[column] = [wrapped[0] for wrapped in df_info[column]]

In [7]:
# Keep only the first scraped record for each 'Name'; later repeats of the same name are dropped.
df_info = df_info.drop_duplicates(subset=['Name'], keep='first')

In [88]:
# Attach the scraped profile to every listing row whose 'Company' equals a profile 'Name'.
# merge defaults to an inner join, so listing rows without a scraped profile are dropped.
datosFinal = df.merge(df_info, left_on='Company', right_on='Name')

In [89]:
# Parse the rendered funding text ('Funding2') of every merged row into flat, parallel lists:
#   conteoRegistros - number of 'capital' entries found per row
#   capitales       - the line preceding each 'capital' line, for all rows in order
#   investors       - the cleaned 'Investors' lines, aligned one-to-one with `capitales`
conteoRegistros = []
capitales = []
investors = []

for j in range(datosFinal.shape[0]):
    # A dedicated name is used on purpose: rebinding `lista` here would overwrite the
    # scraped records that the df_info cells are built from.
    funding_lines = datosFinal.loc[j, "Funding2"][0].split("\n")

    capital_prev_values = [
        funding_lines[pos - 1]
        for pos, line in enumerate(funding_lines)
        if line.lower() == 'capital' and pos > 0
    ]
    investors_cleaned = [
        line.replace('Investors:', '').strip() for line in funding_lines if 'Investors' in line
    ]

    # The next cell repeats each row once per capital entry and assigns both flat lists
    # as columns, so each row must contribute the same number of investors as capitals.
    # Without this, one row with differing counts shifts the investors of every later row.
    n_rounds = len(capital_prev_values)
    if len(investors_cleaned) != n_rounds:
        print(
            f"Row {j}: {n_rounds} capital entries but {len(investors_cleaned)} investor lines; "
            "investors padded/truncated to match."
        )
        investors_cleaned = (investors_cleaned + [None] * n_rounds)[:n_rounds]

    conteoRegistros.append(n_rounds)
    capitales.extend(capital_prev_values)
    investors.extend(investors_cleaned)

In [90]:
# Repeat every merged row once per capital entry found in its funding text, renumber the
# rows, and attach the flattened capital and investor values as two new columns.
repeated_labels = datosFinal.index.repeat(conteoRegistros)
df_repetido = (
    datosFinal.loc[repeated_labels]
    .reset_index(drop=True)
    .assign(capital_prev=capitales, investors=investors)
)

In [94]:
# Drop the raw funding columns: 'Funding2' has already been expanded into
# 'capital_prev' and 'investors', and 'Funding' is not used by the visible cells.
# NOTE(review): this rebinds datosFinal without 'Funding2', so the parsing cell above
# cannot be re-run without first re-running the merge.
datosFinal = df_repetido.drop(columns=['Funding', 'Funding2'])

In [None]:
# Persist the final table twice, both in the current working directory:
# a CSV without the index column, and a pickle of the DataFrame.
datosFinal.to_csv('datosFinal.csv', index=False)
datosFinal.to_pickle("./datosFinal.pkl") 