In [15]:
import re
import time
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Opcional per evitar warnings de HTTPS
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


### **Extracción de empresas**

In [None]:
def _clean_text(node):
    """Return the stripped text of a BeautifulSoup node, or None when the node is missing."""
    return node.text.strip() if node is not None else None


def _parse_investment_item(item):
    """Extract one investment record from an ``items`` div of the listing page.

    Returns a list ordered as
    [startup, company, category, investment, investors, date, url].
    Any field whose markup is absent is returned as None instead of raising,
    so a single malformed item cannot abort the whole scrape.
    """
    # The detail-page path lives in a custom `url` attribute of the anchor tag.
    link = item.find("a", attrs={"url": True})
    detail_url = link["url"] if link is not None else None

    startup_name = _clean_text(item.find("h4"))

    # Company name and category are the first two paragraphs inside `item-text`.
    text_box = item.find("div", class_="item-text")
    paragraphs = text_box.find_all("p") if text_box is not None else []
    company_name = _clean_text(paragraphs[0]) if len(paragraphs) > 0 else None
    category = _clean_text(paragraphs[1]) if len(paragraphs) > 1 else None

    # The amount is the first paragraph of the `col-md-4` column; the investors
    # are the text node that follows the first strong tag of that column.
    money_box = item.find("div", class_="col-md-4")
    investment_amount = _clean_text(money_box.find("p")) if money_box is not None else None
    investors_label = money_box.find("strong") if money_box is not None else None
    investors_node = (
        investors_label.find_next_sibling(string=True) if investors_label is not None else None
    )
    investors = investors_node.strip() if investors_node is not None else None

    date = _clean_text(item.find("p", class_="date"))

    return [startup_name, company_name, category, investment_amount, investors, date, detail_url]


# Walk the listing pages year by year and collect one record per investment.
# NOTE(review): the year range and the number of pages per year are hard-coded;
# a year with more pages than this would be silently truncated — confirm against the site.
data = []
for year in range(2022, 2026):
    for pa in range(1, 8):
        page_url = f"https://startupshub.catalonia.com/investments-in-catalan-startups?pageNumber={pa}&year={year}"
        # A timeout keeps a stalled connection from hanging the notebook forever.
        resp = requests.post(page_url, verify=False, timeout=30)
        if not resp.ok:
            # Report failed pages instead of silently contributing zero rows.
            print(f"HTTP {resp.status_code}: {page_url}")
            continue
        soup = BeautifulSoup(resp.text, "html.parser")

        # Every investment is rendered inside a div with class 'items'.
        for item in soup.find_all("div", class_="items"):
            data.append(_parse_investment_item(item))

# Build the listing DataFrame: one row per investment.
df = pd.DataFrame(data, columns=["Startup", "Company", "Category", "Investment", "Investors", "Date", "URL"])

In [14]:
# Show the scraped investment listing (pandas truncates long frames when printing).
print(df)

                Startup                                       Company  \
0                PAYFIT                  PAYFIT RECURSOS HUMANOS S.L.   
1               SEEDTAG                       SEEDTAG ADVERTISING SL.   
2                 PAACK                            PAACK LOGISTICS SL   
3               IMPRESS                              SMILE2IMPRESS SL   
4    CELESTIA AEROSPACE                        CELESTIA AEROSPACE SL.   
..                  ...                                           ...   
461        SECRETSVAULT  LIMITLESS TECHNOLOGIES AND APPLICATIONS S.L.   
462               VOILA                                VOILAAA BIO SL   
463       TIRECATHEALTH                             TIRECAT HEALTH SL   
464             NECTIOS                     COPERNIC TECHNOLOGIES, SL   
465           OMNISCOPE                         OMNISCOPE ESPAÑA, SL.   

                                              Category Investment  \
0                         Business Services & Software

### **Extracción de info de cada empresa**

In [16]:
# Accumulator for the per-company detail records appended by the scraping loop.
# Re-run this cell before re-running the loop, otherwise records are appended twice.
lista = []

In [17]:

def _span_text(soup, css_class, label=''):
    """Return the text of the first span whose class is ``css_class``.

    ``label`` (for example 'Industries:') is removed from the text before
    stripping. Returns None when no such span exists.
    """
    element = soup.find('span', class_=css_class)
    return element.text.replace(label, '').strip() if element else None


# Visit the detail page of every listed company and append one record to `lista`.
for i in range(0, df.shape[0]):
    relative_url = df.loc[i, "URL"]
    # Listing rows without a detail link carry no URL; there is nothing to visit.
    if not isinstance(relative_url, str) or not relative_url:
        print(f"Processant [{i}]: sense URL. Es salta.")
        continue

    url = "https://startupshub.catalonia.com/" + relative_url
    print(f"Processant [{i}]: {url}")

    # Defined before the try so the finally clause never touches an unbound
    # (or stale, already closed) driver when start-up itself fails.
    driver = None
    try:
        # Static HTML first: pages without an address are skipped before
        # paying the cost of launching a browser.
        resp = requests.get(url, verify=False, timeout=30)
        soup = BeautifulSoup(resp.text, "html.parser")

        # The address is the text node right after the `company-name` strong tag.
        address_element = soup.find('strong', class_='company-name')
        address_node = address_element.find_next_sibling(string=True) if address_element else None
        if not address_node:
            print("No hi ha adreça. Es salta.")
            continue
        address = address_node.strip()

        # Static fields read with BeautifulSoup.
        title_element = soup.find('h1', class_='big_title')
        name = title_element.text.strip() if title_element else None
        meta_description = soup.find('meta', {'name': 'description'})
        description = meta_description['content'] if meta_description else None

        # Optional fields: None when the span is not present on the page.
        industries = _span_text(soup, 'industries information-item', 'Industries:')
        technologies = _span_text(soup, 'technologies information-item', 'Technologies:')
        others_fields = _span_text(soup, 'otherFields information-item', 'Other fields:')
        financial_founded = _span_text(soup, 'founded information-item', 'Founded:')
        business_model = _span_text(soup, 'model information-item', 'Business model:')
        targets = _span_text(soup, 'target information-item', 'Target:')
        spinoff = _span_text(soup, 'spinoffs information-item', 'Spinoff participants:')

        # Funding rounds: the heading text is kept as the amount, and whatever
        # follows the last '(' in it is kept as the date.
        funding_info = []
        for funding in soup.find_all('div', class_='item punts-servei-content row-fluid'):
            heading = funding.find('h3')
            if heading is None:
                # A round without a heading has nothing to record; skip it
                # rather than lose the whole company record.
                continue
            amount = heading.text.strip()
            date = amount.split('(')[-1].replace(')', '')
            funding_info.append({'amount': amount, 'date': date})

        # Dynamic fields are only available in a rendered page: use Selenium.
        driver = webdriver.Chrome()
        driver.get(url)
        time.sleep(2)  # short pause to let the page load

        try:
            financial_funding_stage = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "fundingStageTxt"))
            ).text
        except TimeoutException:
            print("No s'ha trobat fundingStageTxt")
            financial_funding_stage = None

        try:
            financial_employees = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "employeesTxt"))
            ).text
        except TimeoutException:
            print("No s'ha trobat employeesTxt")
            financial_employees = None

        try:
            # Text of the first `generic-container` element on the page.
            founding = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "generic-container"))
            )[0].text
        except TimeoutException:
            print("No s'ha trobat generic-container")
            founding = None

        # Each value is wrapped in a one-element list; later cells unwrap the
        # scalar columns and index 'Funding2' with [0], so the shape is kept.
        data = {
            'Name': [name],
            'Description': [description],
            'Address': [address],
            'Industries': [industries],
            'Technologies': [technologies],
            'Other fields': [others_fields],
            'Funding stage': [financial_funding_stage],
            'Founded': [financial_founded],
            'Employees': [financial_employees],
            'Business model': [business_model],
            'Target': [targets],
            'Spinoff participants': [spinoff],
            'Funding': [funding_info],
            'Funding2': [founding]
        }

        lista.append(data)

    except Exception as e:
        # Best effort: report the failure and move on to the next company.
        print(f"Error inesperat a [{i}]: {e}")

    finally:
        # Close the browser only when one was actually started.
        if driver is not None:
            driver.quit()


Processant [0]: https://startupshub.catalonia.com/startup/barcelona/payfit/5048
Processant [1]: https://startupshub.catalonia.com/startup/barcelona/seedtag/5926
Processant [2]: https://startupshub.catalonia.com/startup/barcelona/paack/1514
Processant [3]: https://startupshub.catalonia.com/startup/barcelona/impress/5340
Processant [4]: https://startupshub.catalonia.com/startup/barcelona/celestia-aerospace/5752
Processant [5]: https://startupshub.catalonia.com/startup/barcelona/starkfuture/6603
Processant [6]: https://startupshub.catalonia.com/startup/barcelona/cryptosnacks/6599
Processant [7]: https://startupshub.catalonia.com/startup/barcelona/splicebio/1913
No hi ha adreça. Es salta.
Processant [8]: https://startupshub.catalonia.com/startup/barcelona/yaba/5565
Processant [9]: https://startupshub.catalonia.com/startup/barcelona/housfy/4181
Processant [10]: https://startupshub.catalonia.com/startup/barcelona/submer/1443
Processant [11]: https://startupshub.catalonia.com/startup/barcelon

In [None]:
# Build a DataFrame from the scraped detail records (one dict per company) and print it.
df_info = pd.DataFrame(lista)
print(df_info)

In [None]:
# List the available columns, then display the 'Funding' column as the cell output.
print(df_info.columns)
df_info.Funding

In [25]:
# Rebuild the detail table from the scraped records so this cell can be re-run safely.
df_info = pd.DataFrame(lista)

# The records store each value as a one-element list. Replace these columns by
# their first element; 'Funding' and 'Funding2' are not touched here.
single_value_columns = [
    "Name",
    "Description",
    "Address",
    "Industries",
    "Technologies",
    "Other fields",
    "Funding stage",
    "Founded",
    "Employees",
    "Business model",
    "Target",
    "Spinoff participants",
]
for column in single_value_columns:
    df_info[column] = [wrapped[0] for wrapped in df_info[column]]

In [26]:
# Keep only the first row for each distinct 'Name'; later duplicates are dropped.
df_info = df_info.drop_duplicates(subset=['Name'], keep='first')

In [28]:
# Join listing rows with detail rows where listing 'Company' equals detail 'Name'.
# No `how` is given, so pandas' default join type applies.
datosFinal = df.merge(df_info, left_on='Company', right_on='Name')

In [30]:
# Print the de-duplicated company detail table.
print(df_info)

                                             Name  \
0                    PAYFIT RECURSOS HUMANOS S.L.   
1                         SEEDTAG ADVERTISING SL.   
2                              PAACK LOGISTICS SL   
3                                SMILE2IMPRESS SL   
4                          CELESTIA AEROSPACE SL.   
..                                            ...   
441  LIMITLESS TECHNOLOGIES AND APPLICATIONS S.L.   
442                                VOILAAA BIO SL   
443                             TIRECAT HEALTH SL   
444                     COPERNIC TECHNOLOGIES, SL   
445                         OMNISCOPE ESPAÑA, SL.   

                                           Description  \
0                                      Human resources   
1    Seedtag is the leading contextual advertising ...   
2    Paack is a cutting-edge last-mile delivery com...   
3    Experts in invisible orthodontia. impress was ...   
4    Celestia aerospace is born out of the understa...   
..             

In [89]:
def _split_funding_text(raw_text):
    """Split the scraped funding text into (capital_values, investor_values).

    ``capital_values`` holds, for every line equal to 'capital' (case-insensitive),
    the line immediately before it. ``investor_values`` holds every line containing
    'Investors', with the 'Investors:' label removed. The investor list is padded
    with None or truncated to the length of ``capital_values`` so both lists always
    describe the same number of rows. Returns two empty lists when ``raw_text`` is
    empty or None (for example after a Selenium timeout).
    """
    if not raw_text:
        return [], []

    lines = raw_text.split("\n")
    capital_values = [
        lines[pos - 1] for pos, line in enumerate(lines) if line.lower() == 'capital' and pos > 0
    ]
    investor_values = [line.replace('Investors:', '').strip() for line in lines if 'Investors' in line]

    # Row count is driven by the capital entries; keep the investors aligned per
    # company so a mismatch cannot shift values onto another company's rows.
    # NOTE(review): positional pairing is an assumption about the page layout — confirm.
    expected = len(capital_values)
    investor_values = (investor_values + [None] * expected)[:expected]
    return capital_values, investor_values


conteoRegistros = []  # number of rows each row of datosFinal expands into
capitales = []        # flat list of capital values, in row order
investors = []        # flat list of investor strings, aligned with `capitales`

for j in range(datosFinal.shape[0]):
    # 'Funding2' holds a one-element list wrapping the scraped text (or None).
    wrapped_text = datosFinal.loc[j, "Funding2"]
    raw_text = wrapped_text[0] if isinstance(wrapped_text, (list, tuple)) and wrapped_text else None

    # A dedicated local name is used for the parsed lines so the global `lista`
    # (the list of scraped records) is not overwritten by this cell.
    capital_prev_values, investors_cleaned = _split_funding_text(raw_text)

    conteoRegistros.append(len(capital_prev_values))
    capitales.extend(capital_prev_values)
    investors.extend(investors_cleaned)

In [90]:
# Repeat every row of datosFinal as many times as it has funding entries, then
# attach the flattened capital and investor values as two new columns.
repeated_labels = datosFinal.index.repeat(conteoRegistros)
df_repetido = (
    datosFinal.loc[repeated_labels]
    .reset_index(drop=True)
    .assign(capital_prev=capitales, investors=investors)
)

In [94]:
# Drop the raw funding columns from the expanded frame.
# Note: this rebinds `datosFinal`, so it no longer has the 'Funding2' column afterwards.
datosFinal = df_repetido.drop(columns=['Funding', 'Funding2'])

In [None]:
# Persist the final table twice: as CSV (without the index) and as a pickle
# in the current working directory.
datosFinal.to_csv('datosFinal.csv', index=False)
datosFinal.to_pickle("./datosFinal.pkl") 