In [1]:
import re
import pandas as pd
import numpy as np

# Paquetes llamadas 
import time

# Scrapping 
import requests

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### **Extracción de empresas**

In [2]:
data = []

for pa in range(1, 5):
    resp = requests.post("https://startupshub.catalonia.com/investments-in-catalan-startups?pageNumber=4&year=0", verify=False)
    soup = BeautifulSoup(resp.text, "html.parser")
    
    # Buscar todos los elementos con la clase 'items'
    for item in soup.find_all("div", class_="items"):
        # Extraer la URL desde el atributo `url` en la etiqueta <a>
        url_element = item.find("a", attrs={"url": True})
        url = url_element["url"] if url_element else None

        # Extraer información
        startup_name = item.find("h4").text.strip() if item.find("h4") else None
        company_name = item.find("div", class_="item-text").find_all("p")[0].text.strip()
        category = item.find("div", class_="item-text").find_all("p")[1].text.strip()
        investment_amount = item.find("div", class_="col-md-4").find("p").text.strip()
        
        investors_text = item.find("div", class_="col-md-4").find("strong")
        investors = investors_text.find_next_sibling(string=True).strip() if investors_text else None
        
        date = item.find("p", class_="date").text.strip() if item.find("p", class_="date") else None

        # Agregar datos a la lista
        data.append([startup_name, company_name, category, investment_amount, investors, date, url])

# Crear DataFrame
df = pd.DataFrame(data, columns=["Startup", "Company", "Category", "Investment", "Investors", "Date", "URL"])



### **Extracción de info de cada empresa**

In [3]:
lista = []

In [4]:
for i in range(0, df.shape[0]):
    # Configurar el driver (asegúrate de poner la ruta correcta a tu WebDriver)
    driver = webdriver.Chrome()  # Usa webdriver.Firefox() si usas Firefox

    # Nos quedamos con la url 
    url = "https://startupshub.catalonia.com/" + df.loc[i, "URL"]

    # Extraemos la información de la URL
    resp = requests.get(url, verify=False)
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extraer información relevante
    name = soup.find('h1', class_='big_title').text.strip()
    description = soup.find('meta', {'name': 'description'})['content']
    address = soup.find('strong', class_='company-name').find_next_sibling(string=True).strip()
    founded = soup.find('span', class_='founded').find('strong').text.strip()
    employees = soup.find('span', class_='employers').find('strong').text.strip()
    industries = soup.find('span', class_='industries information-item').text.replace('Industries:', '').strip()
    technologies = soup.find('span', class_='technologies information-item').text.replace('Technologies:', '').strip()
    others_fields = soup.find('span', class_='otherFields information-item').text.replace('Other fields:', '').strip()
    financial_founded = soup.find('span', class_='founded information-item').text.replace('Founded:', '').strip()
    business_model = soup.find('span', class_='model information-item').text.replace('Business model:', '').strip()
    targets = soup.find('span', class_='target information-item').text.replace('Target:', '').strip()
    spinoff = soup.find('span', class_='spinoffs information-item').text.replace('Spinoff participants:', '').strip()
    funding_info = []
    for funding in soup.find_all('div', class_='item punts-servei-content row-fluid'):
        amount = funding.find('h3').text.strip()
        date = funding.find('h3').text.strip().split('(')[-1].replace(')', '')
        # investors = funding.find('p', text=lambda t: t and 'Investors:' in t).text.replace('Investors:', '').strip()
        # funding_info.append({'amount': amount, 'date': date, 'investors': investors})
        funding_info.append({'amount': amount, 'date': date})

    # Cargar la página
    time.sleep(10)
    driver.get(url)
    
    financial_funding_stage = soup.find('span', class_='stage information-item').text.replace('Funding stage:', '').strip()
    financial_employees = soup.find('span', class_='employers information-item').text.replace('Employees:', '').strip()

    financial_funding_stage = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "fundingStageTxt"))).text
    financial_employees = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "employeesTxt"))).text

    founding = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "generic-container")))[0].text

    # Cerrar el navegador
    driver.quit()

    data = {
            'Name': [name],
            'Description': [description],
            'Address': [address],
            'Industries': [industries],
            'Technologies': [technologies],
            'Other fields': [others_fields],
            'Funding stage': [financial_funding_stage],
            'Founded': [financial_founded],
            'Employees': [financial_employees],
            'Business model': [business_model],
            'Target': [targets],
            'Spinoff participants': [spinoff],
            'Funding': [funding_info], 
            'Funding2': [founding]
        }
    
    lista.append(data)



In [None]:
df_info = pd.DataFrame(lista)

Unnamed: 0,Name,Description,Address,Industries,Technologies,Other fields,Funding stage,Founded,Employees,Business model,Target,Spinoff participants,Funding,Funding2
0,[EQUITO APP S.L],[A real estate investment platform enabling us...,[Paseig De Gracia 19 08007 Barcelona],"[Proptech, \r\n\t\t \t\t\r\n\t\...","[Fintech & Insurtech, \r\n\t\t ...",[Social Economy],[Seed (0.25 M EUR - < 1 M EUR)],[2021],[From 11 to 20],[Marketplace],[Consumer],[None / Not a spinoff],"[[{'amount': '0.3M € (June 2024)', 'date': 'Ju...",[Funding\n0.3M € (JUNE 2024)\nCapital\nInvesto...
1,[VORA],"[Vora is inspired by the city of Barcelona, pr...","[Calle Cavallers, 54 P. 4 Pta. 1 08034 Barcelona]",[Fashion & Design],[E-commerce],[Social Economy],[Seed (0.25 M EUR - < 1 M EUR)],[2022],[Undisclosed],[-],[Consumer],[None / Not a spinoff],"[[{'amount': '0.3M € (June 2024)', 'date': 'Ju...",[Funding\n0.3M € (JUNE 2024)\nCapital\nInvesto...
2,[SEEDFY ECOSYSTEM S.L.],[LexDoka is a collaborative platform that turn...,"[Calle Pallars, 108 08018 Barcelona]","[Business Services & Software, \r\n\t\t ...","[Fintech & Insurtech, \r\n\t\t ...",[Industry 4.0],[Seed (0.25 M EUR - < 1 M EUR)],[2021],[Undisclosed],[Saas],[Business],[None / Not a spinoff],"[[{'amount': '0.3M € (May 2024)', 'date': 'May...",[Funding\n0.3M € (MAY 2024)\nCapital\nInvestor...
3,[KOKUAI S.L.],[Kokuai is a Barcelona based start-up founded ...,"[Carrer Llacuna, 162 08018 Barcelona]","[Healthtech, \r\n\t\t \t\t\r\n\...","[AI & Big Data, \r\n\t\t \t\t\r...",[-],[Seed (0.25 M EUR - < 1 M EUR)],[2020],[From 1 to 5],[Saas],[Business],[None / Not a spinoff],"[[{'amount': '0.3M € (September 2024)', 'date'...",[Funding\n0.3M € (SEPTEMBER 2024)\nCapital\nIn...
4,[FLAMAID S.L.],[Flamaid is a safety device equipped with an a...,"[Calle Salvador Espriu, 47 P. 5 Pta. 1 08005 B...",[Hardware],[IoT & Sensors],[Social Economy],[Seed (0.25 M EUR - < 1 M EUR)],[2023],[Undisclosed],[Development & Manufacturing],[Consumer],[None / Not a spinoff],"[[{'amount': '0.1M € (November 2024)', 'date':...",[Funding\n0.1M € (NOVEMBER 2024)\nCapital\nInv...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,[IMPERFECTUS BOX S.L],[Talkual is a delivery service for imperfect s...,"[Avinguda Lleida, 32 25250,0 Bellpuig]","[E-commerce & Marketplaces, \r\n\t\t ...","[E-commerce, \r\n\t\t \t\t\r\n\...",[Circular Economy],[Pre-seed (< 0.25 M EUR)],[2019],[From 21 to 50],"[Ecommerce & Trade, \nMarketplace, \nSaas, \nS...","[Business, \nConsumer]",[Company /Other Centre],"[[{'amount': '0.1M € (June 2024)', 'date': 'Ju...",[Funding\n0.1M € (JUNE 2024)\nCapital\nInvesto...
64,[PLACE4PLAN S.L.],[Platform for renting private pools by the hour.],[Urbanizacion Cabrils Ii (crta Cabrils A Vilas...,"[Traveltech & Leisure, \r\n\t\t ...","[AI & Big Data, \r\n\t\t \t\t\r...",[Sharing Economy],[Pre-seed (< 0.25 M EUR)],[2022],[Undisclosed],[Marketplace],[Consumer],[None / Not a spinoff],"[[{'amount': '0.1M € (June 2024)', 'date': 'Ju...",[Funding\n0.1M € (JUNE 2024)\nCapital\nInvesto...
65,[APPASSIONE S.L.],[Online food ordering platform connecting arti...,[Carrer De La Rosella 17 08035 Barcelona],"[Food, \r\n\t\t \t\t\r\n\t\t ...","[E-commerce, \r\n\t\t \t\t\r\n\...",[Smart Cities],[Pre-seed (< 0.25 M EUR)],[2023],[From 1 to 5],[Ecommerce & Trade],"[Business, \nConsumer]",[None / Not a spinoff],"[[{'amount': '0.0M € (June 2024)', 'date': 'Ju...",[Funding\n0.0M € (JUNE 2024)\nCapital\nInvesto...
66,[CID BRAND SL],[We inspire girls to take the world by storm.t...,"[Pasaje Marimon, 23 Loc 08021 Barcelona]",[Fashion & Design],[E-commerce],[-],[Seed (0.25 M EUR - < 1 M EUR)],[2016],[From 6 to 10],[Saas],[-],[None / Not a spinoff],"[[{'amount': '(October 2024)', 'date': 'Octobe...",[Funding\n(OCTOBER 2024)\nCapital\nInvestors: ...


In [6]:
df_info = pd.DataFrame(lista)
df_info["Name"] = [d[0] for d in df_info["Name"]]
df_info["Description"] = [d[0] for d in df_info["Description"]]
df_info["Address"] = [d[0] for d in df_info["Address"]]
df_info["Industries"] = [d[0] for d in df_info["Industries"]]
df_info["Technologies"] = [d[0] for d in df_info["Technologies"]]
df_info["Other fields"] = [d[0] for d in df_info["Other fields"]]
df_info["Funding stage"] = [d[0] for d in df_info["Funding stage"]]
df_info["Founded"] = [d[0] for d in df_info["Founded"]]
df_info["Employees"] = [d[0] for d in df_info["Employees"]]
df_info["Business model"] = [d[0] for d in df_info["Business model"]]
df_info["Target"] = [d[0] for d in df_info["Target"]]
df_info["Spinoff participants"] = [d[0] for d in df_info["Spinoff participants"]]

In [7]:
df_info = df_info.drop_duplicates(subset=['Name'], keep='first')

In [88]:
datosFinal = df.merge(df_info, left_on='Company', right_on='Name')

In [89]:
conteoRegistros = []
capitales = []
investors = []

for j in range(datosFinal.shape[0]):
    lista = datosFinal.loc[j, "Funding2"][0].split("\n")
    capital_prev_values = [lista[i - 1] for i, val in enumerate(lista) if val.lower() == 'capital' and i > 0]
    investors_cleaned = [val.replace('Investors:', '').strip() for val in lista if 'Investors' in val]
    conteoRegistros.append(len(capital_prev_values))
    capitales.append(capital_prev_values)
    investors.append(investors_cleaned)

capitales = [item for sublista in capitales for item in sublista]
investors = [item for sublista in investors for item in sublista]

In [90]:
df_repetido = datosFinal.loc[datosFinal.index.repeat(conteoRegistros)].reset_index(drop=True)
df_repetido["capital_prev"] = capitales
df_repetido["investors"] = investors

In [94]:
datosFinal = df_repetido.drop(columns=['Funding', 'Funding2'])

In [None]:
datosFinal.to_csv('datosFinal.csv', index=False)
datosFinal.to_pickle("./datosFinal.pkl") 