In [313]:
import pandas as pd
import numpy as np
import pickle
import re

In [314]:
# 2. Carreguem dades
df = pd.read_csv("data/datosFinal.csv")

In [None]:
df

### Preprocessing

1. Company

In [315]:
condicio = (
    df["Company"].str.strip().str.upper().str.endswith("S.L") |
    df["Company"].str.strip().str.upper().str.endswith("SL") |
    df["Company"].str.strip().str.upper().str.endswith("S.L.") |
    df["Company"].str.upper().str.contains("SOCIEDAD LIMITADA")
)

df['es_sociedad_limitada'] = np.where(condicio, 'Sí', 'No')

df.es_sociedad_limitada.value_counts()

es_sociedad_limitada
Sí    544
No     79
Name: count, dtype: int64

2. Names

In [316]:
(df["Company"] == df["Name"]).value_counts()
df = df.drop(columns=["Company"])

3. Investment

In [None]:
df["Investment"] = (
    df["capital_prev"]
    .astype(str)
    .str.extract(r'([\d.,]+M\s?€)')[0]   # agafa només la part numèrica amb M i €
    .str.replace("€", "", regex=False)
    .str.replace("M", "", regex=False)
    .str.replace("m", "", regex=False)
    .str.replace(",", ".", regex=False)       # canvia coma per punt decimal
    .str.replace(" ", "", regex=False)        # elimina espais
)

# 2. Conversió a numèric (float), posant NaN si hi ha errors
df["Investment"] = pd.to_numeric(df["Investment"], errors="coerce")

# 3. Substitueix 0 per NaN
df["Investment"] = df["Investment"].replace(0.0, np.nan)

# per tenir 0 en comptes de nan
df["Investment"] = df["Investment"].fillna(0)


df.Investment

4. Investor

In [318]:
df.Investors = df.investors
df = df.drop(columns=['investors'])

In [112]:
pd.set_option('display.max_colwidth', None)
print(df['Investors'].tolist())


['Atomico, EQT Growth, Kinnevik', 'Softbank, Kinnevik, Felix Capital', 'Kinnevik, General Catalyst Partners, business angels (Gilian Tans, Joel Cutler).', 'Greyhound Capital', 'KINNEVIK, PARTNERS OF DST GLOBAL, TARGET GLOBAL, FELIX CAPITAL, SUNSTONE, LOCALGLOBE', 'KINNEVIK, YURI MILNER, TOM STAFFORD, FELIX CAPITAL, TARGET GLOBAL, SPARK CAPITAL, LOCALGLOBE, SUNSTONE, AMPLO', 'TARGET GLOBAL, FELIX CAPITAL, SPARK CAPITAL, SUNSTONE', 'SPARK CAPITAL PARTNERS LLC, SUNSTONE CAPITAL A/S, LOCALGLOBE LLP', 'General Catalyst', 'Atomico, Creandum, Tiger Global Management, GIC, K Fund, CRV', 'Creandum, Point Nine, Tiger Global Management, K Fund, CRV, Columbia Lake Partners', 'Creandum, Point Nine, K Fund, CRV', 'Creandum, Point Nine, K Fund', 'Hyperion Fund, SETT', 'Global Porfolio Investments', 'n.a.', 'SEPIDES Gestión', 'Columbus Venture Partner, Panakès Partners i Mérieux', 'AXISC, DTI, Alta Life Sciences, Werfen Life Group, UI Investissement, Kurma Partners, Asabys Partners', 'Alta Life Scienc

In [None]:
all_investors = []

for group in df['Investors'].dropna().tolist():
    splitted = [inv.strip() for inv in group.split(',')]
    all_investors.extend(splitted)

investor_series = pd.Series(all_investors)

# Comptar freqüències
freq_table = investor_series.value_counts().reset_index()

# Renombrar columnes
freq_table.columns = ['Investor', 'Frequency']

# Mostrar la taula
print(freq_table)

In [None]:
import matplotlib.pyplot as plt

# Suposem que tens aquesta taula:
# freq_table.columns = ['Investor', 'Frequency']

plt.figure(figsize=(8, 5))
plt.hist(freq_table['Frequency'], bins=range(1, freq_table['Frequency'].max() + 2), edgecolor='black')
plt.title('Histograma de Freqüència dels Inversors')
plt.xlabel('Nombre de vegades que un inversor apareix')
plt.ylabel('Nombre d\'inversors')
plt.xticks(range(1, freq_table['Frequency'].max() + 1))
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [321]:
#De moment no faig la columna investors perquè hi ha molts pocs que es repeteixen i encara haig de pensar com fer per saber si són persones o no

5. Date

In [322]:
df.Date = df['capital_prev'].str.extract(r'\((.*?)\)')

In [323]:
df = df.drop(columns=['capital_prev'])

In [None]:
df

In [None]:
df["Mes Inversion"] = [mes.split(" ")[0] for mes in df["Date"]]
df["Year Inversion"] = [mes.split(" ")[1] for mes in df["Date"]]
df
## Eliminar variable Date

6. Address

Sacamos las coordenadas a partir de la dirección

In [None]:
df.Address

In [119]:
def limpiar_direccion(dir_raw):
    dir_limpia = dir_raw
    
    # Elimina paréntesis y su contenido
    dir_limpia = re.sub(r"\(.*?\)", "", dir_limpia)
    
    # Elimina referencias tipo P., Pta., Planta, Puerta, Loc, Sector, Sobreático, Num, Tienda
    dir_limpia = re.sub(r"\b(P\. |Pta\.?|Planta|Puerta|Loc|Sector|Sobreático|Num|Tienda)\b\.?\s*\d*\w*", "", dir_limpia, flags=re.IGNORECASE)
    
    # Elimina referencias tipo P., Pta., Planta, Puerta, Loc, Sector, Sobreático, Num, Tienda
    dir_limpia = re.sub(r"C/", "Carrer", dir_limpia, flags=re.IGNORECASE)
    dir_limpia = re.sub(r"Urbanizacion", "Urbanizacio", dir_limpia, flags=re.IGNORECASE)
    dir_limpia = re.sub(r"Rbla", "Rambla", dir_limpia, flags=re.IGNORECASE)
    dir_limpia = re.sub(r"Plaza", "Plaça", dir_limpia, flags=re.IGNORECASE)
    dir_limpia = re.sub(r"Pasaje", "Passatge", dir_limpia, flags=re.IGNORECASE)
    dir_limpia = re.sub(r"Paseig", "Passeig", dir_limpia, flags=re.IGNORECASE)
    
    # Elimina códigos postales (5 dígitos)
    dir_limpia = re.sub(r"\b\d{5}\b", "", dir_limpia)
    
    # Elimina comas innecesarias y múltiples espacios
    dir_limpia = re.sub(r",", "", dir_limpia)
    dir_limpia = re.sub(r"\s+", " ", dir_limpia)
    
    # Limpieza final de espacios
    dir_limpia = dir_limpia.strip()
    
    return dir_limpia

In [120]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time

# Inicializa el geolocalizador
geolocator = Nominatim(user_agent="GetLoc")

# Función segura para geocodificar
def safe_geocode(direccion):
    try:
        location = geolocator.geocode(direccion, timeout=10)
        if location:
            latituds = location.latitude
            longituds = location.longitude
        else:
            print("No se encontró la dirección.")
            latituds = np.nan
            longituds = np.nan
    except GeocoderTimedOut:
        print("Timeout — intentando de nuevo...")
        time.sleep(1)
        return safe_geocode(direccion)
    except Exception as e:
        print("Error:", e)
    return (latituds, longituds)

In [121]:
direcciones = [re.sub("Calle", "Carrer", direc) for direc in df["Address"]]

In [None]:
direcciones_limpias = [limpiar_direccion(d) for d in direcciones]
direcciones_limpias

In [None]:
# Llama a la función con tu dirección
latituds = [safe_geocode(dir)[0] for dir in direcciones]


In [None]:
longituds = [safe_geocode(dir)[1] for dir in direcciones]

In [None]:
direcciones

In [None]:
print(pd.DataFrame({'latituds': latituds,'longituds': longituds}).iloc[151:200])

7. Industries

In [None]:
# Elimina salts de línia, tabulacions i espais extra
df['Industries'] = df['Industries'].astype(str) \
    .str.replace(r'[\r\n\t]', '', regex=True) \
    .str.replace(r'\s{2,}', ' ', regex=True) \
    .str.strip()

In [None]:
# Inicialitzar llista buida
all_industries = []

# Iterar per cada fila
for item in df['Industries'].dropna():
    industries = [i.strip() for i in item.split(',')]
    all_industries.extend(industries)

industries_series = pd.Series(all_industries)
freq_table = industries_series.value_counts().reset_index()
freq_table.columns = ['Industry', 'Frequency']
Industries = freq_table.Industry
print(freq_table.Industry)

#Veiem que només hi ha 24 industries, podem fer columnes binàries per a cada indústria

In [327]:
Ind = df[["Name","Industries"]]
M = pd.DataFrame(columns = [Industries])
for i in range(len(Ind)):
    a = Ind.Industries[i]
    a_clean = [i.strip() for i in a.split(',')]
    binary_vector = [1 if industry in a_clean else 0 for industry in Industries]
    M.loc[i] = binary_vector

In [328]:
df_combinat = pd.concat([Ind, M], axis=1)

In [None]:
#DF = pd.concat([df, M], axis=1)
#df=DF

#En el cas de voler ajuntar les matrius

8. Technologies

In [None]:
# Elimina salts de línia, tabulacions i espais extra
df['Technologies'] = df['Technologies'].astype(str) \
    .str.replace(r'[\r\n\t]', '', regex=True) \
    .str.replace(r'\s{2,}', ' ', regex=True) \
    .str.strip()
df.Technologies

In [None]:
# Inicialitzar llista buida
all_technologies = []

# Iterar per cada fila
for item in df['Technologies'].dropna():
    technologies = [i.strip() for i in item.split(',')]
    all_technologies.extend(technologies)

# Crear una sèrie i taula de freqüències
technologies_series = pd.Series(all_technologies)
tech_freq_table = technologies_series.value_counts().reset_index()
tech_freq_table.columns = ['Technology', 'Frequency']

# Guardem la llista única de tecnologies
Technologies = tech_freq_table.Technology
print(tech_freq_table.Technology)

# Ara ja pots fer columnes binàries per cada tecnologia

In [331]:
Tech = df[["Name","Technologies"]]
T = pd.DataFrame(columns = [Technologies])
for i in range(len(Tech)):
    a = Tech.Technologies[i]
    a_clean = [i.strip() for i in a.split(',')]
    binary_vector = [1 if technology in a_clean else 0 for technology in Technologies]
    T.loc[i] = binary_vector

In [332]:
df_combinat = pd.concat([Tech, T], axis=1)

In [None]:
#DF = pd.concat([df, M], axis=1)
#df=DF

#En el cas de voler ajuntar les matrius

9. Categoria

In [333]:
df = df.drop("Category", axis = 1)

9. Other fields

In [None]:
# Elimina salts de línia, tabulacions i espais extra
df['Other fields'] = df['Other fields'].astype(str) \
    .str.replace(r'[\r\n\t]', '', regex=True) \
    .str.replace(r'\s{2,}', ' ', regex=True) \
    .str.strip()
df["Other fields"]

In [None]:
# Inicialitzar llista buida
all_fields = []

# Iterar per cada fila
for item in df['Other fields'].dropna():
    fields = [i.strip() for i in item.split(',')]
    all_fields.extend(fields)

# Crear una sèrie i taula de freqüències
fields_series = pd.Series(all_fields)
fields_freq_table = fields_series.value_counts().reset_index()
fields_freq_table.columns = ['Field', 'Frequency']

# Guardem la llista única de camps
OtherFields = fields_freq_table.Field
print(fields_freq_table)

In [337]:
OFields = df[["Name","Other fields"]]
OF = pd.DataFrame(columns = [OtherFields])
for i in range(len(OFields)):
    a = OFields["Other fields"][i]
    a_clean = [i.strip() for i in a.split(',')]
    binary_vector = [1 if otherfield in a_clean else 0 for otherfield in OtherFields]
    OF.loc[i] = binary_vector

In [338]:
df_combinat = pd.concat([OFields, OF], axis=1)

In [None]:
#DF = pd.concat([df, OF], axis=1)
#df=DF

#En el cas de voler ajuntar les matrius

10. Funding stage

In [None]:
#No te sentit aquesta variable ja que ja tenim la quantitat exacte
df = df.drop("Funding stage", axis = 1)

11. Founded

In [None]:
df["Founded"] = df["Founded"].astype(float)
df_clean = df.dropna(subset=['Founded'])

plt.figure(figsize=(10, 6))
plt.hist(df_clean['Founded'], bins=20, edgecolor='black')
plt.title('Histograma de l\'any de fundació')
plt.xlabel('Any de fundació')
plt.ylabel('Nombre de startups')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

12. Employees

In [355]:
df.Employees.value_counts()

Employees
From 1 to 5      120
From 6 to 10     114
From 11 to 20    111
From 21 to 50    111
Undisclosed       96
More than 50      71
Name: count, dtype: int64

13. Business model

In [None]:
# Elimina salts de línia, tabulacions i espais extra
df['Business model'] = df['Business model'].astype(str) \
    .str.replace(r'[\r\n\t]', '', regex=True) \
    .str.replace(r'\s{2,}', ' ', regex=True) \
    .str.strip()
df["Business model"]

In [None]:
# Inicialitzar llista buida
all_models = []

# Iterar per cada fila
for item in df['Business model'].dropna():
    models = [i.strip() for i in item.split(',')]
    all_models.extend(models)

# Crear una sèrie i taula de freqüències
models_series = pd.Series(all_models)
models_freq_table = models_series.value_counts().reset_index()
models_freq_table.columns = ['Business model', 'Frequency']

# Guardar la llista única de models
BusinessModels = models_freq_table['Business model']

print(models_freq_table)

In [363]:
BModels = df[["Name","Business model"]]
BM = pd.DataFrame(columns = [BusinessModels])
for i in range(len(BModels)):
    a = BModels["Business model"][i]
    a_clean = [i.strip() for i in a.split(',')]
    binary_vector = [1 if businessmodel in a_clean else 0 for businessmodel in BusinessModels]
    BM.loc[i] = binary_vector

In [None]:
print(models_freq_table)

In [None]:
sums = BM.sum()
print(sums)

In [None]:
df_combinat = pd.concat([BModels, BM], axis=1)

In [None]:
#DF = pd.concat([df, BM], axis=1)
#df=DF

#En el cas de voler ajuntar les matrius

14. Target

In [None]:
df['Target'] = df['Target'].astype(str) \
    .str.replace(r'[\r\n\t]', '', regex=True) \
    .str.replace(r'\s{2,}', ' ', regex=True) \
    .str.strip()
df["Target"]

In [368]:
df["Target"].value_counts()

Target
Business              246
-                     171
Business, Consumer    133
Consumer               73
Name: count, dtype: int64

15. Spinoff participants

In [394]:
z = df["Spinoff participants"].fillna("").astype(str) \
    .str.replace(r'\(.*?\)', '', regex=True) \
    .str.replace(r'\n', '', regex=True) \
    .str.replace(';', ',', regex=False) \
    .str.strip()

In [395]:
z.value_counts()

Spinoff participants
None / Not a spinoff                                                            517
University                                                                       20
CERCA Centre                                                                     14
Technological Centre                                                             11
CERCA Centre , ICREA Centre                                                       9
Company /Other Centre                                                             9
CERCA Centre , University                                                         7
CERCA Centre , ICREA Centre, University , University - Hospital                   5
ICREA Centre , University, Company /Other Centre                                  3
University , University - Hospital                                                3
CERCA Centre , Technological Centre                                               3
ICREA Centre                                           

In [381]:
df["Spinoff participants"].value_counts()

Spinoff participants
None / Not a spinoff                                                                                                                                                                  517
CERCA Centre (CVC)                                                                                                                                                                      5
CERCA Centre (ICN2); \nICREA Centre                                                                                                                                                     5
CERCA Centre (IDIBAPS); \nICREA Centre; \nUniversity (UPF, UPC); \nUniversity - Hospital (Hospital Clínic)                                                                              5
University (Universitat Rovira i Virgili)                                                                                                                                               4
University (UPC)                                 

Para la variable capita_prev contrastar con las variables Investor y Date si son iguales sino se deberá de trabajar para tener 2 variables independientes. 

In [None]:
df = df.drop(columns=["URL", "Date"])

## Guardamos la base de datos 

In [39]:
df.to_pickle(path="data/datos_preprocesados.pkl")
df.to_csv(path_or_buf="data/datos_preprocesados.csv")