In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from transformers import BertTokenizer, BertModel
import pickle

import spacy
import pandas as pd

In [18]:
# 2. Load the data
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
DF = pd.read_pickle("data/datos_preprocesados.pkl")

Creem un dataframe amb el nom i la descripció

In [19]:
# Build the (Name, Description) lookup table used by every text-embedding
# step: one row per distinct Name, keeping the first description seen.
#
# The guard makes the cell safe to re-run: once "Description" has been dropped
# from DF, a second execution leaves df and DF untouched instead of raising a
# KeyError on the column selection.
if "Description" in DF.columns:
    df = (
        DF[["Name", "Description"]]
        .drop_duplicates(subset="Name", keep="first")
        .reset_index(drop=True)  # fresh 0..n-1 index; later cells align on it
    )
    # Remove the raw text from DF; the embedding tables are merged back by Name.
    DF = DF.drop(columns=["Description"])

TF-IDF

In [20]:
# Create the vectorizer
vectorizer = TfidfVectorizer()

# Fit TF-IDF on the 'Description' column. The cast to str mirrors the other
# embedding cells and keeps a non-string value (e.g. NaN) from making
# fit_transform raise; note that such a value becomes the literal text "nan".
tfidf_matrix = vectorizer.fit_transform(df['Description'].astype(str))

# Dense DataFrame with one column per vocabulary term. It reuses df's index so
# every row stays tied to the description it was computed from.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df.index,
    columns=vectorizer.get_feature_names_out(),
)

# Attach Name (aligned on the shared index) so the table can be merged into DF
tfidf_df['Name'] = df['Name']

In [21]:
#tfidf_df

In [22]:
# Left-join the TF-IDF features onto every row of DF, matching on Name.
DF_tfidf = pd.merge(DF, tfidf_df, how="left", on="Name")

Word2Vec

In [23]:
# Load the spaCy model that provides the embeddings
nlp = spacy.load("en_core_web_md")

# Exemple: df = pd.DataFrame({'Name': [...], 'Description': [...]})

# Helper that turns one text into a single fixed-size vector via spaCy
def sentence_vector(text):
    """Return the document-level vector spaCy computes for ``text``.

    Runs the module-level ``nlp`` pipeline on ``text`` and returns the
    resulting ``Doc.vector`` (per the original note, the mean vector of the
    sentence).
    """
    return nlp(text).vector

# Embed every description (cast to str first), then spread each vector over
# one column per dimension and put Name in front as the merge key.
sentence_vecs = df['Description'].astype(str).apply(sentence_vector)
sentence_vecs_df = pd.DataFrame(list(sentence_vecs))
sentence_vecs_df.insert(loc=0, column='Name', value=df['Name'])

In [24]:
#sentence_vecs_df

In [25]:
# Left-join the spaCy sentence vectors onto every row of DF, matching on Name.
DF_W2V = pd.merge(DF, sentence_vecs_df, how="left", on="Name")

Transformers BERT

In [26]:
# Requires the DataFrame df with its 'Description' and 'Name' columns.
# Plain Python list of description strings, in df's row order.
descriptions = list(df['Description'].astype(str))

# Load the BERT tokenizer and model (base uncased; a Catalan model could be
# swapped in here if preferred).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sentence embedding = mean of the per-token BERT representations
def get_embedding(text):
    """Embed ``text`` as the average of BERT's last-layer token states.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model`` with
    gradient tracking disabled. ``last_hidden_state`` holds one vector per
    token; averaging over the token axis gives a fixed-size vector (768 values
    for BERT base, per the original note), returned as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
    # Collapse the token axis, then drop the leftover singleton batch axis.
    return token_states.mean(dim=1).squeeze().numpy()

# Embed every description, one at a time (can be slow on a large dataset)
embeddings = list(map(get_embedding, descriptions))

# One row per description, one column per embedding dimension
# (768 columns for BERT base)
embeddings_df = pd.DataFrame(embeddings)

# Put Name in front as the merge key; .values makes the assignment positional
embeddings_df.insert(loc=0, column='Name', value=df['Name'].values)

In [27]:
#embeddings_df

In [28]:
# Left-join the BERT embeddings onto every row of DF, matching on Name.
DF_TBert = pd.merge(DF, embeddings_df, how="left", on="Name")


In [29]:
# Display the merged TF-IDF table (the notebook shows a truncated preview)
DF_tfidf

Unnamed: 0,Name,Startup,Investment,Num_Inversio,Founded,es_sociedad_limitada,Ind Business Services & Software,Ind Healthtech,Ind Financing,Ind E-commerce & Marketplaces,...,zen,zero,zerod,zones,zoundream,zymvol,àmbit,és,ús,útils
0,TRAVELPERK SL,TRAVELPERK,190.0,8,10.0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
1,TRAVELPERK SL,TRAVELPERK,95.0,7,10.0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
2,TRAVELPERK SL,TRAVELPERK,241.9,6,10.0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
3,TRAVELPERK SL,TRAVELPERK,132.0,5,10.0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
4,TRAVELPERK SL,TRAVELPERK,53.0,4,10.0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,ZOUNDREAM S.L.,ZOUNDREAM,0.0,2,6.0,1,1,1,0,0,...,0.0,0.0,0.0,0.0,0.14802,0.0,0.0,0.0,0.0,0.0
619,ZOUNDREAM S.L.,ZOUNDREAM,0.0,1,6.0,1,1,1,0,0,...,0.0,0.0,0.0,0.0,0.14802,0.0,0.0,0.0,0.0,0.0
620,SILT DIGITAL ID SL,SILT,0.0,1,5.0,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0
621,"CONSTRUMARKET DIGITAL, S.L.",CONKAU,0.0,1,2.0,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0


In [30]:
# Display the merged spaCy-vector table (the notebook shows a truncated preview)
DF_W2V

Unnamed: 0,Name,Startup,Investment,Num_Inversio,Founded,es_sociedad_limitada,Ind Business Services & Software,Ind Healthtech,Ind Financing,Ind E-commerce & Marketplaces,...,290,291,292,293,294,295,296,297,298,299
0,TRAVELPERK SL,TRAVELPERK,190.0,8,10.0,1,0,0,0,0,...,-0.280291,0.126264,-0.047731,0.020952,-0.034946,0.031820,0.005075,-0.107811,0.073892,0.019900
1,TRAVELPERK SL,TRAVELPERK,95.0,7,10.0,1,0,0,0,0,...,-0.280291,0.126264,-0.047731,0.020952,-0.034946,0.031820,0.005075,-0.107811,0.073892,0.019900
2,TRAVELPERK SL,TRAVELPERK,241.9,6,10.0,1,0,0,0,0,...,-0.280291,0.126264,-0.047731,0.020952,-0.034946,0.031820,0.005075,-0.107811,0.073892,0.019900
3,TRAVELPERK SL,TRAVELPERK,132.0,5,10.0,1,0,0,0,0,...,-0.280291,0.126264,-0.047731,0.020952,-0.034946,0.031820,0.005075,-0.107811,0.073892,0.019900
4,TRAVELPERK SL,TRAVELPERK,53.0,4,10.0,1,0,0,0,0,...,-0.280291,0.126264,-0.047731,0.020952,-0.034946,0.031820,0.005075,-0.107811,0.073892,0.019900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,ZOUNDREAM S.L.,ZOUNDREAM,0.0,2,6.0,1,1,1,0,0,...,-0.189731,0.098998,-0.084314,-0.002953,0.054943,0.135561,-0.009826,-0.118422,0.016086,-0.007513
619,ZOUNDREAM S.L.,ZOUNDREAM,0.0,1,6.0,1,1,1,0,0,...,-0.189731,0.098998,-0.084314,-0.002953,0.054943,0.135561,-0.009826,-0.118422,0.016086,-0.007513
620,SILT DIGITAL ID SL,SILT,0.0,1,5.0,1,1,0,0,0,...,-0.283296,-0.023505,-0.003614,0.085212,0.129370,-0.147794,0.006421,0.080255,-0.102202,0.334644
621,"CONSTRUMARKET DIGITAL, S.L.",CONKAU,0.0,1,2.0,1,1,0,0,0,...,-0.449350,0.111507,0.035345,-0.008242,-0.194957,-0.024241,-0.036598,-0.140943,0.106802,0.044366


In [None]:
# Display the merged BERT-embedding table
DF_TBert

In [32]:
# Persist the three feature tables, one pickle file per embedding method.
DF_tfidf.to_pickle("data/dades_TF-idf.pkl")
DF_W2V.to_pickle("data/dades_W2V.pkl")
DF_TBert.to_pickle("data/dades_TBert.pkl")

ACM (Anàlisi de Correspondències Múltiples)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import prince

# Requires the DataFrame df with its 'Description' and 'Name' columns.

# 1. Encode each description as binary word indicators (presence/absence).
#    Note that this rebinds `vectorizer`, previously the TF-IDF vectorizer.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(df['Description'].astype(str))

# Dense DataFrame for convenience, one column per vocabulary word
X_df = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())

# 2. Fit the MCA with prince (10 components, fixed seed)
mca = prince.MCA(
    n_components=10,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='sklearn',
    random_state=42,
).fit(X_df)

# 3. Coordinates of the words (the columns of X_df) in the MCA space
word_coords = mca.column_coordinates(X_df)

# 4. Sentence vector = mean of the MCA coordinates of the words it contains
def sentence_vector(sentence, coords=None, tokenize=None):
    """Average the MCA word coordinates of the tokens found in ``sentence``.

    This definition rebinds the name ``sentence_vector``, which an earlier
    cell uses for the spaCy-based helper; after this cell runs, the name
    refers to the MCA version.

    Parameters
    ----------
    sentence : str
        Text to embed.
    coords : pandas.DataFrame, optional
        Table indexed by word label with one column per component. Defaults
        to the module-level ``word_coords``.
    tokenize : callable, optional
        Function mapping a string to a list of tokens. Defaults to the
        analyzer of the module-level ``vectorizer`` so the sentence is split
        with the same rules that produced the vocabulary. The previous
        ``sentence.lower().split()`` kept punctuation attached to words
        (e.g. "data,"), so those words never matched a vocabulary entry.

    Returns
    -------
    numpy.ndarray or list
        Per-component mean over the recognised tokens (a token that appears
        several times is counted each time), or a list of zeros with one entry
        per component when no token is recognised.

    NOTE(review): tokens are matched against ``coords.index`` as-is. Confirm
    that the labels produced by ``mca.column_coordinates`` are the raw
    vocabulary words; if prince adds a one-hot suffix to them, no token will
    match and every sentence will map to the zero vector.
    """
    if coords is None:
        coords = word_coords
    if tokenize is None:
        tokenize = vectorizer.build_analyzer()

    known_tokens = [token for token in tokenize(sentence) if token in coords.index]
    if not known_tokens:
        return [0] * coords.shape[1]
    return coords.loc[known_tokens].mean(axis=0).values

# 5. Compute the MCA sentence vector of every description (cast to str first)
sentence_vectors = list(df['Description'].astype(str).apply(sentence_vector))

# 6. Collect the results: one row per description, Name in front as the key
sentence_vectors_df = pd.DataFrame(sentence_vectors)
sentence_vectors_df.insert(loc=0, column='Name', value=df['Name'].values)

In [None]:
# Display the MCA-based sentence vectors
sentence_vectors_df