In [56]:
import pandas as pd
import umap
from sklearn.decomposition import PCA
import plotly.express as px
import numpy as np

In [2]:
# Location of the ARCOM CSV export, kept under one name so it is easy to find and change.
ARCOM_CSV_PATH = "/Users/thibaultrolland/Projects/arcom/data/ARCOM.csv"

# Read the CSV into the raw DataFrame.
raw_arcom_df = pd.read_csv(ARCOM_CSV_PATH)

  raw_arcom_df = pd.read_csv("/Users/thibaultrolland/Projects/arcom/data/ARCOM.csv")


In [49]:
# Work on an independent copy: a plain assignment would only alias the raw
# frame, so columns added to `arcom_df` later (the UMAP coordinates) would
# silently appear on `raw_arcom_df` as well.
arcom_df = raw_arcom_df.copy()

In [4]:
def clean_themes(themes):
    """Turn a comma-separated theme string into a list of normalised themes.

    Each theme is stripped of surrounding whitespace and lower-cased. Empty
    entries (for example from a trailing comma, from ``"a,,b"`` or from an
    empty string) are dropped so that the empty string never becomes a theme
    of its own.

    Parameters
    ----------
    themes : object
        The raw cell value. Anything that is not a ``str`` (such as a missing
        value) is treated as "no themes".

    Returns
    -------
    list of str
        The cleaned themes in their original order; ``[]`` when there are none.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if not isinstance(themes, str):
        return []
    cleaned = (theme.strip().lower() for theme in themes.split(','))
    # Skip tokens that are empty once stripped.
    return [theme for theme in cleaned if theme]

# One list of cleaned themes per row of the raw table, taken from the 'Thème' column.
documents = raw_arcom_df['Thème'].apply(clean_themes)

In [5]:
# Gather every distinct theme across all documents, sorted so that each theme
# always ends up at the same position.
vocabulary = sorted({theme for themes in documents for theme in themes})

# Map each theme to its position in the sorted vocabulary.
word_to_index = dict(zip(vocabulary, range(len(vocabulary))))

In [6]:
def one_hot_encode(word, word_to_index, vocab_size):
    """Return the one-hot vector of ``word``.

    The result is a float array of length ``vocab_size`` that is zero
    everywhere except at position ``word_to_index[word]``, where it is 1.
    """
    encoded = np.zeros(vocab_size)
    hot_position = word_to_index[word]
    encoded[hot_position] = 1
    return encoded

# Number of distinct themes, which is also the length of every one-hot vector.
vocab_size = len(vocabulary)

# Precompute the one-hot vector of every theme so it can be looked up by word.
one_hot_vectors = dict(
    (term, one_hot_encode(term, word_to_index, vocab_size))
    for term in vocabulary
)


In [14]:
# Display the number of distinct themes in the vocabulary.
vocab_size

907

In [32]:
# Encode each document as the sum of the one-hot vectors of its themes.
# A document without any theme must still yield a full-length vector:
# `np.sum([], axis=0)` returns the scalar 0.0, which would make this list
# ragged and break the array conversion done by PCA / UMAP further down.
onehot_documents = [
    np.sum([one_hot_vectors[word] for word in document], axis=0)
    if len(document) > 0
    # There is one one-hot vector per vocabulary word, so this is the vector length.
    else np.zeros(len(one_hot_vectors))
    for document in documents
]

# PCA

In [45]:

# After the transpose, rows are themes and columns are documents, so PCA
# yields one 2-D point per theme.
# `random_state` is pinned so the projection is identical on every run when
# scikit-learn picks a randomized SVD solver for this matrix size.
pca = PCA(n_components=2, random_state=42)
pca_embeddings = pca.fit_transform(np.transpose(onehot_documents))

In [46]:
# One row per theme: its two PCA coordinates plus the theme itself for the hover label.
pca_df = pd.DataFrame(pca_embeddings, columns=['PCA1', 'PCA2'])
pca_df['word'] = vocabulary

# Plotting the PCA embeddings using Plotly
fig = px.scatter(pca_df, x='PCA1', y='PCA2', hover_name='word')
fig.update_traces(textposition='top center')

# Share of the total variance captured by each of the two components.
explained_variance = pca.explained_variance_ratio_

fig.update_layout(
    # The projected vectors are theme occurrence counts built from one-hot
    # encodings; no Word2Vec model is involved, so the title must not claim one.
    title='PCA projection of theme occurrence vectors',
    xaxis_title=f'PCA 1 ( {explained_variance[0]*100:.2f}% )',
    yaxis_title=f'PCA 2 ( {explained_variance[1]*100:.2f}% )',
    # Lock both axes to the same scale so distances are not visually distorted.
    xaxis=dict(scaleanchor='y', scaleratio=1),
    yaxis=dict(scaleanchor='x', scaleratio=1),
    showlegend=False
)

fig.show()

# UMAP

In [89]:
# Project every document (row of theme counts) to 2-D with UMAP.
# `random_state` is pinned so the layout is reproducible between runs.
# NOTE(review): per the UMAP documentation a fixed seed disables the
# multi-threaded optimisation; drop it if speed matters more than
# reproducibility.
umap_model = umap.UMAP(n_neighbors=50,
                        min_dist=.0,
                        n_components=2,
                        metric='l1',
                        random_state=42)

emb = umap_model.fit_transform(onehot_documents)


Graph is not fully connected, spectral embedding may not work as expected.



In [90]:
# Store the two UMAP coordinates of every row as new columns of the frame.
arcom_df['UMAP1'], arcom_df['UMAP2'] = emb[:, 0], emb[:, 1]

In [91]:
# Scatter of the UMAP coordinates, coloured by 'Secteur', with product and
# brand shown on hover.
fig = px.scatter(arcom_df, x='UMAP1', y='UMAP2', hover_data=['Produit', 'Marque'], color='Secteur')
fig.update_traces(textposition='top center')

# Use small markers for the point cloud.
fig.update_traces(marker=dict(size=2),
                  selector=dict(mode='markers'))

fig.update_layout(
    # The embedded vectors are per-document theme counts built from one-hot
    # encodings; no Word2Vec model is involved, so the title must not claim one.
    title='UMAP visualization of document theme vectors',
    xaxis_title='UMAP dimension 1',
    yaxis_title='UMAP dimension 2',
    # Lock both axes to the same scale so distances are not visually distorted.
    xaxis=dict(scaleanchor='y', scaleratio=1),
    yaxis=dict(scaleanchor='x', scaleratio=1),
    showlegend=False,
    width=1000,
    height=1000,
)

fig.show()