# Embeddings pre-entrenados

Obtenidos de <https://github.com/dccuchile/spanish-word-embeddings>

### Descarga

In [24]:
!curl -O https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.es.vec

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2474M  100 2474M    0     0  20.5M      0  0:02:00  0:02:00 --:--:-- 21.7M


In [75]:
### Carga de los vectores en formato word2vec

In [44]:
from gensim.models.keyedvectors import KeyedVectors
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import numpy as np
import plotly.express as px
import pandas as pd

In [39]:
# Path of the downloaded vectors file and how many entries to read from it.
wordvectors_file_vec, cantidad = 'wiki.es.vec', 1000

# `limit` makes gensim stop after the first `cantidad` vectors of the file,
# which keeps loading fast and the later plot readable.
wordvectors = KeyedVectors.load_word2vec_format(
    wordvectors_file_vec,
    limit=cantidad,
)

### Generar matriz de vectores

In [43]:
# Build the embedding matrix: one row per loaded word.
# KeyedVectors already keeps every loaded vector as a row of `.vectors`, in the
# same order as its vocabulary, so the words do not need to be looked up one by
# one. This also avoids `.vocab` (removed in gensim 4) and `.word_vec()`
# (deprecated there), while still working on gensim 3.
vecs = np.array(wordvectors.vectors)  # copy, so later cells never alias the model's own array
print(vecs)

[[-0.13075   -0.087659  -0.11427   ... -0.040476  -0.012293   0.042569 ]
 [-0.36446    0.095962  -0.16188   ... -0.14986    0.23584    0.18541  ]
 [-0.05911   -0.083343  -0.093019  ... -0.054064   0.17285    0.16713  ]
 ...
 [ 0.06106   -0.26362   -0.031633  ... -0.15016    0.031026  -0.22508  ]
 [-0.0599     0.17048    0.064899  ... -0.29796   -0.0057715 -0.28856  ]
 [ 0.074406  -0.23172   -0.29461   ...  0.13238    0.12954   -0.087849 ]]


### Reducción de dimensionalidad con PCA

In [45]:
# Project the embeddings onto their first 3 principal components so that they
# can be drawn in a 3D scatter plot.
# random_state is pinned because scikit-learn may pick the randomized SVD
# solver for a matrix of this size, which would otherwise make the coordinates
# vary from one run to the next.
pca = PCA(n_components=3, random_state=0)
X = pca.fit_transform(vecs)
print(X)

[[-0.5649125   0.07263261  0.14641635]
 [ 0.3668897  -0.3649068   0.3264169 ]
 [-0.5278418   0.0710632  -0.17836498]
 ...
 [-0.39774808 -1.0439283  -0.17049581]
 [-0.40417057  0.5751186  -0.5025852 ]
 [-0.1831034  -0.8537696   0.48927182]]


### Normalización (opcional)

In [72]:
# Scale every row of X to unit Euclidean length, so all points lie on the unit
# sphere. keepdims=True keeps the per-row norms as a column, which lets the
# division broadcast row by row without transposing.
# NOTE: despite its name, `norms` holds the normalized vectors (not the norms);
# the name is kept because the visualization cell reads it.
norms = X / np.linalg.norm(X, axis=1, keepdims=True)
print(norms)

[[-0.960603    0.12350781  0.24897306]
 [ 0.5996779  -0.5964369   0.5335255 ]
 [-0.93976045  0.1265197  -0.31755793]
 ...
 [-0.35196763 -0.92377305 -0.15087189]
 [-0.46772388  0.66555244 -0.5816136 ]
 [-0.18293552 -0.8529868   0.48882324]]


### Visualización

In [74]:
# Collect the 3D coordinates and their words in one table for plotting.
df = pd.DataFrame(data=norms, columns=['x','y','z'])

# gensim 4 removed `KeyedVectors.vocab`; the ordered word list is exposed as
# `index_to_key` there and as `index2word` in gensim 3. Both follow the row
# order of the vector matrix, so words stay aligned with their coordinates.
if hasattr(wordvectors, 'index_to_key'):
    df['word'] = list(wordvectors.index_to_key)
else:
    df['word'] = list(wordvectors.index2word)

# Only the first 300 rows are drawn to keep the text labels legible.
fig = px.scatter_3d(df[:300], x="x", y="y", z="z", text="word")
fig.update_traces(marker=dict(size=1), textposition='top center')
fig.show()

<class 'pandas.core.frame.DataFrame'>
