In [31]:
import hsfs

from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import split
from pyspark.ml.functions import vector_to_array

# Open a feature store connection. No host, project or credentials are passed,
# so hsfs resolves them on its own (presumably from the surrounding
# environment -- confirm for your deployment).
# NOTE(review): no `connection.close()` call is visible in this section.
connection = hsfs.connection()
# Handle used by the cells below to read and create feature groups.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [7]:
# Read version 1 of the "books_raw" feature group, keeping only the two
# features this notebook uses: the `isbn` key and the `book_title` text.
books_raw = (
    fs.get_feature_group("books_raw", version=1)
    .select(['isbn', 'book_title'])
    .read()
)

In [15]:
# Overwrite `book_title` with its tokenised form so it can serve as the
# Word2Vec input column. The separator is a single space, so punctuation
# stays attached to the neighbouring word (e.g. "Trouble," is one token).
book_title_w2c_input = books_raw.withColumn(
    colName='book_title',
    col=split('book_title', ' '),
)

In [18]:
# Word2Vec estimator over the tokenised titles: 100-dimensional vectors and
# 20 training iterations. The output column is named "model"; a later cell
# renames that column to `title_embeddings`.
# The seed is pinned so that re-fitting on the same input is repeatable
# instead of producing different embeddings on every run.
word2Vec = Word2Vec(
    vectorSize=100,
    maxIter=20,
    seed=42,
    inputCol="book_title",
    outputCol="model",
)

In [19]:
# Train the Word2Vec model on the tokenised book titles.
model = word2Vec.fit(dataset=book_title_w2c_input)

In [20]:
# Sanity check: print the first 20 (word, vector) pairs of the learned vocabulary.
model.getVectors().show(n=20)

+----------+--------------------+
|      word|              vector|
+----------+--------------------+
|  Unspoken|[-0.3368006348609...|
| (Precious|[0.48832708597183...|
|    Talent|[0.09526077657938...|
| Hourglass|[0.16151781380176...|
|   Priests|[-0.1106925979256...|
|     Aloft|[0.17404748499393...|
| Religion:|[-0.3366450071334...|
| Good-Bye,|[-0.4512170255184...|
|  Messiahs|[0.00796517822891...|
|  Trouble,|[-0.2906562387943...|
|   Affaire|[-0.3789294064044...|
|    Corps,|[-0.1641155779361...|
|      Vile|[-0.6809298396110...|
|   Heaven)|[0.20334143936634...|
|      9/11|[0.36479723453521...|
|     Ages,|[-0.1385688781738...|
|   Motives|[0.12206297367811...|
| Thriller)|[-0.4521277248859...|
|    Fierce|[-0.8244647979736...|
|Delectable|[-0.1563138663768...|
+----------+--------------------+
only showing top 20 rows

In [21]:
# Apply the fitted model to the tokenised titles; the result carries the
# model's output column ("model") alongside the input columns.
title_embedding = model.transform(dataset=book_title_w2c_input)

In [32]:
# Shape the model output into the frame that gets written to the feature store:
#   1. rename the Word2Vec output column "model" to `title_embeddings`,
#   2. convert it from an ML vector to a plain array column,
#   3. drop the tokenised title, leaving the `isbn` key and the embedding.
#
# The chain starts from `model.transform(...)` rather than from the previous
# value of `title_embedding`, so this cell is idempotent: re-running it does not
# feed an already-converted array column back into `vector_to_array`.
title_embedding = (
    model.transform(book_title_w2c_input)
    .withColumnRenamed('model', 'title_embeddings')
    .withColumn('title_embeddings', vector_to_array('title_embeddings'))
    .drop('book_title')
)

In [26]:
# Preview the first 10 rows of the frame that will be saved.
title_embedding.show(n=10)

+----------+--------------------+
|      isbn|    title_embeddings|
+----------+--------------------+
|0962962058|[0.02132615759037...|
|0962962880|[-0.1588221978810...|
|0962962899|[0.01749602332711...|
|0962964506|[-0.0355386535326...|
|0962968307|[0.10151556879281...|
|0962975419|[-0.0744516259680...|
|0962976806|[0.03971425609456...|
|0962976830|[0.03971425609456...|
|0962980307|[-0.0936847247183...|
|0962980323|[-0.0688666626811...|
+----------+--------------------+
only showing top 10 rows

In [33]:
# Extra write options passed to the feature group `save` call below.
# Keys are Hudi ("hoodie.*") setting names; every value is given as a string.
# The three shuffle-parallelism settings all use "5".
extra_hudi_options = dict([
    ("hoodie.bulkinsert.shuffle.parallelism", "5"),
    ("hoodie.insert.shuffle.parallelism", "5"),
    ("hoodie.upsert.shuffle.parallelism", "5"),
    ("hoodie.parquet.compression.ratio", "0.5"),
])

In [34]:
# Define the "title_embeddings" feature group (version 1), keyed by `isbn`.
# It is not online-enabled, and `statistics_config=False` is passed
# (presumably turning statistics computation off -- confirm against the hsfs docs).
title_embeddings_meta = fs.create_feature_group(
    "title_embeddings",
    version=1,
    description="Embeddings for book titles",
    online_enabled=False,
    primary_key=["isbn"],
    statistics_config=False,
)

# Persist the embeddings frame, passing the extra Hudi options as the second argument.
title_embeddings_meta.save(title_embedding, extra_hudi_options)