In [20]:
import hsfs
import numpy as np
import pandas as pd

from pyspark.sql.functions import col
from sklearn.metrics.pairwise import cosine_similarity

# Open a feature store connection and fetch a feature store handle from it.
# NOTE(review): hsfs.connection() is called without arguments, so host/project/
# credentials presumably come from the surrounding environment — confirm.
# The connection is not closed in any of the cells visible here.
connection = hsfs.connection()
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [2]:
# Look up version 1 of the two feature groups this notebook reads from:
# book attributes and the per-title embedding vectors.
books = fs.get_feature_group('books', version=1)
title_embeddings = fs.get_feature_group('title_embeddings', version=1)

In [19]:
# Build the training frame: every `books` feature except the free-text
# title and publisher, joined with all title-embedding features, read into a
# Spark DataFrame and capped at 10000 rows.
# NOTE(review): limit() is applied without an explicit ordering, so which
# 10000 rows come back is presumably not guaranteed to be stable — confirm.
books_df = (
    books.select_except(['book_title', 'publisher'])
    .join(title_embeddings.select_all())
    .read()
    .limit(10000)
)

In [21]:
# Expand the array column `title_embeddings` into scalar columns e0..e99.
# All columns are added in a single select(): calling withColumn() in a loop
# stacks one projection per call, which the Spark documentation warns can
# generate very large plans (slow analysis, even StackOverflowException).
# NOTE(review): assumes every embedding holds at least 100 entries — confirm.
embedding_names = ["e{}".format(i) for i in range(0, 100)]
books_df = (
    books_df
    # drop() is a no-op for names that are absent, so re-running this cell
    # replaces e0..e99 instead of producing duplicate column names.
    .drop(*embedding_names)
    .select(
        "*",
        *[col('title_embeddings').getItem(i).alias(name)
          for i, name in enumerate(embedding_names)]
    )
)

In [42]:
# The array column is no longer needed once its elements live in e0..e99.
books_df = books_df.drop('title_embeddings')

In [43]:
# Collect the Spark DataFrame into a pandas DataFrame for the scikit-learn step
# (this pulls all rows into the notebook's driver process).
books_pdf = books_df.toPandas()



In [44]:
# Keep the ISBNs aside: they identify each row and are used later to translate
# row positions back into book identifiers.
isbn = books_pdf["isbn"]

In [45]:
# Quick look at the extracted ISBN column.
isbn

0       0006149995
1       0006552447
2       0006645097
3       0026259206
4       0027653668
           ...    
9995    0515119989
9996    0515123854
9997    0515126500
9998    0515130397
9999    0515134023
Name: isbn, Length: 10000, dtype: object

In [46]:
# Remove the identifier column so only numeric features remain for normalization.
# A new frame is bound instead of mutating in place, and errors='ignore' keeps the
# cell safe to re-run: the in-place drop raised KeyError on a second execution
# because the `isbn` column was already gone.
books_pdf = books_pdf.drop(columns='isbn', errors='ignore')

In [47]:
# Standardize every feature column to zero mean and unit (sample) standard deviation.
feature_means = books_pdf.mean()
# A constant column has std == 0 and would become 0/0 = NaN, which
# cosine_similarity rejects; dividing by 1 instead leaves such a column at 0.
feature_stds = books_pdf.std().replace(0, 1)
normalized_df = (books_pdf - feature_means) / feature_stds

In [48]:
# Sanity check: list the dtype of every normalized column.
normalized_df.dtypes

year_of_publication    float64
author_id              float64
num_reviews            float64
is_popular             float64
avg_rating             float64
                        ...   
e95                    float64
e96                    float64
e97                    float64
e98                    float64
e99                    float64
Length: 123, dtype: object

In [61]:
# Pairwise cosine similarity between all rows (books) of the normalized feature
# matrix; the result is a dense square matrix with one row and column per book.
cos_similarity = cosine_similarity(normalized_df)

In [81]:
# Wrap the similarity matrix in a DataFrame; row and column labels default to
# 0..n-1, which lines up with the index of the `isbn` Series.
cos_similarity_pdf = pd.DataFrame(cos_similarity)

In [56]:
# Spot check: ISBN stored under index label 7324.
isbn[7324]

'055321103X'

In [93]:
# For every book, collect its own row label followed by the labels of its
# 5 most similar *other* books.
# The book itself is removed before ranking rather than assumed to come out
# first: when another row has an identical feature vector both score the same,
# and nlargest() keeps whichever occurs first, which could push the book out of
# column 0 (the column that later becomes the `isbn` primary key).
idcs_pdf = pd.DataFrame(
    [
        [i, *cos_similarity_pdf[i].drop(i).nlargest(n=5).index]
        for i in cos_similarity_pdf
    ]
)

In [94]:
# Translate every row label in idcs_pdf into the ISBN stored under that label
# of the `isbn` Series; the bound __getitem__ performs the same lookup as isbn[x].
isbn_rec_pdf = idcs_pdf.applymap(isbn.__getitem__)

In [100]:
# Give the positional columns 0..5 their final names: the book's own ISBN
# followed by its five recommendations.
isbn_rec_pdf.rename(
    columns=dict(enumerate(["isbn", 'rec1', 'rec2', 'rec3', 'rec4', 'rec5'])),
    inplace=True,
)

In [101]:
# Display the final recommendation table (one row per book, five recommendations each).
isbn_rec_pdf

            isbn        rec1        rec2        rec3        rec4        rec5
0     0006149995  0684800802  0029109752  0029087104  1853267805  067187215X
1     0006552447  055321103X  0515107603  0449230007  0812544056  0553579916
2     0006645097  0712657037  0393312070  0446518123  1564780708  0932379257
3     0026259206  0020228724  0671006762  0671017187  0671537288  0029109752
4     0027653668  0684845180  0020228724  0671749838  0671529072  0684826984
...          ...         ...         ...         ...         ...         ...
9995  0515119989  0515122300  0451195043  0373257090  0373253931  0373288778
9996  0515123854  0618249060  0380756218  0843936819  0440237556  0523418655
9997  0515126500  0345389484  1931847010  1582700036  0425034259  0373029500
9998  0515130397  0373520050  0373520034  0373520565  0373520344  0345329201
9999  0515134023  0877854769  0345287401  0330321250  0440236029  0440402565

[10000 rows x 6 columns]

In [104]:
# Register version 1 of the `books_rec` feature group, keyed by ISBN and
# enabled for online serving, then write the recommendation table into it.
# Statistics are switched off and no time-travel format is requested.
books_rec = fs.create_feature_group(
    "books_rec",
    version=1,
    description="Books recommendations",
    primary_key=["isbn"],
    online_enabled=True,
    statistics_config=False,
    time_travel_format=None,
)
books_rec.save(isbn_rec_pdf)