In [7]:
import hsfs
from pyspark.sql import functions as F

# Open a connection to the feature store through the hsfs client. No arguments are
# given, so host and credentials are presumably picked up from the surrounding
# cluster environment (TODO confirm).
connection = hsfs.connection()
# No name is passed, so this should resolve to the project's default feature store
# (verify against the hsfs docs). `fs` is the handle used further down to create the
# raw feature groups.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [16]:
# Load the three raw CSV exports from the project's HDFS RawData directory.

# Books: the double quote is configured as both the quote and the escape character.
# No schema inference is requested for this file.
books = (
    spark.read.format('csv')
    .options(header='true', quote="\"", escape="\"")
    .load('hdfs:///Projects/book_recommendation/RawData/Books.csv')
)

# Users: header row only, no schema inference.
users = (
    spark.read.format('csv')
    .options(header='true')
    .load('hdfs:///Projects/book_recommendation/RawData/Users.csv')
)

# Ratings: header row, and column types are inferred by Spark while reading.
ratings = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load('hdfs:///Projects/book_recommendation/RawData/Ratings.csv')
)

In [17]:
# Books feature engineering
# The three cover-image URL columns are not needed for this use case, so remove them.
image_url_columns = ['Image-URL-S', 'Image-URL-M', 'Image-URL-L']
books = books.drop(*image_url_columns)

In [18]:
# Data cleaning
# Some rows have wrongly aligned columns in the source CSV, so their fields are
# overwritten with the hard-coded values below.
# Mapping: ISBN of the broken row -> {column name: corrected value}.
# Year values are given as ints while the column was read without schema inference;
# Spark reconciles the two types itself, and a later cell casts the column to int.
book_corrections = {
    "078946697X": {
        "Book-Title": "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)",
        "Book-Author": "Michael Teitelbaum",
        "Year-Of-Publication": 2000,
        "Publisher": "DK Publishing Inc",
    },
    "0789466953": {
        "Book-Title": "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)",
        "Book-Author": "James Buckley",
        "Year-Of-Publication": 2000,
        "Publisher": "DK Publishing Inc",
    },
    "2070426769": {
        "Book-Title": "Peuple du ciel, suivi de 'Les Bergers",
        # The author name used to be hard-coded as double-encoded mojibake
        # ("ClÃ?Â©zio"); it is now the properly encoded "Clézio".
        # NOTE(review): if other rows of the dataset still carry the garbled
        # spelling, this row will no longer group with them by author.
        "Book-Author": "Jean-Marie Gustave Le Clézio",
        "Year-Of-Publication": 2003,
        "Publisher": "Gallimard",
    },
}

# Apply every correction. Only the row with the matching ISBN is changed; all other
# rows keep their current value through `otherwise`. The ISBN conditions are mutually
# exclusive, so the order in which the corrections are applied does not matter.
for bad_isbn, column_fixes in book_corrections.items():
    for column_name, fixed_value in column_fixes.items():
        books = books.withColumn(
            column_name,
            F.when(F.col("ISBN") == bad_isbn, fixed_value).otherwise(F.col(column_name)),
        )

In [19]:
# Replace missing publishers with the placeholder value 'other';
# rows that already have a publisher are left unchanged.
publisher_col = F.col('Publisher')
books = books.withColumn(
    'Publisher',
    F.when(publisher_col.isNull(), 'other').otherwise(publisher_col),
)

In [20]:
# Normalise Year-Of-Publication to an integer column.
# NOTE(review): values that are not valid integers presumably become null under
# Spark's default cast behaviour -- confirm if those rows matter downstream.
year_column = 'Year-Of-Publication'
books = books.withColumn(year_column, F.col(year_column).cast('int'))

In [21]:
# Prepare the books columns for the raw feature group: every column is re-selected
# under a lower-cased name with '-' replaced by '_' (e.g. `Book-Title` -> `book_title`).
books_aliases = [
    "`{}` as `{}`".format(original, original.lower().replace('-', '_'))
    for original in books.columns
]
books = books.selectExpr(books_aliases)

In [23]:
# Extra Hudi writer settings handed to the feature-group save calls.
# The three shuffle-parallelism settings (bulk insert, insert, upsert) share the same
# value; every value is a string.
extra_hudi_options = dict.fromkeys(
    (
        "hoodie.bulkinsert.shuffle.parallelism",
        "hoodie.insert.shuffle.parallelism",
        "hoodie.upsert.shuffle.parallelism",
    ),
    "5",
)
extra_hudi_options["hoodie.parquet.compression.ratio"] = "0.5"

In [22]:
# Declare the raw books feature group (version 1, offline only); a row is
# identified by its isbn.
books_meta = fs.create_feature_group("books_raw",
                                version=1,
                                description="Books raw data for feature engineering",
                                online_enabled=False,
                                primary_key=["isbn"])
# Write the cleaned books DataFrame to the feature group. The Hudi write options
# defined in the preceding cell are passed here too, so that all three raw feature
# groups (books_raw, ratings_raw, users_raw) are written with the same settings.
books_meta.save(books, extra_hudi_options)

In [25]:
# Re-select every ratings column under a lower-cased name with '-' replaced by '_'
# (e.g. `User-ID` -> `user_id`).
ratings_aliases = [
    "`{}` as `{}`".format(original, original.lower().replace('-', '_'))
    for original in ratings.columns
]
ratings = ratings.selectExpr(ratings_aliases)

In [27]:
# Declare the raw ratings feature group (version 1, offline only); a row is
# identified by the (isbn, user_id) pair.
ratings_meta = fs.create_feature_group(
    "ratings_raw",
    version=1,
    description="Ratings raw data for feature engineering",
    primary_key=["isbn", "user_id"],
    online_enabled=False,
)

# Write the ratings DataFrame to the feature group using the custom Hudi options.
ratings_meta.save(ratings, extra_hudi_options)

In [29]:
# Re-select every users column under a lower-cased name with '-' replaced by '_'
# (e.g. `User-ID` -> `user_id`).
users_aliases = [
    "`{}` as `{}`".format(original, original.lower().replace('-', '_'))
    for original in users.columns
]
users = users.selectExpr(users_aliases)

In [34]:
# Keep only users that have an id; user_id is the primary key of the users_raw
# feature group created below, so rows without it are discarded.
users = users.where(F.col('user_id').isNotNull())

In [36]:
# Declare the raw users feature group (version 1, offline only); a row is
# identified by its user_id.
users_meta = fs.create_feature_group(
    "users_raw",
    version=1,
    description="Users raw data for feature engineering",
    primary_key=["user_id"],
    online_enabled=False,
)

# Write the users DataFrame to the feature group using the custom Hudi options.
users_meta.save(users, extra_hudi_options)