# CS-GY 6513 Final Project - Book Recommendation

## Preliminary

In [1]:

import torch
import pyspark.sql.functions as F

from collections import Counter
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.feature import Tokenizer, Word2Vec, StringIndexer
from pyspark.sql.types import ArrayType, FloatType, StringType, StructType, StructField, DateType

### Constants

In [2]:
APP_NAME = "BooksDataWordEmbedding"
BOOKS_DATA_FILE_PATH = "./data/books_data.csv"
BOOK_RATING_FILE_PATH = "./data/Books_rating.csv"
COLUMNS_TO_EMBED = ["Title", "description", "authors"]
TOKENS_COLUMN_SUFFIX = "_tokens"
EMBEDDING_COLUMN_SUFFIX = "_embeddings"

RAW_DATA_SCHEMA = StructType(
    [
        StructField("Title", StringType(), False),
        StructField("description", StringType(), True),
        StructField("authors", StringType(), True),
        StructField(
            "image",
            StringType(),
            True,
        ),
        StructField("previewLink", StringType(), True),
        StructField("publisher", StringType(), True),
        StructField("publishedDate", DateType(), True),
        StructField("infoLink", StringType(), True),
        StructField("categories", StringType(), True),
        StructField("ratingsCount", FloatType(), True),
    ]
)
RAW_DATA_COLUMNS_TO_DROP = ["image", "previewLink", "publishedDate", "infoLink", "ratingsCount"]
RAW_DATA_STRING_ARRAY_FIELDS = ["authors", "categories"]

COLUMNS_TO_USE_FOR_CATEGORY_FILLING = {
    "Title_embeddings": 1,
    "description_embeddings": 0.5,
}

### Useful Methods

In [3]:
def show_dataframe(
    dataframe: DataFrame, num_rows_to_show: int = 10, prefix: str = "", suffix: str = ""
) -> None:
    prefix = prefix + ":\n" if prefix else prefix
    suffix = suffix + "\n" if suffix else suffix
    print(f"------------\n{prefix}")
    dataframe.show(num_rows_to_show)
    print(f"{suffix}------------\n")

### Spark Config

In [4]:
spark = (
    SparkSession.builder.appName(APP_NAME)
    .config("spark.driver.extraJavaOptions", "-XX:ReservedCodeCacheSize=2048m")
    .config("spark.executor.extraJavaOptions", "-XX:ReservedCodeCacheSize=2048m")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/15 11:26:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read Data & Null Handling

In [65]:
data = spark.read.csv(BOOKS_DATA_FILE_PATH, header=True, schema=RAW_DATA_SCHEMA)
for column in COLUMNS_TO_EMBED:
    data = data.withColumn(column, F.when(F.col(column).isNull(), "N/A").otherwise(F.col(column)))

for column in RAW_DATA_STRING_ARRAY_FIELDS:
    data = data.withColumn(
        column, F.regexp_replace(F.regexp_replace(data[column], r"[\[\]'\s]", ""), r",", " ")
    )

for column in RAW_DATA_COLUMNS_TO_DROP:
    data = data.drop(F.col(column))

show_dataframe(data, prefix="Books Data")

------------
Books Data:

+--------------------+--------------------+------------------+--------------------+--------------------+
|               Title|         description|           authors|           publisher|          categories|
+--------------------+--------------------+------------------+--------------------+--------------------+
|Its Only Art If I...|                 N/A|       JulieStrain|                NULL|Comics&GraphicNovels|
|Dr. Seuss: Americ...|Philip Nel takes ...|         PhilipNel|           A&C Black|Biography&Autobio...|
|Wonderful Worship...|This resource inc...|        DavidR.Ray|                NULL|            Religion|
|Whispers of the W...|Julia Thomas find...|    VeronicaHaddon|           iUniverse|             Fiction|
|Nation Dance: Rel...|                 N/A|        EdwardLong|                NULL|                NULL|
|The Church of Chr...|In The Church of ...|   EverettFerguson|Wm. B. Eerdmans P...|            Religion|
|The Overbury affa...|       

## Process Data

In [59]:
tokenizers = [
    Tokenizer(inputCol=column, outputCol=f"{column}{TOKENS_COLUMN_SUFFIX}")
    for column in COLUMNS_TO_EMBED
]

word2Vecs = [
    Word2Vec(
        vectorSize=100,
        minCount=0,
        inputCol=f"{column}{TOKENS_COLUMN_SUFFIX}",
        outputCol=f"{column}{EMBEDDING_COLUMN_SUFFIX}",
    )
    for column in COLUMNS_TO_EMBED
]

stages = tokenizers + word2Vecs
word_embedding_pipeline = Pipeline(stages=stages)

model = word_embedding_pipeline.fit(data)
result = model.transform(data)

show_dataframe(result, prefix="Word embedding result")

                                                                                

------------
Word embedding result:

+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+
|               Title|         description|           authors|               image|         previewLink|           publisher|publishedDate|            infoLink|          categories|ratingsCount|        Title_tokens|  description_tokens|      authors_tokens|    Title_embeddings|description_embeddings|  authors_embeddings|
+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------

### Save as JSON

**Drop Un-necessary Columns**

In [64]:
result.write.json("./result/books_data_embedding")

                                                                                

## Missing Book Category Label Filling - Cosine Similarity

### Preparation

**Read in Data**

In [5]:
df = spark.read.json("./result/books_data_embedding/part*.json")

                                                                                

In [6]:
show_dataframe(df)

------------

+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+
|               Title|    Title_embeddings|        Title_tokens|           authors|  authors_embeddings|      authors_tokens|          categories|         description|description_embeddings|  description_tokens|           publisher|
+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+
|Its Only Art If I...|{1, [0.0450182360...|[its, only, art, ...|       JulieStrain|{1, [-0.003502663...|       [juliestrain]|Comics&GraphicNovels|                 N/A|  {NULL, NULL, 1, [...|               [n/a]|                NULL|
|Dr. Seuss: Americ...|{1, [0.0626922780...|[dr., seuss

**Partition Rows based on if Categories Column is Missing**

In [7]:
non_empty_df = df.filter(F.col("categories").isNotNull())
empty_df = df.filter(F.col("categories").isNull())

**Extract the Embeddings**

In [8]:
def extract_embeddings(embedding):
    if embedding and "values" in embedding:
        return embedding["values"]
    return []


In [9]:
extract_embeddings_udf = F.udf(extract_embeddings, ArrayType(FloatType()))


for column in COLUMNS_TO_USE_FOR_CATEGORY_FILLING.keys():
    non_empty_df = non_empty_df.withColumn(column, extract_embeddings_udf(F.col(column)))
    empty_df = empty_df.withColumn(column, extract_embeddings_udf(F.col(column)))

**Convert to Torch Tensor**

In [30]:
reference_data = non_empty_df.select(
    *list(COLUMNS_TO_USE_FOR_CATEGORY_FILLING.keys()), "categories"
)
for column in COLUMNS_TO_USE_FOR_CATEGORY_FILLING:
    reference_data = reference_data.filter(F.size(F.col(column)) > 0)
reference_data = reference_data.collect()

title_reference_embeddings = torch.tensor(
    [row["Title_embeddings"] for row in reference_data],
    dtype=torch.float32,
)
description_reference_embeddings = title_reference_embeddings = torch.tensor(
    [row["description_embeddings"] for row in reference_data],
    dtype=torch.float32,
)

reference_categories = [row["categories"] for row in reference_data]

                                                                                

### Calculate Cosine Similarity and Fill the Empty Categories

In [31]:
def predict_category(
    target_row,
    title_reference_embeddings,
    description_reference_embeddings,
    reference_categories,
    columns_weight_map: dict,
):
    combined_similarities = None
    for column, weight in columns_weight_map.items():
        target_embedding = torch.tensor(target_row[column], dtype=torch.float32).unsqueeze(0)
        similarities = torch.nn.functional.cosine_similarity(
            target_embedding,
            (
                title_reference_embeddings
                if column == "Title_embeddings"
                else description_reference_embeddings
            ),
            dim=1,
        )

        if combined_similarities is None:
            combined_similarities = similarities * weight
        else:
            combined_similarities += similarities * weight

    top_k_indices = torch.topk(combined_similarities, k=10).indices.numpy()
    top_categories = []
    for i in top_k_indices:
        top_categories.extend(reference_categories[i].split(" "))

    most_common_category = Counter(top_categories).most_common(1)[0][0]
    return most_common_category


predict_category_udf = F.udf(
    lambda x: predict_category(
        x,
        title_reference_embeddings,
        description_reference_embeddings,
        reference_categories,
        COLUMNS_TO_USE_FOR_CATEGORY_FILLING,
    ),
    StringType(),
)


filled_empty_df = empty_df.withColumn(
    "predicted_category",
    predict_category_udf(F.struct(*COLUMNS_TO_USE_FOR_CATEGORY_FILLING.keys())),
)

In [35]:
updated_df = df.join(
    filled_empty_df.select("Title", F.col("predicted_category").alias("predicted_category")),
    on="Title",
    how="left_outer"
).withColumn(
    "categories",
    F.when(F.col("predicted_category").isNotNull(), F.col("predicted_category"))
    .otherwise(F.col("categories"))
).drop("predicted_category")

updated_df.cache()

DataFrame[Title: string, Title_embeddings: struct<type:bigint,values:array<double>>, Title_tokens: array<string>, authors: string, authors_embeddings: struct<type:bigint,values:array<double>>, authors_tokens: array<string>, categories: string, description: string, description_embeddings: struct<indices:array<string>,size:bigint,type:bigint,values:array<double>>, description_tokens: array<string>, publisher: string]

[Stage 17:> (0 + 10) / 13][Stage 18:>  (0 + 0) / 13][Stage 19:>  (0 + 0) / 13]

In [36]:
empty_categories_count = updated_df.filter(F.col("categories").isNull()).count()

if empty_categories_count > 0:
    print(f"There are {empty_categories_count} rows with empty categories.")
else:
    print("All rows have non-empty categories.")



All rows have non-empty categories.


                                                                                

In [37]:
show_dataframe(updated_df, prefix="After filling categories")

------------
After filling categories:

+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+--------------------+--------------------+----------------------+--------------------+--------------------+
|               Title|    Title_embeddings|        Title_tokens|        authors|  authors_embeddings|   authors_tokens|          categories|         description|description_embeddings|  description_tokens|           publisher|
+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+--------------------+--------------------+----------------------+--------------------+--------------------+
|Isaac Asimov: Mas...|{1, [-0.044200854...|[isaac, asimov:, ...|    KarenJudson|{1, [6.5018417080...|    [karenjudson]|  JuvenileNonfiction|Details the life ...|  {NULL, NULL, 1, [...|[details, the, li...|Enslow Pub Incorp...|
|     White Rock Ways|{1, [-0.030555841...| [white, 

In [38]:
output_path = "./result/books_data_categories_filled"
columns_to_drop = ["Title_embeddings", "description_embeddings", "authors_embeddings"]
cleaned_df = updated_df.drop(*columns_to_drop)


cleaned_df.write.option("header", True).mode("overwrite").json(output_path)

print(f"DataFrame has been written to {output_path} as JSON without specified columns.")

                                                                                

DataFrame has been written to ./result/books_data_categories_filled as JSON without specified columns.


**Read in the Data to Verify Output Result and for Later Use**

In [39]:
df_category_full = spark.read.json("./result/books_data_categories_filled/part*.json").select(*["Title", "categories"])
show_dataframe(df_category_full)

------------

+--------------------+--------------------+
|               Title|          categories|
+--------------------+--------------------+
|     Behind the Moon|    Adventurestories|
|Living in Chaos i...|    Body Mind&Spirit|
|CHEMISTRY IN AGRI...|Agriculturalchemi...|
|Improving Governa...|    PoliticalScience|
|         Study Guide|  Business&Economics|
|Something more (T...|       Christianlife|
|Woman at the wind...|             Fiction|
|Straight from the...|      Health&Fitness|
|Mr. Cheap's Atlan...|              Travel|
|Hermsprong Or Man...|        Biblestories|
+--------------------+--------------------+
only showing top 10 rows

------------



# Collaborative Filtering

## Read Data and Sparse Handling

In [6]:
data = spark.read.csv(BOOK_RATING_FILE_PATH, header=True, inferSchema=True)

show_dataframe(data, prefix="Books Rating")



------------
Books Rating:

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|"Jim of Oz ""jim-...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|0826414346|Dr. Seuss: Americ...| NULL|A30TK6U7DNS82R|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|0826414346|Dr. Seuss: Americ...| NULL|A3UH4UZ4RSVO82|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|"If people become...|


                                                                                

In [7]:
print("Original Data Size", data.count())

count_df = data.groupBy("User_id").agg(F.count("*").alias("record_count"))
# keep user with record number >= 10
filtered_df = count_df.filter(F.col("record_count") >= 10)
data = data.join(filtered_df, on="User_id", how="inner")

print("Filtered Data Size", data.count())

Original Data Size 3000000




CodeCache: size=2097152Kb used=39299Kb max_used=39304Kb free=2057852Kb
 bounds [0x0000000300000000, 0x0000000302690000, 0x0000000380000000]
 total_blobs=12844 nmethods=11894 adapters=861
 compilation: disabled (not enough contiguous free space left)


[Stage 12:>                                                       (0 + 10) / 11]

Filtered Data Size 819551


                                                                                

In [8]:
# join with the books_data to get the book category
data = data.join(df_category_full, on="Title", how="inner")


print("Data Size after Adding Book Categories", data.count())



Data Size after Adding Book Categories 816451


                                                                                

## Generate UserId and BookId

In [9]:
user_indexer = StringIndexer(inputCol="User_id", outputCol="UserId")
data = user_indexer.fit(data).transform(data)

book_indexer = StringIndexer(inputCol="Id", outputCol="BookId")
data = book_indexer.fit(data).transform(data)

data = data \
        .withColumn("UserId", F.col("UserId").cast("int")) \
        .withColumn("BookId", F.col("BookId").cast("int")) \
        .withColumn("rating", F.col("review/score").cast("float"))

# Remove nulls and NaNs
data = data.dropna(subset=["rating"])
data = data.filter(F.col("rating").isNotNull())

show_dataframe(data, prefix="Books Rating")

                                                                                

------------
Books Rating:



24/12/15 10:27:24 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB


+--------------------+--------------+----------+-----+--------------------+------------------+------------+-----------+--------------------+--------------------+------------+------------------+------+------+------+
|               Title|       User_id|        Id|Price|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|record_count|        categories|UserId|BookId|rating|
+--------------------+--------------+----------+-----+--------------------+------------------+------------+-----------+--------------------+--------------------+------------+------------------+------+------+------+
|'AN ESSAY CONCERN...| AHD101501WCN1|B0006DBRS0| NULL|"Shalom Freedman ...|               5/6|         5.0| 1106092800|One of the major ...|It has been many ...|        1994|Knowledge Theoryof|     3| 18089|   5.0|
|'AN ESSAY CONCERN...| AK81WLVD5KGUX|B0006DBRS0| NULL|"John S. Ryan ""S...|             50/52|         5.0|  933552000|A highly readable...|

                                                                                

## Split training and test set

In [10]:
(training, test) = data.randomSplit([0.8, 0.2], seed=42)
print("Training Data Size", training.count())
print("Test Data Size", test.count())

24/12/15 10:27:36 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB
                                                                                

Training Data Size 646021


24/12/15 10:27:50 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB

Test Data Size 161504


                                                                                

## Train

In [11]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [12]:
spark.sparkContext.setLogLevel("ERROR")

In [13]:
als = ALS(
    userCol="UserId",          # Numeric User ID column
    itemCol="BookId",          # Numeric Book ID column
    ratingCol="rating",        # Rating column
    implicitPrefs=False,       # Explicit feedback (ratings)
    coldStartStrategy="drop",  # Drop predictions with NaN
    rank=100,                  # since the dataset is sparse, start with higher latent vec dimensionality
    regParam=0.1,
    maxIter=10,
)

model = als.fit(training)

                                                                                

In [24]:
predictions = (
    model.transform(test)
    .withColumn(
        "prediction",
        F.when(F.col("prediction") < 0, 0.0)
        .when(F.col("prediction") > 5, 5.0)
        .otherwise(F.col("prediction")),
    )
)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")



Root Mean Squared Error (RMSE): 0.8175549197119781


                                                                                

## Hybrid with Category Filtering

### Find User's Top Book Categories

In [15]:
data = data.withColumn("category", F.explode(F.split(data["categories"], " ")))
user_category_pref = data.groupBy("UserId", "category").agg(F.avg("rating").alias("avg_rating"))
user_top_categories = user_category_pref.filter(user_category_pref["avg_rating"] > 3)
user_top_categories_agg = user_top_categories.groupBy("UserId").agg(F.collect_list("category").alias("top_categories"))
show_dataframe(user_top_categories_agg)

------------





+------+--------------------+
|UserId|      top_categories|
+------+--------------------+
|    12|[Art, Arcticregio...|
|    22|[Mathematics, Tra...|
|    26|[Cooking, Governe...|
|    27|[Audioequipmentin...|
|    28|[British, William...|
|    31|[LiteraryCriticis...|
|    34|[PerformingArts, ...|
|    44|[History, Biograp...|
|    47|[Pets, Oceania, C...|
|    53|[Religion, POETRY...|
+------+--------------------+
only showing top 10 rows

------------



                                                                                

### Generate the Recommendations and Filter based on User's Top Book Categories

In [16]:
user_recommendations = model.recommendForAllUsers(10)
exploded_user_recommendations = user_recommendations.withColumn(
    "recommendation", F.explode(F.col("recommendations"))
)
exploded_user_recommendations = exploded_user_recommendations.select(
    F.col("UserId"),
    F.col("recommendation.BookId").alias("BookId"),
    F.col("recommendation.rating").alias("PredictedRating"),
)

book_categories = data.select(F.col("BookId"), F.col("categories")).withColumn(
    "categories_array", F.split(F.col("categories"), " ")
)

recommendations_with_categories = exploded_user_recommendations.join(
    book_categories, on="BookId", how="inner"
)

recommendations_hybrid = recommendations_with_categories.join(
    user_top_categories_agg, on="UserId", how="inner"
).filter(F.array_intersect(F.col("categories_array"), F.col("top_categories")).isNotNull())

# re-group the recommendations
recommendations_hybrid = recommendations_hybrid.groupBy("UserId").agg(
    F.collect_list(F.struct("BookId", "PredictedRating")).alias("recommendations")
)

recommendations_hybrid.show(truncate=False)



+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|UserId|recommendations                                                                                                                                                                                                

                                                                                

In [17]:
book_recommendations = model.recommendForAllItems(10)
book_recommendations.show(truncate=False)



+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|BookId|recommendations                                                                                                                                                                                    |
+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28    |[{576, 5.011198}, {4732, 4.909902}, {2306, 4.908149}, {24734, 4.908126}, {24818, 4.9033427}, {25053, 4.9001594}, {23204, 4.8997374}, {21020, 4.899671}, {3813, 4.8979025}, {4803, 4.897622}]       |
|31    |[{576, 5.004006}, {4732, 4.916149}, {24734, 4.908223}, {25053, 4.9061503}, {9503, 4.9059367}, {24818, 4.905873}, {23204, 4.9046307}, {4803, 4.903374}, {21020, 4.9030294}, {

                                                                                

## Convert bookId/UserId to bookTitle/User_id

In [19]:
exploded_user_recommendations = recommendations_hybrid.withColumn(
    "recommendation", F.explode(F.col("recommendations"))
)
exploded_user_recommendations = exploded_user_recommendations.select(
    F.col("UserId"),
    F.col("recommendation.BookId").alias("BookId"),
    F.col("recommendation.PredictedRating").alias("PredictedRating"),
)

In [20]:
# map book id to title
book_id_to_title = data.select(F.col("BookId"), F.col("Title")).distinct()

joined_df = exploded_user_recommendations.join(book_id_to_title, on="BookId", how="left").drop(data["UserId"]).select(
    F.col("UserId"),
    F.col("Title"),
)
joined_df.show()

                                                                                

+------+--------------------+
|UserId|               Title|
+------+--------------------+
|    26|Elephant hunting ...|
|    26|Penny Plain [Hard...|
|    26|Testament: The Bi...|
|    26|Thompson Chain-Re...|
|    26|Teacup Full of Roses|
|    26|Twin Stories: The...|
|    26|Sum & Substance: ...|
|    26|Mary Had a Little...|
|    26|   Hey Diddle Diddle|
|    26|The Fourth Networ...|
|    26|The Fourth Networ...|
|    26|The Fourth Networ...|
|    26|The Fourth Networ...|
|    27|Elephant hunting ...|
|    27|Penny Plain [Hard...|
|    27|Thompson Chain-Re...|
|    27|Teacup Full of Roses|
|    27|The advance of sc...|
|    27|The advance of sc...|
|    27|Mary Had a Little...|
+------+--------------------+
only showing top 20 rows



In [21]:
# map UserId to User_id
userId_to_id = data.select(F.col("UserId"), F.col("User_id")).distinct()

joined_df = joined_df.join(userId_to_id, on="UserId", how="left").select(
    F.col("User_id"),
    F.col("Title"),
)

joined_df.show()

                                                                                

+--------------+--------------------+
|       User_id|               Title|
+--------------+--------------------+
|A1T17LMQABMBN5|Elephant hunting ...|
|A1T17LMQABMBN5|Penny Plain [Hard...|
|A1T17LMQABMBN5|Testament: The Bi...|
|A1T17LMQABMBN5|Thompson Chain-Re...|
|A1T17LMQABMBN5|Teacup Full of Roses|
|A1T17LMQABMBN5|Twin Stories: The...|
|A1T17LMQABMBN5|Sum & Substance: ...|
|A1T17LMQABMBN5|Mary Had a Little...|
|A1T17LMQABMBN5|   Hey Diddle Diddle|
|A1T17LMQABMBN5|The Fourth Networ...|
|A1T17LMQABMBN5|The Fourth Networ...|
|A1T17LMQABMBN5|The Fourth Networ...|
|A1T17LMQABMBN5|The Fourth Networ...|
| AHXAPVSHPJ6OJ|Elephant hunting ...|
| AHXAPVSHPJ6OJ|Penny Plain [Hard...|
| AHXAPVSHPJ6OJ|Thompson Chain-Re...|
| AHXAPVSHPJ6OJ|Teacup Full of Roses|
| AHXAPVSHPJ6OJ|The advance of sc...|
| AHXAPVSHPJ6OJ|The advance of sc...|
| AHXAPVSHPJ6OJ|Mary Had a Little...|
+--------------+--------------------+
only showing top 20 rows



In [22]:
result_df = joined_df.groupBy("User_id").agg(F.collect_list("Title").alias("books"))
show_dataframe(result_df, prefix="Final Result")

------------
Final Result:



[Stage 877:>                                                      (0 + 10) / 10]

+--------------------+--------------------+
|             User_id|               books|
+--------------------+--------------------+
|A0469729ADTHXTW0CPIS|[Elephant hunting...|
|A0919846H34XADJMF99R|[Elephant hunting...|
|      A100NGGXRQF0AQ|[Elephant hunting...|
|      A101DG7P9E26PW|[Elephant hunting...|
|      A102P9UKBY9P75|[Elephant hunting...|
|      A102VPNZTRP1YA|[Elephant hunting...|
|      A1042BIXF6ZMAC|[Elephant hunting...|
|      A105L4AE1HAC4Y|[Elephant hunting...|
|      A1065304SY1HF8|[Elephant hunting...|
|      A106RLZK9HQIFS|[Elephant hunting...|
+--------------------+--------------------+
only showing top 10 rows

------------



                                                                                