# CS-GY 6513 Final Project - Book Recommendation

## Preliminary

In [1]:
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.feature import Tokenizer, Word2Vec

### Constants

In [2]:
APP_NAME = "BooksDataWordEmbedding"
BOOKS_DATA_FILE_PATH = "./data/books_data.csv"
COLUMNS_TO_EMBED = ["Title", "description", "authors"]
TOKENS_COLUMN_SUFFIX = "_tokens"
EMBEDDING_COLUMN_SUFFIX = "_embeddings"

### Useful Methods

In [3]:
def show_dataframe(
    dataframe: DataFrame, num_rows_to_show: int = 10, prefix: str = "", suffix: str = ""
) -> None:
    prefix = prefix + ":\n" if prefix else prefix
    suffix = suffix + "\n" if suffix else suffix
    print(f"------------\n{prefix}")
    dataframe.show(num_rows_to_show)
    print(f"{suffix}------------\n")

### Spark Config

In [4]:
spark = (
    SparkSession.builder.appName(APP_NAME)
    .config("spark.driver.extraJavaOptions", "-XX:ReservedCodeCacheSize=1024m")
    .config("spark.executor.extraJavaOptions", "-XX:ReservedCodeCacheSize=1024m")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/06 10:24:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read Data & Null Handling

In [5]:
data = spark.read.csv(BOOKS_DATA_FILE_PATH, header=True, inferSchema=True)
for column in COLUMNS_TO_EMBED:
    data = data.withColumn(
        column, F.when(F.col(column).isNull(), "unknown").otherwise(F.col(column))
    )

show_dataframe(data, prefix="Books Data")

------------
Books Data:

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+------------+
|               Title|         description|             authors|               image|         previewLink|           publisher| publishedDate|            infoLink|          categories|ratingsCount|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+------------+
|Its Only Art If I...|             unknown|    ['Julie Strain']|http://books.goog...|http://books.goog...|                NULL|          1996|http://books.goog...|['Comics & Graphi...|        NULL|
|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|['Philip Nel']|http://books.goog...|http://books.goog..

## Process Data

In [6]:
tokenizers = [
    Tokenizer(inputCol=column, outputCol=f"{column}{TOKENS_COLUMN_SUFFIX}")
    for column in COLUMNS_TO_EMBED
]

word2Vecs = [
    Word2Vec(
        vectorSize=100,
        minCount=0,
        inputCol=f"{column}{TOKENS_COLUMN_SUFFIX}",
        outputCol=f"{column}{EMBEDDING_COLUMN_SUFFIX}",
    )
    for column in COLUMNS_TO_EMBED
]

stages = tokenizers + word2Vecs
word_embedding_pipeline = Pipeline(stages=stages)

model = word_embedding_pipeline.fit(data)
result = model.transform(data)

show_dataframe(result, prefix="Word embedding result")



CodeCache: size=1048576Kb used=32020Kb max_used=32046Kb free=1016556Kb
 bounds [0x0000000127800000, 0x0000000129770000, 0x0000000167800000]
 total_blobs=11292 nmethods=10308 adapters=896
 compilation: disabled (not enough contiguous free space left)


24/12/06 10:24:55 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

------------
Word embedding result:

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+
|               Title|         description|             authors|               image|         previewLink|           publisher| publishedDate|            infoLink|          categories|ratingsCount|        Title_tokens|  description_tokens|      authors_tokens|    Title_embeddings|description_embeddings|  authors_embeddings|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------

### Save as JSON

In [13]:
# # flatten the array-like columns to comma-separated strings
# # so that it can be saved as csv
# tokens_columns_to_concat = [f"{column}{TOKENS_COLUMN_SUFFIX}" for column in COLUMNS_TO_EMBED]
# embedding_columns_to_transform_and_concat = [
#     f"{column}{EMBEDDING_COLUMN_SUFFIX}" for column in COLUMNS_TO_EMBED
# ]
# for column in tokens_columns_to_concat:
#     result = result.withColumn(column, F.concat_ws(",", F.col(column)))

# for column in embedding_columns_to_transform_and_concat:
#     result = result.withColumn(f"{column}_values", F.concat_ws(",", F.col(f"{column}.values")))
#     result.drop(column)

result.write.json("./result/books_data_embedding")

                                                                                