In [1]:
import pyspark
from pyspark.sql import SparkSession 

In [2]:
spark = SparkSession.builder \
    .appName("ContentRecommendationEDA")\
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.memory", "2g") \
    .getOrCreate()

25/07/21 21:32:11 WARN Utils: Your hostname, vaibhavi-HP-Laptop-15-fd0xxx resolves to a loopback address: 127.0.1.1; using 192.168.0.128 instead (on interface wlo1)
25/07/21 21:32:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/21 21:32:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.parquet("/home/vaibhavi/spark-ml-venv/ml_project/preprocessing/output/content/*")
#/home/vaibhavi/spark-ml-venv/ml_project/preprocessing/output


                                                                                

In [4]:
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- review_count: long (nullable = true)



In [5]:
# checking the null values in th df
from pyspark.sql.functions import col, sum

null_counts = df.select([
    sum(col(c).isNull().cast("int")).alias(c) for c in df.columns
])

null_counts.show()


+-----+-------+----------+------------+
|Title|authors|categories|review_count|
+-----+-------+----------+------------+
|    0|  31103|     40011|           0|
+-----+-------+----------+------------+



In [6]:
df.select("Title").distinct().count()
#df.count()


188980

In [7]:
df.count()

188980

In [8]:
#there are 188980 distinct itles, and total 188980 titles, so it is not needed to get them one hot encoded or numbered as they are different for each entry.


In [9]:
print(df.select("authors").distinct().count())

print(df.select("categories").distinct().count())


112894
10409


In [10]:
df.show()

+--------------------+--------------------+--------------------+------------+
|               Title|             authors|          categories|review_count|
+--------------------+--------------------+--------------------+------------+
|"""Carefree"" (R....|['Allan Scott', '...|                NULL|           1|
|"""Glory is a-com...|['Martha Peterson...|         ['Indiana']|           2|
|"""I Do""...Weddi...|                NULL|['Business & Econ...|          12|
|"""I just got a j...|                NULL|                NULL|          17|
|"""Little Rainman...|['Karen L. Simmons']|['Autistic childr...|           9|
|"""Nothing but pr...|                NULL|         ['History']|           1|
|"""Purse""onalize...|['Andrews McMeel ...|                NULL|           1|
|"""What shall we ...|   ['Clarence Cook']|                NULL|           1|
|"Confessions of a...|['Ed Roth', 'Howi...|['Biography & Aut...|           2|
|"Discovery of the...|['Robert D. Balla...|       ['Derelicts']|

In [11]:
# that means there are repeated authors and categories. this results in sparse matrices with traditional trechniques of one hot encoding or string indexer.
# so we will use word2vec for generating the final vector embedding for the cosine  similarity!
from pyspark.sql.functions import when , col

df_final = df \
    .withColumn("authors", when(col("authors").isNull(), "Unknown").otherwise(col("authors"))) \
    .withColumn("categories", when(col("categories").isNull(), "Unknown").otherwise(col("categories"))) \
    .withColumn("review_count", when(col("review_count").isNull(), 0).otherwise(col("review_count"))) \
    .filter(col("Title").isNotNull())


In [12]:
df_final.show()

+--------------------+--------------------+--------------------+------------+
|               Title|             authors|          categories|review_count|
+--------------------+--------------------+--------------------+------------+
|"""Carefree"" (R....|['Allan Scott', '...|             Unknown|           1|
|"""Glory is a-com...|['Martha Peterson...|         ['Indiana']|           2|
|"""I Do""...Weddi...|             Unknown|['Business & Econ...|          12|
|"""I just got a j...|             Unknown|             Unknown|          17|
|"""Little Rainman...|['Karen L. Simmons']|['Autistic childr...|           9|
|"""Nothing but pr...|             Unknown|         ['History']|           1|
|"""Purse""onalize...|['Andrews McMeel ...|             Unknown|           1|
|"""What shall we ...|   ['Clarence Cook']|             Unknown|           1|
|"Confessions of a...|['Ed Roth', 'Howi...|['Biography & Aut...|           2|
|"Discovery of the...|['Robert D. Balla...|       ['Derelicts']|

In [13]:
df_final.count()

188980

In [14]:
# for the cosine model, we need vectors
# title is done using tf-idf vectorizer

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF

# Tokenize title
tokenizer = RegexTokenizer(inputCol="Title", outputCol="title_tokens", pattern="\\W")
remover = StopWordsRemover(inputCol="title_tokens", outputCol="title_filtered")

# TF-IDF
hashingTF = HashingTF(inputCol="title_filtered", outputCol="title_tf", numFeatures=500)
idf = IDF(inputCol="title_tf", outputCol="title_tfidf")


In [15]:
#word2vec for authors and categories
from pyspark.ml.feature import Word2Vec, Tokenizer

# Tokenize authors on comma
author_tokenizer = RegexTokenizer(inputCol="authors", outputCol="author_tokens", pattern="\\s*,\\s*")

# Word2Vec
author_w2v = Word2Vec(vectorSize=50, minCount=1, inputCol="author_tokens", outputCol="author_vec")


In [16]:
# Clean brackets and split by comma
category_tokenizer = RegexTokenizer(inputCol="categories", outputCol="category_tokens", pattern="\\s*,\\s*|\\[|\\]|'")
category_w2v = Word2Vec(vectorSize=30, minCount=1, inputCol="category_tokens", outputCol="category_vec")


In [17]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

review_assembler = VectorAssembler(inputCols=["review_count"], outputCol="review_count_vec")
review_scaler = MinMaxScaler(inputCol="review_count_vec", outputCol="review_count_scaled")


In [18]:
df_final.printSchema()

root
 |-- Title: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- review_count: long (nullable = true)



In [19]:
# final combining all the features
final_assembler = VectorAssembler(
    inputCols=["title_tfidf", "author_vec", "category_vec", "review_count_scaled"],
    outputCol="final_features"
)


In [20]:
#pipeline for the eda stesp!
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
    tokenizer, remover, hashingTF, idf,
    author_tokenizer, author_w2v,
    category_tokenizer, category_w2v,
    review_assembler, review_scaler,
    final_assembler
])


In [21]:
#executing the pipeline - 
model = pipeline.fit(df_final)
df_vectorized = model.transform(df_final)


                                                                                

In [22]:
df_vectorized.show(10)

+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+
|               Title|             authors|          categories|review_count|        title_tokens|      title_filtered|            title_tf|         title_tfidf|       author_tokens|          author_vec|     category_tokens|        category_vec|review_count_vec| review_count_scaled|      final_features|
+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+
|"""Carefree"" (R....|['Allan Scott', '...|             Unknown|           1|[carefre

In [23]:
df_vectorized.select("Title", "final_features").write.mode("overwrite").parquet("vectorized_books.parquet")


                                                                                

In [24]:
df_vectorized.count()

188980

In [25]:
from pyspark.sql import functions as F


In [26]:
df_vectorized.printSchema()

root
 |-- Title: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- title_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title_tf: vector (nullable = true)
 |-- title_tfidf: vector (nullable = true)
 |-- author_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- author_vec: vector (nullable = true)
 |-- category_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_vec: vector (nullable = true)
 |-- review_count_vec: vector (nullable = true)
 |-- review_count_scaled: vector (nullable = true)
 |-- final_features: vector (nullable = true)



In [29]:
from pyspark.sql.functions import col

df_vectorized.filter(col("review_count") > 5000).select("title").show(truncate=False)


+-------------------------------------+
|title                                |
+-------------------------------------+
|Atlas Shrugged                       |
|The Hobbit                           |
|The Great Gatsby                     |
|Brave New World                      |
|Of Mice and Men                      |
|The Giver                            |
|The Picture of Dorian Gray           |
|Persuasion                           |
|Great Expectations                   |
|Pride and Prejudice                  |
|Mere Christianity                    |
|Wuthering Heights                    |
|Harry Potter and The Sorcerer's Stone|
+-------------------------------------+



In [32]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.ml.functions import vector_to_array
from pyspark.sql.types import DoubleType
import numpy as np
import pandas as pd

In [37]:
book_vector = df_vectorized.filter(col("Title") == "The Hobbit").select("final_features").first()["final_features"]
broadcast_vec = spark.sparkContext.broadcast(book_vector.toArray())


In [38]:
def cosine_sim(vec):
    if vec is None:
        return 0.0
    vec1 = vec.toArray()
    vec2 = broadcast_vec.value
    dot = float(np.dot(vec1, vec2))
    norm = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return float(dot / norm) if norm != 0 else 0.0

cosine_udf = udf(cosine_sim, DoubleType())


In [39]:
df_sim = df_vectorized.withColumn("similarity", cosine_udf("final_features"))
df_sim.orderBy(col("similarity").desc()).select("title", "similarity").show(10, truncate=False)




+-----------------------------------------------------+------------------+
|title                                                |similarity        |
+-----------------------------------------------------+------------------+
|The Hobbit                                           |1.0000000000000002|
|Crucible, The                                        |0.9789467997140651|
|Caldecott                                            |0.9783885673884307|
|Q                                                    |0.9783778589111998|
|The Vision                                           |0.9783415580575551|
|Sarkhan                                              |0.9783331754005907|
|Remote                                               |0.9783030761963375|
|Musclebound                                          |0.9783009609842661|
|The Reaches                                          |0.9782770157834391|
|When Marian Sang: The True Recital of Marian Anderson|0.8793333615384179|
+------------------------

                                                                                