In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [112]:
spark = SparkSession.builder.appName("BookRecommendations").getOrCreate()

# Load User and Book Ratings data into PySpark DataFrames
user_df = spark.read.csv("BX-Users.csv", header=True, inferSchema=True, sep=';')
ratings_df = spark.read.csv("BX-Book-Ratings.csv", header=True, inferSchema=True, sep=';')
books_df = spark.read.csv("BX-Books.csv", header=True, inferSchema=True, sep=';')

In [105]:
books_df.show()

+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari

In [113]:
usercol = ['userid', 'location', 'age']
user_df = user_df.toDF(*usercol)
user_df.show()

+------+--------------------+----+
|userid|            location| age|
+------+--------------------+----+
|     1|  nyc, new york, usa|NULL|
|     2|stockton, califor...|  18|
|     3|moscow, yukon ter...|NULL|
|     4|porto, v.n.gaia, ...|  17|
|     5|farnborough, hant...|NULL|
|     6|santa monica, cal...|  61|
|     7| washington, dc, usa|NULL|
|     8|timmins, ontario,...|NULL|
|     9|germantown, tenne...|NULL|
|    10|albacete, wiscons...|  26|
|    11|melbourne, victor...|  14|
|    12|fort bragg, calif...|NULL|
|    13|barcelona, barcel...|  26|
|    14|mediapolis, iowa,...|NULL|
|    15|calgary, alberta,...|NULL|
|    16|albuquerque, new ...|NULL|
|    17|chesapeake, virgi...|NULL|
|    18|rio de janeiro, r...|  25|
|    19|           weston, ,|  14|
|    20|langhorne, pennsy...|  19|
+------+--------------------+----+
only showing top 20 rows



In [114]:
bookscol = ['isbn', 'booktitle', 'bookauthor', 'yearofpublication', 'publisher', 'imageurls', 'imageurlm', 'imageurll']
books_df = books_df.toDF(*bookscol)
books_df.show()

+----------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|      isbn|           booktitle|          bookauthor|yearofpublication|           publisher|           imageurls|           imageurlm|           imageurll|
+----------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|             2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|             2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|             1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari Kolata|    

In [115]:
ratingscol = ['userid', 'isbn', 'rating']
ratings_df = ratings_df.toDF(*ratingscol)
ratings_df.show()

+------+----------+------+
|userid|      isbn|rating|
+------+----------+------+
|276725|034545104X|     0|
|276726|0155061224|     5|
|276727|0446520802|     0|
|276729|052165615X|     3|
|276729|0521795028|     6|
|276733|2080674722|     0|
|276736|3257224281|     8|
|276737|0600570967|     6|
|276744|038550120X|     7|
|276745| 342310538|    10|
|276746|0425115801|     0|
|276746|0449006522|     0|
|276746|0553561618|     0|
|276746|055356451X|     0|
|276746|0786013990|     0|
|276746|0786014512|     0|
|276747|0060517794|     9|
|276747|0451192001|     0|
|276747|0609801279|     0|
|276747|0671537458|     9|
+------+----------+------+
only showing top 20 rows



In [116]:
user_df = user_df.select(col("userid").cast("int"), col("location").cast("string"), col("age").cast("int"))

In [117]:
ratings_df = ratings_df.select(col("userid").cast("int"), col("isbn").cast("int"), col("rating").cast("double"))

In [118]:
user_df = user_df.na.drop()
user_df.show()

+------+--------------------+---+
|userid|            location|age|
+------+--------------------+---+
|     2|stockton, califor...| 18|
|     4|porto, v.n.gaia, ...| 17|
|     6|santa monica, cal...| 61|
|    10|albacete, wiscons...| 26|
|    11|melbourne, victor...| 14|
|    13|barcelona, barcel...| 26|
|    18|rio de janeiro, r...| 25|
|    19|           weston, ,| 14|
|    20|langhorne, pennsy...| 19|
|    21|ferrol / spain, a...| 46|
|    24|cologne, nrw, ger...| 19|
|    25|oakland, californ...| 55|
|    27|chicago, illinois...| 32|
|    28|freiburg, baden-w...| 24|
|    29|cuernavaca, alaba...| 19|
|    30|anchorage, alaska...| 24|
|    31|shanghai, n/a, china| 20|
|    33|costa mesa, calif...| 34|
|    35|grafton, wisconsi...| 17|
|    36|montreal, quebec,...| 24|
+------+--------------------+---+
only showing top 20 rows



In [119]:
ratings_df = ratings_df.na.drop()

In [120]:
books_df.createOrReplaceTempView('booksTable')

In [121]:
user_df.createOrReplaceTempView('userTable')

In [122]:
ratings_df.createOrReplaceTempView('ratingsTable')

In [127]:
long_table = user_df.join(ratings_df, 'userid', 'INNER')
long_table1 = long_table.join(books_df, 'isbn', 'INNER')

In [128]:
long_table1.show()

+-------+------+--------------------+---+------+--------------------+----------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|   isbn|userid|            location|age|rating|           booktitle|      bookauthor|yearofpublication|           publisher|           imageurls|           imageurlm|           imageurll|
+-------+------+--------------------+---+------+--------------------+----------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|1047973| 11944|santo tirso, port...| 21|   9.0|     Brave New World|   Aldous Huxley|             1999|Trafalgar Square ...|http://images.ama...|http://images.ama...|http://images.ama...|
|1048082|109901|sydney, new south...| 59|   8.0|     Made in America|     Bill Bryson|             1995|HarperCollins Pub...|http://images.ama...|http://images.ama...|http://images.ama...|
|1846086|122874|melbourne, south ...| 25|   0.0|There W

In [132]:
(training_data, test_data) = long_table1.randomSplit([0.8, 0.2], seed=1234)

In [133]:
from pyspark.ml.recommendation import ALS

# Create an ALS model
als = ALS(maxIter=5, rank=100, regParam=0.15, userCol="userid", itemCol="isbn", ratingCol="rating", coldStartStrategy = "drop")

# Fit the model to the ratings data
model = als.fit(training_data)
recommendations = model.transform(test_data)


In [134]:
recommendations.show()

+-------+------+--------------------+---+------+--------------------+----------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------+
|   isbn|userid|            location|age|rating|           booktitle|      bookauthor|yearofpublication|           publisher|           imageurls|           imageurlm|           imageurll| prediction|
+-------+------+--------------------+---+------+--------------------+----------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------+
|1939203|265595|palmerston north,...| 31|   0.0|The Collins Book ...|Jonathan Langley|             1993|HarperCollins Pub...|http://images.ama...|http://images.ama...|http://images.ama...|  0.5502374|
|2243962| 57725|wilmington, delaw...| 43|   0.0|Girlfriend In a Coma|Douglas Coupland|                0|Harper Collins Pu...|http://images.ama...|http://images.ama...|http://images.ama...|  0.3632

In [135]:
# Import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", 
predictionCol="prediction")

# Evaluate the "test_predictions" dataframe
RMSE = evaluator.evaluate(recommendations)

# Print the RMSE
print (RMSE)

3.9988915155055147


In [138]:
# Generate book recommendations for a specific user (e.g., user_id = 123)
user_id = 231263
user_books = long_table1.filter(ratings_df["userid"] == user_id)
user_unrated_books = long_table1.subtract(user_books)

# Predict ratings for unrated books
recommendations = model.transform(user_unrated_books)
top_recommendations = recommendations.orderBy("prediction", ascending=False).limit(10)

# Display the top recommended books
top_recommendations.show()

+---------+------+--------------------+---+------+--------------------+-------------------+-----------------+--------------------+--------------------+--------------------+--------------------+----------+
|     isbn|userid|            location|age|rating|           booktitle|         bookauthor|yearofpublication|           publisher|           imageurls|           imageurlm|           imageurll|prediction|
+---------+------+--------------------+---+------+--------------------+-------------------+-----------------+--------------------+--------------------+--------------------+--------------------+----------+
|931432820|106849|dunkirk, indiana,...| 39|  10.0|The Complete Book...|Valerie Ann Worwood|             1991|   New World Library|http://images.ama...|http://images.ama...|http://images.ama...|  12.35194|
|688167888|177375|w.m., pennsylvani...| 42|   8.0|Miss Julia Speaks...|        Ann B. Ross|             1999|William Morrow &a...|http://images.ama...|http://images.ama...|http://i

In [140]:
user_books.filter(user_unrated_books["isbn"] == 931432820).show()

+----+------+--------+---+------+---------+----------+-----------------+---------+---------+---------+---------+
|isbn|userid|location|age|rating|booktitle|bookauthor|yearofpublication|publisher|imageurls|imageurlm|imageurll|
+----+------+--------+---+------+---------+----------+-----------------+---------+---------+---------+---------+
+----+------+--------+---+------+---------+----------+-----------------+---------+---------+---------+---------+

