In [18]:
import pyspark
from pyspark.sql import SparkSession

In [19]:
# Create (or reuse) the notebook's Spark session, asking for 4 GB of memory
# for the driver and for each executor.
spark = (
    SparkSession.builder
    .appName("AlsModel")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)


In [20]:
# Ratings input for ALS; the trailing glob reads every file in the folder.
# NOTE(review): this is an absolute, machine-specific path -- adjust it when
# running the notebook anywhere else.
als_input_path = "/home/vaibhavi/spark-ml-venv/ml_project/preprocessing/output/als/*"
df = spark.read.parquet(als_input_path)

In [21]:
# Peek at a single record to see the columns and value formats.
df.first()

Row(User_id='A01254073JW8SSTKH6AIB', Id='0451521196', rating=5.0)

In [22]:
# Keep the 100 users with the most ratings.
# Sorting on "count" alone leaves the order of tied users undefined, so the
# users picked at the cut-off could differ between runs (and with them the
# fitted indexers saved later). "User_id" is added as a tie-breaker so the
# selection is reproducible.
top_users = (
    df.groupBy("User_id")
    .count()
    .orderBy(["count", "User_id"], ascending=[False, True])
    .limit(100)
)

In [23]:
# Only the user key is needed for the join that follows.
top_users = top_users.select(top_users["User_id"])

In [24]:
# Restrict the full ratings table to rows written by the selected users.
top_users_ratings = top_users.join(df, ["User_id"], "inner")

In [25]:
# Show the column names and types of the joined ratings frame.
top_users_ratings.printSchema()

root
 |-- User_id: string (nullable = true)
 |-- Id: string (nullable = true)
 |-- rating: double (nullable = true)



In [26]:
# Number of rating rows that belong to the selected top users.
top_users_ratings.count()

65338

In [27]:
from pyspark.sql.functions import col, sum as Fsum

# Build one aggregate per column: turn the isNull flag into 0/1 and add it up,
# so the single result row holds the number of nulls in each column.
null_count_exprs = []
for column_name in top_users_ratings.columns:
    null_flag = col(column_name).isNull().cast("int")
    null_count_exprs.append(Fsum(null_flag).alias(column_name))

top_users_ratings.select(null_count_exprs).show()


+-------+---+------+
|User_id| Id|rating|
+-------+---+------+
|      0|  0|     0|
+-------+---+------+



In [28]:
# Import Spark's sum under an alias: a bare `sum` import would shadow Python's
# builtin sum() for every later cell in this kernel.
from pyspark.sql.functions import col, sum as Fsum

# For each column, count nulls
null_counts = top_users_ratings.select(
    [Fsum(col(c).isNull().cast("int")).alias(c) for c in top_users_ratings.columns]
)

null_counts.show()


+-------+---+------+
|User_id| Id|rating|
+-------+---+------+
|      0|  0|     0|
+-------+---+------+



In [29]:
# Preview the first rows of the joined ratings (show() prints 20 by default).
top_users_ratings.show()

+--------------+----------+------+
|       User_id|        Id|rating|
+--------------+----------+------+
|A12A08OL0TZY0W|1419191985|   3.0|
|A12A08OL0TZY0W|1557424365|   4.0|
|A12A08OL0TZY0W|B00086F8U8|   5.0|
|A12A08OL0TZY0W|B000KDJ5YY|   5.0|
|A12A08OL0TZY0W|B000L9O7D6|   5.0|
|A12A08OL0TZY0W|B000MOO1J0|   5.0|
|A13G1TKIKHGV3F|0394770722|   5.0|
|A13G1TKIKHGV3F|0698113586|   5.0|
|A13G1TKIKHGV3F|B0006AVIJ6|   5.0|
|A14OJS0VWMOSWO|0275981428|   5.0|
|A14OJS0VWMOSWO|0345283937|   5.0|
|A14OJS0VWMOSWO|0385245467|   5.0|
|A14OJS0VWMOSWO|0393060985|   5.0|
|A14OJS0VWMOSWO|0553344455|   5.0|
|A14OJS0VWMOSWO|0711963150|   5.0|
|A14OJS0VWMOSWO|0762726024|   5.0|
|A14OJS0VWMOSWO|0792266862|   5.0|
|A14OJS0VWMOSWO|0826604161|   5.0|
|A14OJS0VWMOSWO|0864922302|   5.0|
|A14OJS0VWMOSWO|0881505587|   5.0|
+--------------+----------+------+
only showing top 20 rows



In [30]:
from pyspark.ml.feature import StringIndexer

# This cell reassigns `top_users_ratings`, so a second run would see the
# "user" / "book_id" columns produced by the first run and StringIndexer would
# reject them as already-existing output columns. Dropping them first makes the
# cell safe to re-run (drop() ignores columns that are not present).
ratings_unindexed = top_users_ratings.drop("user", "book_id")

# StringIndexer for User_id -> numeric "user" column; the fitted model is
# saved so the same mapping can be reloaded later.
user_indexer = StringIndexer(inputCol="User_id", outputCol="user")
user_indexer_fitted = user_indexer.fit(ratings_unindexed)
user_indexer_fitted.write().overwrite().save("models/user_indexer_fitted")

# StringIndexer for Id -> numeric "book_id" column, saved the same way.
book_indexer = StringIndexer(inputCol="Id", outputCol="book_id")
book_indexer_fitted = book_indexer.fit(ratings_unindexed)
book_indexer_fitted.write().overwrite().save("models/book_indexer_fitted")

# Append both index columns to the ratings.
top_users_ratings = user_indexer_fitted.transform(ratings_unindexed)
top_users_ratings = book_indexer_fitted.transform(top_users_ratings)


In [31]:
# Confirm that the "user" and "book_id" index columns were added.
top_users_ratings.printSchema()

root
 |-- User_id: string (nullable = true)
 |-- Id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- user: double (nullable = false)
 |-- book_id: double (nullable = false)



In [32]:
# Preview the ratings together with their numeric user / book indices.
top_users_ratings.show()

+--------------+----------+------+----+-------+
|       User_id|        Id|rating|user|book_id|
+--------------+----------+------+----+-------+
|A12A08OL0TZY0W|1419191985|   3.0|41.0| 1932.0|
|A12A08OL0TZY0W|1557424365|   4.0|41.0| 7062.0|
|A12A08OL0TZY0W|B00086F8U8|   5.0|41.0| 8408.0|
|A12A08OL0TZY0W|B000KDJ5YY|   5.0|41.0| 4837.0|
|A12A08OL0TZY0W|B000L9O7D6|   5.0|41.0|  263.0|
|A12A08OL0TZY0W|B000MOO1J0|   5.0|41.0|  265.0|
|A13G1TKIKHGV3F|0394770722|   5.0|68.0| 1823.0|
|A13G1TKIKHGV3F|0698113586|   5.0|68.0|15464.0|
|A13G1TKIKHGV3F|B0006AVIJ6|   5.0|68.0| 7834.0|
|A14OJS0VWMOSWO|0275981428|   5.0| 0.0|11357.0|
|A14OJS0VWMOSWO|0345283937|   5.0| 0.0|11963.0|
|A14OJS0VWMOSWO|0385245467|   5.0| 0.0|12497.0|
|A14OJS0VWMOSWO|0393060985|   5.0| 0.0| 5670.0|
|A14OJS0VWMOSWO|0553344455|   5.0| 0.0|14228.0|
|A14OJS0VWMOSWO|0711963150|   5.0| 0.0|15531.0|
|A14OJS0VWMOSWO|0762726024|   5.0| 0.0|16420.0|
|A14OJS0VWMOSWO|0792266862|   5.0| 0.0| 6606.0|
|A14OJS0VWMOSWO|0826604161|   5.0| 0.0|1

25/07/22 23:44:05 WARN DAGScheduler: Broadcasting large task binary with size 1201.4 KiB


In [33]:
# Persist the indexed ratings, replacing any output left by an earlier run.
top_users_ratings.write.parquet("data/top_users_ratings.parquet", mode="overwrite")

25/07/22 23:44:13 WARN DAGScheduler: Broadcasting large task binary with size 1847.3 KiB


In [None]:

def get_als_recommendations(user_id, spark, num_items=10):
    """Return ALS recommendations for one raw user id.

    The saved user StringIndexer maps ``user_id`` to the numeric index used
    when the indexer was fitted, and the saved ALS model is then asked for
    recommendations for that index.

    Args:
        user_id: Raw ``User_id`` string as it appears in the ratings data.
        spark: Active SparkSession used to build the one-row lookup frame.
        num_items: Maximum number of recommendations to request (default 10).

    Returns:
        The DataFrame produced by ``ALSModel.recommendForUserSubset``, or
        ``None`` when ``user_id`` is ``None`` or unknown to the user indexer,
        so that a caller can fall back to another recommender.

    Raises:
        Whatever the model loaders or Spark raise for genuine failures (for
        example a missing model directory). These are no longer swallowed.
    """
    # Imported locally so the function does not depend on an earlier cell
    # having imported these names.
    from pyspark.ml.feature import StringIndexerModel
    from pyspark.ml.recommendation import ALSModel

    if user_id is None:
        return None

    user_indexer = StringIndexerModel.load("models/user_indexer_fitted")
    # "skip" drops labels the indexer has never seen instead of raising, so an
    # unknown user shows up as an empty frame rather than as an exception.
    user_indexer.setHandleInvalid("skip")

    # Make a tiny DataFrame to transform user_id
    user_df = spark.createDataFrame([(user_id,)], ["User_id"])
    indexed = user_indexer.transform(user_df)
    # StringIndexer emits a double column; ALS ids are integers.
    # NOTE(review): assumes the ALS model was trained with userCol="user" -- confirm.
    indexed_user = indexed.select(indexed["user"].cast("int").alias("user"))

    if not indexed_user.head(1):
        return None  # Not in training, let hybrid fallback to content

    # Load the ALS model only once the user is known to exist.
    als_model = ALSModel.load("models/als_model")
    return als_model.recommendForUserSubset(indexed_user, numItems=num_items)
