In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.mllib.util import Saveable
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType

from recommenders.utils.timer import Timer
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.utils.notebook_utils import store_metadata

# Report the Python interpreter and PySpark versions this notebook runs with.
print(f"System version: {sys.version}")
print("Spark version: {}".format(pyspark.__version__))


System version: 3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 10:40:35) [GCC 12.3.0]
Spark version: 3.5.0


In [19]:
# Create (or reuse) the Spark session in this cell so it does not rely on a
# `spark` variable that is only assigned further down the notebook; on a fresh
# kernel the original lookup of `spark` here raised NameError. The arguments
# mirror the session set-up cell so both cells ask for the same session.
spark = start_or_get_spark("ALS PySpark", memory="6g")
sc = spark.sparkContext

# Directory (relative to the working directory) used for Spark checkpoints.
sc.setCheckpointDir('checkpoint')

In [4]:
# Number of items to recommend per user.
TOP_K = 10

# Names of the user, item and rating columns used throughout the notebook.
COL_USER, COL_ITEM, COL_RATING = "User-ID", "Item-ID", "Book-Rating"

In [2]:
# Local debugging configuration (e.g. on a single VM): one Spark session with a
# 6g memory cap. Adjust these settings before running on a cluster.
spark = start_or_get_spark(app_name="ALS PySpark", memory="6g")

# Disable the analyzer check that fails queries containing ambiguous self-joins.
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

In [3]:
# Load the cleaned ratings file: the first line is the header and column types
# are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./data/cleaned_ratings.csv')
)

# Preview the first ten rows.
df.show(n=10)

+---+-------+----------+-----------+
|_c0|User-ID|      ISBN|Book-Rating|
+---+-------+----------+-----------+
|  1| 276726|0155061224|          5|
|  3| 276729|052165615X|          3|
|  4| 276729|0521795028|          6|
|  6| 276736|3257224281|          8|
|  7| 276737|0600570967|          6|
|  8| 276744|038550120X|          7|
|  9| 276745| 342310538|         10|
| 16| 276747|0060517794|          9|
| 19| 276747|0671537458|          9|
| 20| 276747|0679776818|          8|
+---+-------+----------+-----------+
only showing top 10 rows



In [13]:
# Map every ISBN string to a numeric index. The output column is named via
# COL_ITEM (instead of repeating the literal) so that it always matches the
# `itemCol` handed to the ALS estimator.
indexer = StringIndexer(inputCol="ISBN", outputCol=COL_ITEM)
data = indexer.fit(df).transform(df)

In [14]:
# Preview the first ten rows, now including the indexed item column.
data.show(n=10)

+---+-------+----------+-----------+--------+
|_c0|User-ID|      ISBN|Book-Rating| Item-ID|
+---+-------+----------+-----------+--------+
|  1| 276726|0155061224|          5| 67111.0|
|  3| 276729|052165615X|          3| 98858.0|
|  4| 276729|0521795028|          6| 98875.0|
|  6| 276736|3257224281|          8| 19512.0|
|  7| 276737|0600570967|          6|105804.0|
|  8| 276744|038550120X|          7|   217.0|
|  9| 276745| 342310538|         10|163094.0|
| 16| 276747|0060517794|          9|  1085.0|
| 19| 276747|0671537458|          9|  2662.0|
| 20| 276747|0679776818|          8|  1996.0|
+---+-------+----------+-----------+--------+
only showing top 10 rows



## Split data

In [15]:
# Randomly split the ratings into 75% train / 25% test with a fixed seed.
train, test = spark_random_split(data, ratio=0.75, seed=123)

# Cache each split and report its row count (the count materialises the cache).
for label, split in (("N train", train), ("N test", test)):
    print(label, split.cache().count())

N train 325373
N test 108298


## Train the ALS model on the training data and get the top-k recommendations for our test data

In [16]:
# Column-name arguments passed to the ALS estimator.
header = dict(userCol=COL_USER, itemCol=COL_ITEM, ratingCol=COL_RATING)

# Explicit-feedback ALS (implicitPrefs=False) with a fixed seed; rows whose
# prediction is NaN are dropped (coldStartStrategy='drop').
als = ALS(
    **header,
    rank=10,
    maxIter=15,
    regParam=0.05,
    implicitPrefs=False,
    nonnegative=False,
    coldStartStrategy='drop',
    seed=42,
)

In [17]:
# Fit the ALS model on the training split and measure how long it takes.
with Timer() as train_time:
    model = als.fit(train)

training_seconds = train_time.interval
print("Took {} seconds for training.".format(training_seconds))

Took 117.13044310000078 seconds for training.


In [None]:
with Timer() as test_time:

    # Score every (user, item) combination built from the distinct users and
    # distinct items of the training split. This is a full cross join, so the
    # number of scored rows is n_users * n_items.
    users = train.select(COL_USER).distinct()
    items = train.select(COL_ITEM).distinct()
    user_item = users.crossJoin(items)
    dfs_pred = model.transform(user_item)

    # Remove seen items: keep only predictions whose (user, item) pair does not
    # occur in the training split. A left anti join expresses this directly,
    # instead of a full outer join followed by a null filter on the rating.
    # The keys are referenced through the "pred"/"train" aliases because both
    # DataFrames derive from `train`, which makes DataFrame-bound column
    # references ambiguous in a self-join.
    seen_in_train = (
        (F.col("pred." + COL_USER) == F.col("train." + COL_USER))
        & (F.col("pred." + COL_ITEM) == F.col("train." + COL_ITEM))
    )
    dfs_pred_exclude_train = dfs_pred.alias("pred").join(
        train.alias("train"),
        on=seen_in_train,
        how='left_anti'
    )

    top_all = dfs_pred_exclude_train.select(
        'pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' + "prediction"
    )

    # Spark transformations are evaluated lazily, so run an action here to
    # force execution and make the measured time include the actual scoring.
    top_all.cache().count()

print("Took {} seconds for prediction.".format(test_time.interval))