In [1]:
from pyspark.mllib.recommendation import ALS
from pyspark.mllib.recommendation import Rating

In [2]:
# Sizing knobs; their product is used as the partition count when the ratings are loaded.
# NOTE(review): presumably these mirror the actual cluster layout — confirm before reuse.
NUM_WORKERS = 4      # worker nodes
CORES_PER_NODE = 2   # cores on each worker
REP_FACTOR = 4       # multiplier applied on top of the total core count

In [3]:
# Read the ratings file; each row is (fromUserId, toUserId, rating) and there is no header line.
# NOTE(review): ratings were described as 0-9, but sample rows in this notebook include a 10 —
# presumably the scale is 1-10; confirm against the dataset documentation.
num_partitions = CORES_PER_NODE * NUM_WORKERS * REP_FACTOR

ratings_raw_DF = (
    sqlContext.read.format("com.databricks.spark.csv")
    .options(header="false")
    .load("s3n://insight-spark-after-dark/ratings.csv.gz")
    .repartition(num_partitions)
    # Keep the repartitioned rows (serialized, spilling to disk if needed) for the later cells.
    .persist(StorageLevel.MEMORY_AND_DISK_SER)
)

In [4]:
# Expose the raw ratings DataFrame to Spark SQL under the name "ratings_raw_tbl"
# so that it can be referenced in the FROM clause of a sqlContext.sql(...) query.
ratings_raw_DF.registerTempTable("ratings_raw_tbl")

In [5]:
# Number of raw rating rows. count() is a Spark action, so running it forces the
# load/repartition to execute and populates the persisted copy.
ratings_raw_DF.count()

17359346

In [6]:
# Peek at a few raw rows: every column arrives as a string, auto-named C0, C1, C2.
ratings_raw_DF.take(5)

[Row(C0=u'1', C1=u'4062', C2=u'3'),
 Row(C0=u'1', C1=u'19727', C2=u'10'),
 Row(C0=u'1', C1=u'37184', C2=u'9'),
 Row(C0=u'1', C1=u'51524', C2=u'6'),
 Row(C0=u'1', C1=u'71042', C2=u'5')]

In [7]:
# Give the raw string columns a typed schema: (from_user_id, to_user_id, rating), all ints.
# The query reads from the "ratings_raw_tbl" temp table.
cast_ratings_query = """
SELECT
    CAST(C0 as int) AS from_user_id,
    CAST(C1 as int) AS to_user_id,
    CAST(C2 as int) AS rating
FROM 
    ratings_raw_tbl
"""
ratings_DF = sqlContext.sql(cast_ratings_query)

In [8]:
# Turn each typed row into an mllib.recommendation.Rating: the rating's sender plays the
# "user" role and the rated profile plays the "product" role.
ratings_RDD = ratings_DF.rdd.map(
    lambda row: Rating(user=row.from_user_id,
                       product=row.to_user_id,
                       rating=row.rating)
)

ratings_RDD.take(5)

[Rating(user=1, product=3751, rating=7.0),
 Rating(user=1, product=19231, rating=5.0),
 Rating(user=1, product=36750, rating=2.0),
 Rating(user=1, product=51399, rating=7.0),
 Rating(user=1, product=70694, rating=8.0)]

In [9]:
# Separate ratings data into training data (80%) and test data (20%).
# A fixed seed keeps the split repeatable across kernel restarts, so the reported error
# can be compared between runs (assuming ratings_RDD is computed the same way each time).
split_ratings_RDD = ratings_RDD.randomSplit([0.8, 0.2], seed=42)
training_ratings_RDD, test_ratings_RDD = split_ratings_RDD

In [10]:
# Train the ALS model on the training split.
# Hyperparameters are spelled out by keyword so each number is self-describing
# (the last positional value in the old call was `blocks`, which is easy to misread).
# The seed pins the random factor initialisation so retraining gives the same model.
model = ALS.train(
    training_ratings_RDD,
    rank=1,          # number of latent factors
    iterations=5,    # ALS iterations
    lambda_=0.01,    # regularisation strength
    blocks=10,       # parallelism of the factorisation
    seed=42,
)

In [11]:
# Strip the known rating from each test record, leaving only the (from, to) pair
# that the model will be asked to score.
test_from_to_RDD = test_ratings_RDD.map(lambda rating: (rating.user, rating.product))

In [12]:
# Test the model by predicting ratings for the held-out (from, to) pairs.
# predictAll yields Rating(user, product, rating) records in which `rating` is the model's
# prediction; predictions are not clamped to the rating scale (the sample output has one above 10).
actual_predictions_RDD = model.predictAll(test_from_to_RDD)

actual_predictions_RDD.take(5)

[Rating(user=116685, product=193370, rating=3.9002635351928348),
 Rating(user=54499, product=193370, rating=2.984109066463475),
 Rating(user=58863, product=108150, rating=10.032405957363949),
 Rating(user=57163, product=108150, rating=9.413207908263985),
 Rating(user=107380, product=108150, rating=9.182127511317617)]

In [13]:
# Key both sides by (from, to) so they can be joined:
#   actual_predictions_RDD -> ((from, to), rating predicted by the model)
#   test_predictions_RDD   -> ((from, to), known rating from the test split)
# The predictions are rebuilt from the model here instead of re-mapping
# actual_predictions_RDD onto itself, so re-running this cell does not try to
# re-key records that are already keyed (which would fail on r[2]).
actual_predictions_RDD = model.predictAll(test_from_to_RDD) \
                              .map(lambda r: ((r[0], r[1]), r[2]))
test_predictions_RDD = test_ratings_RDD.map(lambda r: ((r[0], r[1]), r[2]))

In [14]:
# Join the known test ratings with the model's predictions on the (from, to) key.
# RDD.join is an inner join, so only pairs present on both sides survive; each value is
# (known rating, predicted rating).
# NOTE(review): if predictAll returns nothing for some test pairs (e.g. users/products absent
# from the training split), those pairs are silently left out of the evaluation — confirm.
test_to_actual_ratings_RDD = test_predictions_RDD.join(actual_predictions_RDD)
test_to_actual_ratings_RDD.take(10)

[((10000, 117276), (5, 4.270460474679936)),
 ((60027, 168951), (4, 4.634894943158201)),
 ((121989, 99783), (1, 2.0501199552357434)),
 ((113819, 86977), (10, 7.762318729967319)),
 ((133732, 49038), (10, 8.392107996765276)),
 ((131976, 175554), (8, 8.440695135691385)),
 ((11556, 64056), (9, 9.293104485887511)),
 ((48324, 37330), (6, 8.723768284233415)),
 ((8305, 101387), (10, 10.20807837129098)),
 ((33662, 45352), (5, 5.333782872208303))]

In [15]:
# Evaluate the model using Mean Absolute Error (MAE): the average of
# |known rating - predicted rating| over the joined test pairs.
mean_absolute_rating_error = test_to_actual_ratings_RDD \
    .map(lambda kv: abs(kv[1][0] - kv[1][1])) \
    .mean()

# Parenthesized so the cell also parses under Python 3; prints the same text under Python 2.
print(mean_absolute_rating_error)

1.91554240348
