# Collaborative Filtering Classification Example.

In [1]:
import findspark
findspark.init('/Users/Zoe/spark-2.1.0-bin-hadoop2.7/')
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

## ALS example

In [2]:
from __future__ import print_function
from pyspark.sql import SparkSession

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [79]:
spark = SparkSession\
        .builder\
        .appName("ALSExample")\
        .getOrCreate()

In [80]:
lines = spark.read.text("/Users/Zoe/spark-2.1.0-bin-hadoop2.7/data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]), timestamp=long(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [86]:
training.count()

1189

In [85]:
# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")
model = als.fit(training)

In [87]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.80291727044


In [88]:
spark.stop()

In [20]:
# Save and load model
#model.save(sc, "target/tmp/myCollaborativeFilter")
#sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")

## ML Recommendation System Example

In [5]:
from pyspark.ml.recommendation import ALS

In [6]:
df = spark.createDataFrame([(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],["user", "item", "rating"])
als = ALS(rank=10, maxIter=5, seed=0)
model = als.fit(df)
model.rank

10

In [11]:
df.collect()

[Row(user=0, item=0, rating=4.0),
 Row(user=0, item=1, rating=2.0),
 Row(user=1, item=1, rating=3.0),
 Row(user=1, item=2, rating=4.0),
 Row(user=2, item=1, rating=1.0),
 Row(user=2, item=2, rating=5.0)]

In [12]:
model.userFactors.orderBy("id").collect()

[Row(id=0, features=[0.43850013613700867, -0.325748473405838, 0.22142787277698517, 0.5432963371276855, 0.37189164757728577, 0.18800288438796997, 0.7535192966461182, -0.1935736984014511, 0.6309714913368225, -0.09739827364683151]),
 Row(id=1, features=[0.3618026077747345, -0.7803052663803101, 0.4669971466064453, -0.005516086705029011, 0.47553545236587524, -0.2531031668186188, 0.10632749646902084, -0.30544209480285645, 0.8610320687294006, 0.38398146629333496]),
 Row(id=2, features=[-0.10391315817832947, -0.5550527572631836, 0.30898740887641907, -0.640823483467102, 0.11988699436187744, -0.533919095993042, -0.7560914754867554, -0.13308270275592804, 0.2660013437271118, 0.569276750087738])]

In [13]:
test = spark.createDataFrame([(0, 2), (1, 0), (2, 0)], ["user", "item"])

In [14]:
predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])
predictions[0]

Row(user=0, item=2, prediction=-0.13807615637779236)

In [15]:
predictions[1]

Row(user=1, item=0, prediction=2.6258413791656494)

In [16]:
predictions[2]

Row(user=2, item=0, prediction=-1.5018409490585327)

In [17]:
#als_path = temp_path + "/als"
#als.save(als_path)
#als2 = ALS.load(als_path)
#als.getMaxIter()

#model_path = temp_path + "/als_model"
#model.save(model_path)
#model2 = ALSModel.load(model_path)
#model.rank == model2.rank
#sorted(model.userFactors.collect()) == sorted(model2.userFactors.collect())
#sorted(model.itemFactors.collect()) == sorted(model2.itemFactors.collect())

## ALS Naive Implementation

In [3]:
from __future__ import print_function

import sys

import numpy as np
from numpy.random import rand
from numpy import matrix
from pyspark.sql import SparkSession

In [4]:
LAMBDA = 0.01   # regularization
np.random.seed(42)

In [5]:
def rmse(R, ms, us):
    diff = R - ms * us.T
    return np.sqrt(np.sum(np.power(diff, 2)) / (M * U))

In [6]:
def update(i, mat, ratings):
    #uu = mat.shape[0]
    ff = mat.shape[1]
    nz = np.nonzero(ratings[i,:])[1]
    m = mat[nz,:]

    XtX = m.T * m
    Xty = m.T * ratings[i, nz].T

    #for j in range(ff):
    #    XtX[j, j] += LAMBDA * m.shape[0]
    XtX += LAMBDA*(m.shape[0])*np.eye(ff)
    

    return np.linalg.solve(XtX, Xty)

In [9]:
%timeit update(0,usb.value,Rb.value)

The slowest run took 128.10 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 83.1 µs per loop


In [73]:
#spark = SparkSession.builder.appName("PythonALS").getOrCreate()

In [7]:
M = 10
U = 10
F = 5
ITERATIONS = 5
partitions = 2

In [13]:
R = matrix(rand(M, F)) * matrix(rand(U, F).T)
mask = np.random.randint(0,2,size=R.shape).astype(np.bool)
R[mask] = 0
#R = matrix([[4,2,0],[0,3,4],[0,1,5]])
ms = matrix(rand(M, F))
us = matrix(rand(U, F))
th = matrix(rand(U, F))

Rb = sc.broadcast(R)
msb = sc.broadcast(ms)
usb = sc.broadcast(us)
thb = sc.broadcast(th)

In [10]:
for i in range(ITERATIONS):
    ms = sc.parallelize(range(M), partitions) \
            .map(lambda x: update(x, usb.value, Rb.value)) \
            .collect()
    # collect() returns a list, so array ends up being
    # a 3-d array, we take the first 2 dims for the matrix
    ms = matrix(np.array(ms)[:, :, 0])
    msb = sc.broadcast(ms)

    us = sc.parallelize(range(U), partitions) \
            .map(lambda x: update(x, msb.value, Rb.value.T, False)) \
            .collect()
    us = matrix(np.array(us)[:, :, 0])
    usb = sc.broadcast(us)
    
    nz = np.nonzero(R)
    error = rmse(R[nz], ms[nz[0],:], us[nz[1],:])
    print("Iteration %d:" % i)
    print("\nRMSE: %5.4f\n" % error)

Iteration 0:

RMSE: 3.2983

Iteration 1:

RMSE: 2.3184

Iteration 2:

RMSE: 2.1013

Iteration 3:

RMSE: 2.0110

Iteration 4:

RMSE: 1.9494



In [11]:
ms.dot(us.T) # not what I want, I don't want it loop over zero-elements in R

matrix([[ 2.40935805,  0.61602627,  1.24770649,  1.12782777,  1.42015165,
          1.26728724,  1.08850637,  1.86017989,  1.71087455,  1.01136333],
        [ 1.95560764,  0.59406384,  1.18235118,  1.47216144,  1.12387277,
          0.53575459,  0.57923864,  1.49748901,  0.88769525,  0.56074872],
        [ 1.83243503,  0.36748001,  0.87271494,  0.52867683,  1.05352335,
          1.18983318,  0.9345224 ,  1.41769445,  1.5427175 ,  0.88040946],
        [ 1.26842432,  0.45932612,  0.55839607,  0.51891521,  0.75017009,
          0.80583287,  0.7134516 ,  0.8977061 ,  1.06648775,  0.64703118],
        [ 1.32007282,  0.79086562,  0.5622812 ,  1.03258311,  1.10214332,
          0.74194069,  0.85584952,  1.01077746,  0.93868099,  0.53669571],
        [ 1.48133598,  0.84763592,  0.57905369,  0.94332785,  1.16607932,
          0.9681665 ,  1.02214744,  1.09005295,  1.21684527,  0.70784743],
        [ 2.21122237,  0.9159136 ,  0.95736762,  1.43312356,  1.9416243 ,
          1.21044474,  1.2423883

In [12]:
R

matrix([[ 2.41676527,  0.        ,  1.26758591,  1.1230466 ,  0.        ,
          1.2931767 ,  0.        ,  0.        ,  0.        ,  1.02692373],
        [ 1.97129471,  0.57624726,  1.20759692,  1.53128798,  1.12748361,
          0.52017784,  0.        ,  1.51302139,  0.        ,  0.56939223],
        [ 1.85256634,  0.33670726,  0.        ,  0.        ,  0.        ,
          0.        ,  0.96780344,  0.        ,  1.58691703,  0.89808646],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  1.10512916,  0.        ],
        [ 0.        ,  0.79859388,  0.        ,  1.06346781,  1.12296344,
          0.        ,  0.87682374,  1.02740289,  0.95217217,  0.55847075],
        [ 0.        ,  0.88418929,  0.        ,  0.9567743 ,  1.16479105,
          0.98222017,  1.02440167,  0.        ,  0.        ,  0.73201905],
        [ 2.208801  ,  0.        ,  0.94611857,  1.42963705,  1.95976202,
          0.        ,  0.       

In [26]:
#spark.stop()