In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating

In [3]:
def loadMovieNames(path="../SparkData/ml-100k/u.item"):
    '''Map movie ids to titles for ease of reading.

    Parameters
    ----------
    path : str, optional
        Location of the pipe-delimited movie metadata file. Defaults to the
        location this notebook has always read from.

    Returns
    -------
    dict
        ``{movie_id (int): title (str)}`` taken from the first two
        '|'-separated fields of every line.
    '''
    movieNames = {}
    # The MovieLens 100k u.item file is ISO-8859-1 text; relying on the
    # platform default encoding (UTF-8 on most systems) raises
    # UnicodeDecodeError on titles with accented characters.
    with open(path, encoding="ISO-8859-1") as f:
        for line in f:
            fields = line.rstrip("\n").split('|')
            # Skip blank or truncated lines instead of crashing on int('').
            if len(fields) < 2 or not fields[0].strip():
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [5]:
# Run Spark locally on every available CPU core (local[*]).
# The ALS recommender (the approach used for the Netflix competition) is
# integrated into the Spark library and is what this notebook trains below.
conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
# getOrCreate reuses an already-running context, so re-executing this cell in
# a live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [6]:
# Build the movie-id -> title lookup used when printing results below.
nameDict = loadMovieNames()

In [7]:
# Read the raw ratings file as an RDD of text lines.
# NOTE(review): this path starts with "../../" while loadMovieNames reads from
# "../SparkData/..." — confirm both point at the same dataset directory.
data = sc.textFile("../../SparkData/ml-100k/u.data")

In [8]:
# Turn each whitespace-separated line into MLlib's built-in Rating record
# (first three fields: user, movie, rating) and cache the resulting RDD so it
# is kept around for reuse by later cells.
ratings = (
    data
    .map(lambda line: line.split())
    .map(lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2])))
    .cache()
)

In [23]:
# Build the recommendation system.
rank = 10            # number of latent factors
numIterations = 20   # ALS iterations
# ALS starts from random factors; fixing the seed makes the trained model
# (and therefore the recommendations printed below) reproducible on re-run.
seed = 42
model = ALS.train(ratings, rank, numIterations, seed=seed)

In [24]:
# Fake test user (id 0):
#   - rated 'Star Wars' and 'Empire Strikes Back' with 5 stars
#   - rated 'Gone With the Wind' with 1 star
# so we should expect action-type movies to be returned as recommendations.
userID = 0

In [25]:
# Keep just the ratings submitted by the designated user (id 0).
userRatings = ratings.filter(lambda entry: entry.user == userID)

In [26]:
# Show every movie the user rated, by title, next to the rating given.
for entry in userRatings.collect():
    title = nameDict[int(entry.product)]
    stars = str(entry.rating)
    print("{}: {}".format(title, stars))

Star Wars (1977): 5.0
Empire Strikes Back, The (1980): 5.0
Gone with the Wind (1939): 1.0


In [27]:
# Ask the trained model for 10 recommended movies for this user.
recommendations = model.recommendProducts(userID, 10)

In [28]:
# Print each recommended title together with its predicted score.
for rec in recommendations:
    title = nameDict[int(rec.product)]
    score = str(rec.rating)
    print("{} score {}".format(title, score))

Faithful (1996) score 6.210060945564155
Lost in Space (1998) score 5.826830531948083
Harlem (1993) score 5.801021834001693
Bitter Moon (1992) score 5.695843085714616
Meet John Doe (1941) score 5.6166306455046096
Love in the Afternoon (1957) score 5.611342191235198
Chungking Express (1994) score 5.553973257984557
Alphaville (1965) score 5.376228333860043
Halloween: The Curse of Michael Myers (1995) score 5.352676818241392
Bhaji on the Beach (1993) score 5.275847486211784


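The results are dubious at best: several predicted scores exceed the 5-star maximum, and the recommended titles are not the action movies we expected. Further parameter tuning, training, and testing are necessary.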