# A Movie Recommendation Engine

### Source:  https://www.codementor.io/@jadianes/building-a-recommender-with-apache-spark-python-example-app-part1-du1083qbw

### Create a SpartContext configured for local mode

In [2]:
import pyspark
sc = pyspark.SparkContext('local[*]')

### File download


Small: 100,000 ratings and 2,488 tag applications applied to 8,570 movies by 706 users. Last updated 4/2015.  
Full: 21,000,000 ratings and 470,000 tag applications applied to 27,000 movies by 230,000 users. Last updated 4/2015.

In [11]:
complete_dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest.zip'
small_dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

### Download locations

In [12]:
import os

datasets_path = os.path.join('..', 'datasets')

complete_dataset_path = os.path.join(datasets_path, 'ml-latest.zip')
small_dataset_path = os.path.join(datasets_path, 'ml-latest-small.zip')

### Getting Files

In [15]:
import urllib.request

small_f = urllib.request.urlretrieve (small_dataset_url, small_dataset_path)
complete_f = urllib.request.urlretrieve (complete_dataset_url, complete_dataset_path)

### Extracting Files

In [16]:
import zipfile

with zipfile.ZipFile(small_dataset_path, "r") as z:
    z.extractall(datasets_path)

with zipfile.ZipFile(complete_dataset_path, "r") as z:
    z.extractall(datasets_path)

### Loading and Parsing datasests

In [18]:
small_ratings_file = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')

small_ratings_raw_data = sc.textFile(small_ratings_file)
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]

In [19]:
small_ratings_data = small_ratings_raw_data.filter(lambda line: line!=small_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1],tokens[2])).cache()

In [20]:
small_ratings_data.take(3)


[('1', '1', '4.0'), ('1', '3', '4.0'), ('1', '6', '4.0')]

In [21]:
small_movies_file = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')

small_movies_raw_data = sc.textFile(small_movies_file)
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]

small_movies_data = small_movies_raw_data.filter(lambda line: line!=small_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1])).cache()
    
small_movies_data.take(3)

[('1', 'Toy Story (1995)'),
 ('2', 'Jumanji (1995)'),
 ('3', 'Grumpier Old Men (1995)')]

## Collaborative Filtering

### Selecting ALS parameters using the small dataset

In [23]:
training_RDD, validation_RDD, test_RDD = small_ratings_data.randomSplit([6, 2, 2], seed=0)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

### Training Phase

In [25]:
from pyspark.mllib.recommendation import ALS
import math

seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print ('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print ('The best model was trained with rank %s' % best_rank)

For rank 4 the RMSE is 0.908078105265682
For rank 8 the RMSE is 0.916462973348527
For rank 12 the RMSE is 0.917665030756129
The best model was trained with rank 4


In [26]:
predictions.take(3)


[((372, 1084), 3.42419871162954),
 ((4, 1084), 3.866749726695713),
 ((402, 1084), 3.4099577968422152)]

In [27]:
rates_and_preds.take(3)


[((1, 457), (5.0, 4.381060760461434)),
 ((1, 1025), (5.0, 4.705295366590298)),
 ((1, 1089), (5.0, 4.979982471805129))]

In [29]:
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
print ('For testing data the RMSE is %s' % (error))

For testing data the RMSE is 0.9113780946334407


### Using the complete dataset to build the final model

In [30]:
# Load the complete dataset file
complete_ratings_file = os.path.join(datasets_path, 'ml-latest', 'ratings.csv')
complete_ratings_raw_data = sc.textFile(complete_ratings_file)
complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0]

# Parse
complete_ratings_data = complete_ratings_raw_data.filter(lambda line: line!=complete_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),float(tokens[2]))).cache()
    
print ("There are %s recommendations in the complete dataset" % (complete_ratings_data.count()))

There are 33832162 recommendations in the complete dataset


In [31]:
training_RDD, test_RDD = complete_ratings_data.randomSplit([7, 3], seed=0)

complete_model = ALS.train(training_RDD, best_rank, seed=seed, 
                           iterations=iterations, lambda_=regularization_parameter)

In [32]:
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

predictions = complete_model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
print ('For testing data the RMSE is %s' % (error))

For testing data the RMSE is 0.8257054095972955


### Making Recommendations

In [34]:
complete_movies_file = os.path.join(datasets_path, 'ml-latest', 'movies.csv')
complete_movies_raw_data = sc.textFile(complete_movies_file)
complete_movies_raw_data_header = complete_movies_raw_data.take(1)[0]

# Parse
complete_movies_data = complete_movies_raw_data.filter(lambda line: line!=complete_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),tokens[1],tokens[2])).cache()

complete_movies_titles = complete_movies_data.map(lambda x: (int(x[0]),x[1]))
    
print ("There are %s movies in the complete dataset" % (complete_movies_titles.count()))


There are 86537 movies in the complete dataset


#### Count number of ratings per movie

In [35]:
def get_counts_and_averages(ID_and_ratings_tuple):
    nratings = len(ID_and_ratings_tuple[1])
    return ID_and_ratings_tuple[0], (nratings, float(sum(x for x in ID_and_ratings_tuple[1]))/nratings)

movie_ID_with_ratings_RDD = (complete_ratings_data.map(lambda x: (x[1], x[2])).groupByKey())
movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(get_counts_and_averages)
movie_rating_counts_RDD = movie_ID_with_avg_ratings_RDD.map(lambda x: (x[0], x[1][0]))

### Adding new user ratings

In [63]:
new_user_ID = 0

# The format of each line is (userID, movieID, rating)
# new_user_ratings = [
#      (0,260,4), # Star Wars (1977)
#      (0,1,3), # Toy Story (1995)
#      (0,16,3), # Casino (1995)
#      (0,25,4), # Leaving Las Vegas (1995)
#      (0,32,4), # Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
#      (0,335,1), # Flintstones, The (1994)
#      (0,379,1), # Timecop (1994)
#      (0,296,3), # Pulp Fiction (1994)
#      (0,858,5) , # Godfather, The (1972)
#      (0,50,4) # Usual Suspects, The (1995)
#     ]

# Scenario 1 
# new_user_ratings = [
#      (0,260,5), # Star Wars (1977)
#      (0,1036,3), # Die Hard (1988)
#      (0,1040,3), # Secret Agent, The (1996)
#      (0,1197,2), # Princess Bride, The (1987)
#      (0,1221,3), # Godfather: Part II, The (1974)
#      (0,1240,4), # Terminator, The (1984)
#      (0,1293,5), # Gandhi (1982)
#      (0,1307,3), # When Harry Met Sally... (1989)
#      (0,1371,5) , # Star Trek: The Motion Picture (1979)
#      (0,1499,4) # Anaconda (1997)
#     ]

# Scenario 2
new_user_ratings = [
     (0,1721,4), # Titanic (1997)
     (0,1580,4), # Men in Black (a.k.a. MIB) (1997)
     (0,3593,1), # Battlefield Earth (2000)
     (0,4896,5), # Harry Potter and the Sorcerer's Stone
     (0,209061,5), # Sherlock Holmes: The Last Vampyre (1993)
     (0,109487,4), # Interstellar (2014)
     (0,588,3), # Aladdin (1992)
     (0,586,3), # Home Alone (1990)
     (0,74530,5) , # Percy Jackson & the Olympians
     (0,76056,2) # New York (2009)
    ]

new_user_ratings_RDD = sc.parallelize(new_user_ratings)
print ('New user ratings: %s' % new_user_ratings_RDD.take(10))

New user ratings: [(0, 1721, 4), (0, 1580, 4), (0, 3593, 1), (0, 4896, 5), (0, 209061, 5), (0, 109487, 4), (0, 588, 3), (0, 586, 3), (0, 74530, 5), (0, 76056, 2)]


In [64]:
complete_data_with_new_ratings_RDD = complete_ratings_data.union(new_user_ratings_RDD)


In [65]:
from time import time

t0 = time()
new_ratings_model = ALS.train(complete_data_with_new_ratings_RDD, best_rank, seed=seed, 
                              iterations=iterations, lambda_=regularization_parameter)
tt = time() - t0

print ("New model trained in %s seconds" % round(tt,3))

New model trained in 270.627 seconds


### Getting Top recommendations

In [66]:
new_user_ratings_ids = map(lambda x: x[1], new_user_ratings) # get just movie IDs
# keep just those not on the ID list (thanks Lei Li for spotting the error!)
new_user_unrated_movies_RDD = (complete_movies_data.filter(lambda x: x[0] not in new_user_ratings_ids).map(lambda x: (new_user_ID, x[0])))

# Use the input RDD, new_user_unrated_movies_RDD, with new_ratings_model.predictAll() to predict new ratings for the movies
new_user_recommendations_RDD = new_ratings_model.predictAll(new_user_unrated_movies_RDD)

In [67]:
# Transform new_user_recommendations_RDD into pairs of the form (Movie ID, Predicted Rating)
new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(lambda x: (x.product, x.rating))
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_RDD.join(complete_movies_titles).join(movie_rating_counts_RDD)
new_user_recommendations_rating_title_and_count_RDD.take(3)

[(257805, ((3.2340933988270777, 'Best Sellers (2021)'), 32)),
 (154530, ((0.2071116570099416, 'Recto / verso'), 2)),
 (71910, ((3.020696128450151, '"Tournament'), 268))]

In [68]:
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))

### Filtering out movies with less than 25 ratings

In [69]:
top_movies = new_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2]>=25).takeOrdered(15, key=lambda x: -x[1])

print ('TOP recommended movies (with more than 25 reviews):\n%s' %
        '\n'.join(map(str, top_movies)))

TOP recommended movies (with more than 25 reviews):
('Pollyanna (2003)', 4.805593947106426, 33)
('The Bible (2013)', 4.7670745731646385, 31)
('Shaadi Mein Zaroor Aana (2017)', 4.700950824647783, 26)
('Love is God (2003)', 4.618226139505859, 27)
('Den radio (2001)', 4.607191996406179, 38)
('Rent: Filmed Live on Broadway (2008)', 4.602257329767031, 41)
('Border (1997)', 4.595495226646453, 38)
('The Butterfly Circus (2009)', 4.573118339696792, 44)
('North & South (2004)', 4.543987827272972, 485)
('Us Again (2021)', 4.530712454637214, 37)
('Iron Jawed Angels (2004)', 4.521868949143901, 50)
('I Can Only Imagine (2018)', 4.508390072967761, 81)
('Mower Minions (2016)', 4.508144195710038, 42)
('Trevor Noah: Son of Patricia (2018)', 4.506821801933262, 69)
('Drishyam 2 (2021)', 4.5052843366990825, 34)


### Filtering out movies with less than 100 ratings


In [70]:
top_movies = new_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2]>=100).takeOrdered(15, key=lambda x: -x[1])

print ('TOP recommended movies (with more than 100 reviews):\n%s' %
        '\n'.join(map(str, top_movies)))

TOP recommended movies (with more than 100 reviews):
('North & South (2004)', 4.543987827272972, 485)
('Pride and Prejudice (1995)', 4.484005894475069, 3229)
('War Room (2015)', 4.458380950141752, 108)
('The Biggest Little Farm (2018)', 4.451077478196729, 121)
('Gifted Hands: The Ben Carson Story (2009)', 4.443992178835147, 153)
('Hamilton (2020)', 4.427148150887166, 2094)
('Sense & Sensibility (2008)', 4.408884334165066, 101)
('Hidden Figures (2016)', 4.40304559228319, 5878)
('Harry Potter and the Deathly Hallows: Part 2 (2011)', 4.361168749350742, 20837)
('The Hundred-Foot Journey (2014)', 4.352469671627779, 959)
('Spider-Man: Across the Spider-Verse (2023)', 4.344597143128013, 528)
('Piper (2016)', 4.3297060915042085, 2007)
('Fireproof (2008)', 4.327696953171022, 264)
('Harry Potter and the Deathly Hallows: Part 1 (2010)', 4.325495693359052, 21781)
('"Sound of Music', 4.32045234753871, 19092)


### Getting individual ratings

In [45]:
my_movie = sc.parallelize([(0, 500)]) # Quiz Show (1994)
individual_movie_rating_RDD = new_ratings_model.predictAll(new_user_unrated_movies_RDD)
individual_movie_rating_RDD.take(1)

[Rating(user=0, product=206085, rating=2.4763291156674185)]

### User 1 - Scenario 1

TOP recommended movies (with more than 25 reviews):
('Detective Conan: The Last Wizard of the Century (1999)', 4.633993246282561, 34)
('Doctor Who: The Waters of Mars (2009)', 4.458570909746342, 44)
('Pandora (2016)', 4.4247919138691145, 46)
('Detective Conan: The Fourteenth Target (1998)', 4.3731528473059065, 32)
('Doctor Who: Voyage of the Damned (2007)', 4.356283708898584, 29)
('Louis C.K.: Sorry (2021)', 4.338982219498405, 39)
("Sharpe's Sword (1995)", 4.3299668217185605, 27)
('One Piece Film: GOLD (2016)', 4.326239213581131, 42)
('Avengers: Infinity War - Part II (2019)', 4.314938214509443, 12845)
('Avengers: Infinity War - Part I (2018)', 4.313439277912256, 16164)
("KonoSuba: God's Blessing on this Wonderful World! Legend of Crimson (2019)", 4.3104578938773095, 51)
('What a Beautiful Day (2011)', 4.285629586377734, 31)
('Like Minds (2006)', 4.274245695172404, 35)
('Pope Joan (Die Päpstin) (2009)', 4.267095285167905, 39)
('Indictment: The McMartin Trial (1995)', 4.266934955634852, 29)

### User 1 - Scenario 2


TOP recommended movies (with more than 100 reviews):
('Avengers: Infinity War - Part II (2019)', 4.314938214509443, 12845)
('Avengers: Infinity War - Part I (2018)', 4.313439277912256, 16164)
('"Avengers', 4.265181848406174, 27495)
('Iron Man (2008)', 4.259699045249739, 38308)
('The Lost Room (2006)', 4.2458191200893545, 445)
('The Matrix Revisited (2001)', 4.243257013533455, 160)
('"Dark Knight', 4.241962245395641, 65349)
('Death Note: R2 - L o Tsugu Mono (2008)', 4.238266855082774, 104)
('"Dark Knight Rises', 4.230359727598167, 31704)
('Gladiator (2000)', 4.229830562371291, 60749)
('"Rock', 4.227283772666121, 40412)
('Gladiator (1992)', 4.212721830277712, 3875)
('Band of Brothers (2001)', 4.211012921375094, 2835)
('Law Abiding Citizen (2009)', 4.200478496684738, 4192)
('Firefly (2002)', 4.195350066534203, 895)

### User 2 - Scenario 1

TOP recommended movies (with more than 25 reviews):
('Pollyanna (2003)', 4.805593947106426, 33)
('The Bible (2013)', 4.7670745731646385, 31)
('Shaadi Mein Zaroor Aana (2017)', 4.700950824647783, 26)
('Love is God (2003)', 4.618226139505859, 27)
('Den radio (2001)', 4.607191996406179, 38)
('Rent: Filmed Live on Broadway (2008)', 4.602257329767031, 41)
('Border (1997)', 4.595495226646453, 38)
('The Butterfly Circus (2009)', 4.573118339696792, 44)
('North & South (2004)', 4.543987827272972, 485)
('Us Again (2021)', 4.530712454637214, 37)
('Iron Jawed Angels (2004)', 4.521868949143901, 50)
('I Can Only Imagine (2018)', 4.508390072967761, 81)
('Mower Minions (2016)', 4.508144195710038, 42)
('Trevor Noah: Son of Patricia (2018)', 4.506821801933262, 69)
('Drishyam 2 (2021)', 4.5052843366990825, 34)

### User 2 - Scenario 2

TOP recommended movies (with more than 100 reviews):
('North & South (2004)', 4.543987827272972, 485)
('Pride and Prejudice (1995)', 4.484005894475069, 3229)
('War Room (2015)', 4.458380950141752, 108)
('The Biggest Little Farm (2018)', 4.451077478196729, 121)
('Gifted Hands: The Ben Carson Story (2009)', 4.443992178835147, 153)
('Hamilton (2020)', 4.427148150887166, 2094)
('Sense & Sensibility (2008)', 4.408884334165066, 101)
('Hidden Figures (2016)', 4.40304559228319, 5878)
('Harry Potter and the Deathly Hallows: Part 2 (2011)', 4.361168749350742, 20837)
('The Hundred-Foot Journey (2014)', 4.352469671627779, 959)
('Spider-Man: Across the Spider-Verse (2023)', 4.344597143128013, 528)
('Piper (2016)', 4.3297060915042085, 2007)
('Fireproof (2008)', 4.327696953171022, 264)
('Harry Potter and the Deathly Hallows: Part 1 (2010)', 4.325495693359052, 21781)
('"Sound of Music', 4.32045234753871, 19092)