In [0]:
# MovieLens archives published by GroupLens. Use HTTPS so the zip files, which are
# later extracted to disk, cannot be tampered with in transit.
complete_dataset_url = 'https://files.grouplens.org/datasets/movielens/ml-latest.zip'
small_dataset_url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

In [0]:
import os
# os.mkdir('datasets')

In [0]:

# Folder that holds the downloaded archives and their extracted contents.
datasets_path = os.path.join('..', 'content/datasets')

# Local file names for the complete and the small archive, in that order.
complete_dataset_path, small_dataset_path = (
    os.path.join(datasets_path, archive_name)
    for archive_name in ('ml-latest.zip', 'ml-latest-small.zip')
)

In [0]:
import urllib.request

# urlretrieve does not create missing parent directories, so make sure the target
# folder exists before downloading (a no-op when it is already there).
os.makedirs(datasets_path, exist_ok=True)

# Each call returns a (local filename, response headers) tuple.
small_f = urllib.request.urlretrieve(small_dataset_url, small_dataset_path)
complete_f = urllib.request.urlretrieve(complete_dataset_url, complete_dataset_path)

In [0]:
# Unpack both downloaded archives into the datasets folder

import zipfile

for archive_path in (small_dataset_path, complete_dataset_path):
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(datasets_path)

 
**Data Cleaning**
1.   For each line in the ratings dataset, we create a tuple of (UserID, MovieID, Rating). We drop the timestamp because we do not need it for this recommender.
2.  For each line in the movies dataset, we create a tuple of (MovieID, Title). We drop the genres because we do not use them for this recommender.



In [0]:
# !pip3 install pyspark

In [0]:
from pyspark import SparkContext

# Reuse the running context if there is one: calling SparkContext() a second time in
# the same kernel (e.g. when this cell is re-run) raises a ValueError.
sc = SparkContext.getOrCreate()

In [0]:
# Load the raw ratings file as an RDD of text lines and capture its first line (the
# CSV header) so that it can be filtered out when the lines are parsed.

small_ratings_file = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')

small_ratings_raw_data = sc.textFile(small_ratings_file)
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]

In [0]:
# Parse the raw data into a new RDD of (first, second, third) CSV-field tuples

def _ratings_fields(line):
    """Split one CSV line on commas and keep its first three fields (as strings)."""
    fields = line.split(",")
    return (fields[0], fields[1], fields[2])


small_ratings_data = (
    small_ratings_raw_data
    .filter(lambda row: row != small_ratings_raw_data_header)  # drop the header line
    .map(_ratings_fields)
    .cache()
)

In [8]:
# Sanity check: show the first three parsed tuples (all fields are still strings)

small_ratings_data.take(3)

[('1', '1', '4.0'), ('1', '3', '4.0'), ('1', '6', '4.0')]

In [9]:
# Perform the same load-and-parse steps for the movies.csv file, keeping only the
# first two columns of each row.
import csv

small_movies_file = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')

small_movies_raw_data = sc.textFile(small_movies_file)
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]


def _parse_movie_line(line):
    """Return the first two CSV fields of one movies.csv line as a tuple of strings.

    The line is parsed with ``csv.reader`` rather than ``line.split(",")`` because a
    title that itself contains a comma is wrapped in double quotes in the file
    (e.g. ``"Some Title, The (1995)"``); a plain split would cut such a title at its
    first comma and leave the opening quote in the result.
    """
    fields = next(csv.reader([line]))
    return (fields[0], fields[1])


small_movies_data = small_movies_raw_data.filter(lambda line: line != small_movies_raw_data_header)\
    .map(_parse_movie_line).cache()

small_movies_data.take(3)

[('1', 'Toy Story (1995)'),
 ('2', 'Jumanji (1995)'),
 ('3', 'Grumpier Old Men (1995)')]

**Collaborative Filtering**

Spark MLlib library for Machine Learning provides a Collaborative Filtering implementation by using Alternating Least Squares (ALS).

The implementation in MLlib has the following parameters: 

*   `numBlocks` = no. of blocks used to parallelize computation
*   `rank` = no. of latent factors in the model
*   `iterations` = no. of iterations to run
*   `lambda` = regularization parameter in ALS
*    `implicitPrefs` =  whether to use the explicit feedback ALS variant or one adapted for implicit feedback data





In [0]:
# Split the parsed ratings into training, validation and test RDDs using weights
# 6 : 2 : 2; the fixed seed keeps the split reproducible between runs.
training_RDD, validation_RDD, test_RDD = small_ratings_data.randomSplit([6, 2, 2], seed=42)


In [0]:
def _first_two_fields(record):
    """Keep only the first two fields of a record, dropping the rest."""
    return (record[0], record[1])


# Inputs for prediction: the same records as the validation/test splits, minus the rating.
validation_for_predict_RDD = validation_RDD.map(_first_two_fields)
test_for_predict_RDD = test_RDD.map(_first_two_fields)

In [0]:
# import os       #importing os to set environment variable
# def install_java():
#   !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
#   os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
#   !java -version       #check java version
# install_java()

In [0]:
# !sudo apt-get purge openjdk-\* icedtea-\* icedtea6-\*
# !sudo apt autoremove
# !sudo apt install openjdk-8-jre-headless


In [0]:
# !java -version

In [15]:
# Training phase: fit one ALS model per candidate rank and keep the rank whose
# predictions have the lowest RMSE on the validation split.

from pyspark.mllib.recommendation import ALS
import math

seed = 43
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
tolerance = 0.02  # not used in this cell; kept in case later cells read it

min_error = float('inf')
best_rank = -1
best_iteration = -1  # not used in this cell; kept in case later cells read it

# One RMSE per entry of `ranks`, in the same order. Built with append so that the
# list always matches `ranks`, however many candidate ranks are configured.
errors = []

# Actual validation ratings keyed by (int user, int movie) so they can be joined with
# the predictions. This does not depend on the rank, so it is built once.
validation_ratings = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))

for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    # Re-key the predictions as ((user, movie), predicted rating).
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    # ((user, movie), (actual rating, predicted rating))
    rates_and_preds = validation_ratings.join(predictions)
    # Root-mean-square error between actual and predicted ratings.
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors.append(error)
    print ('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print ('The best model was trained with rank %s' % best_rank)

For rank 4 the RMSE is 0.9064372777135662
For rank 8 the RMSE is 0.9091906850626293
For rank 12 the RMSE is 0.9140608348862169
The best model was trained with rank 4


In [17]:
# Each element is ((UserID, MovieID), predicted rating). `predictions` is reassigned on
# every pass of the rank loop, so these values come from the last rank that was trained.
predictions.take(3)

[((140, 1084), 3.231818212336625),
 ((74, 1084), 3.987631471353916),
 ((402, 1084), 3.7486550840147554)]

In [19]:
# Joining the validation ratings with the predictions on (UserID, MovieID) yields
# ((UserID, MovieID), (actual rating, predicted rating)).
rates_and_preds.take(3)

[((1, 553), (5.0, 4.781500981595887)),
 ((1, 673), (3.0, 2.7816958851413034)),
 ((1, 1025), (5.0, 4.126055553786673))]