# Data Loading using SparkContext

In [1]:
import pyspark as ps
sc = ps.SparkContext('local[*]')
import json

fields = ['product_id', 'user_id', 'review', 'profile_name', 'helpfulness', 'score', 'time']

def validate(x):
    for a in fields:
        if a not in x:
            return False
    return True

review = sc.textFile('movies.json').map(lambda x: json.loads(x)).filter(validate)

24/10/27 15:28:11 WARN Utils: Your hostname, DESKTOP-2J74AJH resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/10/27 15:28:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/27 15:28:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/27 15:28:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Number of movies, users and enteries

In [2]:
print(f"Number of movies = {review.map(lambda x: x['product_id']).distinct().count()}")
print(f"Number of users = {review.map(lambda x: x['user_id']).distinct().count()}")
print(f"Number of enteries = {review.count()}")

                                                                                

Number of movies = 1539


                                                                                

Number of users = 36409


[Stage 4:>                                                          (0 + 2) / 2]

Number of enteries = 50000


                                                                                

# Using ALS

In [3]:
from pyspark.mllib.recommendation import ALS
import hashlib

def get_hash(x):
    return int(hashlib.sha1(x).hexdigest(), 16) % (10 ** 8)

rating = review.map(lambda x: (get_hash(x['user_id'].encode('utf-8')), get_hash(x['product_id'].encode('utf-8')), int(x['score'])))
train = rating.filter(lambda x: ((x[0] + x[1]) % 10) >= 2)
test = rating.filter(lambda x: ((x[0] + x[1]) % 10) < 2)
# no. of training samples
print(train.count())
# no. of testing samples
print(test.count())

39992
10008


In [4]:
# building model
rank = 10
numiterations = 10
model = ALS.train(train, rank, numiterations)
format_test = test.map(lambda x: (int(x[0]), int(x[1])))
predict = model.predictAll(format_test).map(lambda x: ((int(x[0]), int(x[1])), int(x[2])))
true_and_predict = test.map(lambda x: ((int(x[0]), int(x[1])), int(x[2]))).join(predict)
mse = true_and_predict.map(lambda x: (((int(x[1][0]) - int(x[1][1])) ** 2), 1)).reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
print(mse[0] / mse[1])

24/10/27 15:28:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/27 15:28:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK

15.952486801889414


                                                                                