---------------------------
# Spark Lab Session - Exercices with Spark
---------------------------
*Thomas KOCH*

## 1. Opening Spark

In [1]:
import pyspark
import random

sc = pyspark.SparkContext(appName="Spark Lab Session")

## 2. First steps with Spark

### 2.1 First RDD

In [2]:
l = list(range(3000))
rddl = sc.parallelize(l)
rddl.take(20)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

### 2.2 Computing the sum of cubes

In [3]:
rddc = rddl.map(lambda x : x*x*x)
rddc.take(10)

[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

In [4]:
rddc.sum()

20236502250000

Remarque : on pouvait aussi faire un reduce avant de sommer, mais cela ne servait à rien ici (aspect pédagogique mis à part).

In [5]:
rddc.reduce(lambda x, y : x+y) # ce reduce ne fait rien ici
rddc.take(10)

[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

### 2.3 Last digits of elements in C

In [6]:
# Première étape : calculer la liste contenant
# les derniers chiffres
lastDigits = rddc.map(lambda x: x%10)
lastDigits.take(10)

[0, 1, 8, 7, 4, 5, 6, 3, 2, 9]

In [7]:
# Deuxième étape : compter combien de fois chaque
# items apparait 
countLastDigits = lastDigits.map(lambda x:(x,1)).reduceByKey(lambda x,y: x+y)
countLastDigits.take(20)

[(0, 300),
 (8, 300),
 (4, 300),
 (1, 300),
 (5, 300),
 (9, 300),
 (6, 300),
 (2, 300),
 (7, 300),
 (3, 300)]

In [8]:
# On peut aussi directement faire 
rddc.map(lambda x:(x%10,1)).reduceByKey(lambda x, y : x+y).collect()

[(0, 300),
 (8, 300),
 (4, 300),
 (1, 300),
 (5, 300),
 (9, 300),
 (6, 300),
 (2, 300),
 (7, 300),
 (3, 300)]

### 2.4 Digits of C

In [9]:
rddc.flatMap(lambda x : [(e,1) for e in str(x)]).reduceByKey(lambda x, y : x+y).collect()

[('4', 2762),
 ('7', 2787),
 ('6', 2713),
 ('3', 2814),
 ('0', 3127),
 ('1', 3667),
 ('8', 2639),
 ('9', 2521),
 ('2', 3294),
 ('5', 2653)]

En décomposant :

In [10]:
rddc.flatMap(lambda x : [(e,1) for e in str(x)]).collect()

[('0', 1),
 ('1', 1),
 ('8', 1),
 ('2', 1),
 ('7', 1),
 ('6', 1),
 ('4', 1),
 ('1', 1),
 ('2', 1),
 ('5', 1),
 ('2', 1),
 ('1', 1),
 ('6', 1),
 ('3', 1),
 ('4', 1),
 ('3', 1),
 ('5', 1),
 ('1', 1),
 ('2', 1),
 ('7', 1),
 ('2', 1),
 ('9', 1),
 ('1', 1),
 ('0', 1),
 ('0', 1),
 ('0', 1),
 ('1', 1),
 ('3', 1),
 ('3', 1),
 ('1', 1),
 ('1', 1),
 ('7', 1),
 ('2', 1),
 ('8', 1),
 ('2', 1),
 ('1', 1),
 ('9', 1),
 ('7', 1),
 ('2', 1),
 ('7', 1),
 ('4', 1),
 ('4', 1),
 ('3', 1),
 ('3', 1),
 ('7', 1),
 ('5', 1),
 ('4', 1),
 ('0', 1),
 ('9', 1),
 ('6', 1),
 ('4', 1),
 ('9', 1),
 ('1', 1),
 ('3', 1),
 ('5', 1),
 ('8', 1),
 ('3', 1),
 ('2', 1),
 ('6', 1),
 ('8', 1),
 ('5', 1),
 ('9', 1),
 ('8', 1),
 ('0', 1),
 ('0', 1),
 ('0', 1),
 ('9', 1),
 ('2', 1),
 ('6', 1),
 ('1', 1),
 ('1', 1),
 ('0', 1),
 ('6', 1),
 ('4', 1),
 ('8', 1),
 ('1', 1),
 ('2', 1),
 ('1', 1),
 ('6', 1),
 ('7', 1),
 ('1', 1),
 ('3', 1),
 ('8', 1),
 ('2', 1),
 ('4', 1),
 ('1', 1),
 ('5', 1),
 ('6', 1),
 ('2', 1),
 ('5', 1),
 ('1', 1),

## 3. Approximating $\pi$

To compute the value of $\pi$, you will generate the list of all pairs $(x,y)$ of integers from 0 to K. Then you will compute the number of such pairs such that $(2x+1)^2+(2y+1)^2$ is less then $(2*K)^2$. The ratio between the number of such pairs and the number of total pairs is an approximation of $\pi$. For $K=3000$ you should obtain a value close to 3.14159.

### 3.1 Step 1 : computing set of pairs


### 3.2 Step 2 : counting the pairs

### 3.3 Step 3 : counting the approximaton

In [11]:
K=3000
intUpToK = sc.parallelize(range(K))
pairs = intUpToK.cartesian(intUpToK)
nbTotal = pairs.count()

def isOk(v):
    x,y = v
    return (2*x+1)**2+(2*y+1)**2 <= 4*K*K

nbOk = pairs.filter(isOk).count()
print(4*float(nbOk)/nbTotal)
print(nbOk)

3.1415933333333332
7068585


## 4. Using the Movie Lens dataset
### 4.1 Getting the dataset
The dataset is in the ***ml-latest-small*** folder.
### 4.2 Getting the dataset into an RDD
To read the dataset we can use the following code :

In [12]:
import re

path_data = "/home/p5hngk/Downloads/GitHub/SD_701---Data_Mining/ml-latest-small"

In [13]:
ratingsFile = sc.textFile(path_data+"/ratings.csv")
moviesFile = sc.textFile(path_data+"/movies.csv")
moviesFile.take(5)

['movieId,title,genres',
 '1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy',
 '2,Jumanji (1995),Adventure|Children|Fantasy',
 '3,Grumpier Old Men (1995),Comedy|Romance',
 '4,Waiting to Exhale (1995),Comedy|Drama|Romance']

In [14]:
ratingsFile.take(5)

['userId,movieId,rating,timestamp',
 '1,1,4.0,964982703',
 '1,3,4.0,964981247',
 '1,6,4.0,964982224',
 '1,47,5.0,964983815']

### 4.3 Cleaning data
We're going to clean the data with the `parseCSV` function that we can define like this :

In [15]:
future_pattern = re.compile("""([^,"]+|"[^"]+")(?=,|$)""")

def parseCSV(line): 
     return future_pattern.findall(line)

So we can now use this function to clean our datas :

In [16]:
ratings = ratingsFile.map(parseCSV).filter(lambda x: x[0]!="userId") #on enlève la première ligne pour faciliter les calculs
ratings.take(5)

[['1', '1', '4.0', '964982703'],
 ['1', '3', '4.0', '964981247'],
 ['1', '6', '4.0', '964982224'],
 ['1', '47', '5.0', '964983815'],
 ['1', '50', '5.0', '964982931']]

In [17]:
movies = moviesFile.map(parseCSV).filter(lambda x : x[0]!="movieId") #on enlève la première ligne
movies.take(5)

[['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy'],
 ['2', 'Jumanji (1995)', 'Adventure|Children|Fantasy'],
 ['3', 'Grumpier Old Men (1995)', 'Comedy|Romance'],
 ['4', 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance'],
 ['5', 'Father of the Bride Part II (1995)', 'Comedy']]

### 4.4 Compute the 10 best rated movies of all times
We want now to compute the 10 movies that have the best average rating by using `sortBy` and `take`.                      

We will first compute the average in the simplest way, that is : 
<center>
$\frac{sum(ratings)}{numberOfRatings}$
</center>

In [18]:
ratedMovie = ratings.map(lambda x : (x[1],(float(x[2]),1))).reduceByKey(lambda x,y : (x[0]+y[0],x[1]+y[1]))
ratedMovie.take(5)

[('1', (843.0, 215)),
 ('50', (864.5, 204)),
 ('70', (193.0, 55)),
 ('110', (955.5, 237)),
 ('157', (31.5, 11))]

In [19]:
ratedMovie1 = ratedMovie.mapValues(lambda x: x[0]/x[1]).sortBy(lambda x : x[1], ascending=False)
ratedMovie1.take(10)

[('6835', 5.0),
 ('1151', 5.0),
 ('1631', 5.0),
 ('102217', 5.0),
 ('27523', 5.0),
 ('53', 5.0),
 ('1140', 5.0),
 ('8238', 5.0),
 ('47736', 5.0),
 ('53355', 5.0)]

So we have here the top 10 ***movieId*** with the ***average rating***. Let's try to give the ***title*** for each ***movieId***.

### 4.5 Ordered list of movies with names

In [20]:
movieJoin = movies.map(lambda x: (x[0], x[1]))
movieJoin.take(5)

[('1', 'Toy Story (1995)'),
 ('2', 'Jumanji (1995)'),
 ('3', 'Grumpier Old Men (1995)'),
 ('4', 'Waiting to Exhale (1995)'),
 ('5', 'Father of the Bride Part II (1995)')]

In [21]:
movieWithAvg1 = movieJoin.join(ratedMovie1)
movieWithAvg1.take(5)

[('4', ('Waiting to Exhale (1995)', 2.357142857142857)),
 ('10', ('GoldenEye (1995)', 3.496212121212121)),
 ('12', ('Dracula: Dead and Loving It (1995)', 2.4210526315789473)),
 ('16', ('Casino (1995)', 3.926829268292683)),
 ('20', ('Money Train (1995)', 2.5))]

In [22]:
movieWithAvg1 = movieWithAvg1.map(lambda x : x[1])
movieWithAvg1.take(5)

[('Waiting to Exhale (1995)', 2.357142857142857),
 ('GoldenEye (1995)', 3.496212121212121),
 ('Dracula: Dead and Loving It (1995)', 2.4210526315789473),
 ('Casino (1995)', 3.926829268292683),
 ('Money Train (1995)', 2.5)]

In [23]:
top10Movies1 = movieWithAvg1.sortBy(lambda x : x[1], ascending=False).take(10)
top10Movies1

[('Lamerica (1994)', 5.0),
 ('What Happened Was... (1994)', 5.0),
 ('Denise Calls Up (1995)', 5.0),
 ('Lesson Faust (1994)', 5.0),
 ('"Sandpiper, The (1965)"', 5.0),
 ('My Man Godfrey (1957)', 5.0),
 ('Black Tar Heroin: The Dark End of the Street (2000)', 5.0),
 ('Slumber Party Massacre II (1987)', 5.0),
 ('Moscow Does Not Believe in Tears (Moskva slezam ne verit) (1979)', 5.0),
 ('Cherish (2002)', 5.0)]

We can here understand that many movies have a five stars average. So **the rule we have used to compute de average rating is not so good**. 

### 4.6 Better ordered list
Let's try to compute it in an other way : 
<center>
$\frac{sum(ratings)}{1+numberOfRatings}$
</center>

In this way, we decided to put one 0 vote to every movies.

In [24]:
ratedMovie2 = ratedMovie.mapValues(lambda x: (x[0])/(1+x[1]))

movieWithAvg2 = movieJoin.join(ratedMovie2).map(lambda x: x[1])

top10movies2 = movieWithAvg2.sortBy(lambda x: x[1], ascending=False).take(10)
top10movies2

[('"Shawshank Redemption, The (1994)"', 4.415094339622642),
 ('"Godfather, The (1972)"', 4.266839378238342),
 ('"Streetcar Named Desire, A (1951)"', 4.261904761904762),
 ('Fight Club (1999)', 4.2534246575342465),
 ('"Godfather: Part II, The (1974)"', 4.226923076923077),
 ('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
  4.224489795918367),
 ('"Three Billboards Outside Ebbing, Missouri (2017)"', 4.222222222222222),
 ('"Usual Suspects, The (1995)"', 4.217073170731707),
 ('Goodfellas (1990)', 4.216535433070866),
 ('Star Wars: Episode IV - A New Hope (1977)', 4.214285714285714)]

The second rule is a little bit better, but we can improve the model by using a new rule. The idea is to put a weight for movies that have a great number of rating compared to those wo haven't many ratings : 
<center>
$\frac{sum(ratings)}{1+numberOfRatings}.log(numberOfRatings)$
</center>

In [25]:
import math

ratedMovie3 = ratedMovie.mapValues(lambda x: (x[0]/x[1])*math.log(x[1]))

movieWithAvg3 = movieJoin.join(ratedMovie3).map(lambda x: x[1])

top10movies3 = movieWithAvg3.sortBy(lambda x: x[1], ascending=False).take(10)
top10movies3

[('"Shawshank Redemption, The (1994)"', 25.506303124680446),
 ('Forrest Gump (1994)', 24.135559630846686),
 ('Pulp Fiction (1994)', 24.035971735394476),
 ('"Matrix, The (1999)"', 23.593497870526754),
 ('"Silence of the Lambs, The (1991)"', 23.43310709209536),
 ('Star Wars: Episode IV - A New Hope (1977)', 23.37860964684444),
 ('Fight Club (1999)', 23.007601610036865),
 ("Schindler's List (1993)", 22.788076383338726),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)', 22.561506207236786),
 ('"Godfather, The (1972)"', 22.549726244087907)]

## 5. Movie recommendation

Our goal here is to make recommandation for **userId number 1**.  

To start with movie recommendation we're going to compute the set of movies that user 1 rated.

In [26]:
moviesUser1 = ratings.filter(lambda x: x[0]=="1") \
                        .map(lambda x : (x[1], x[2])) \
                        .join(movies) \
                        .map(lambda x: (x[1][0],x[1][1])) \
                        .sortBy(lambda x: x[0])
for el in moviesUser1.collect() :
    print(el)
print("\n User 1 has rated {} movies".format(moviesUser1.count()))

('1.0', '"Talented Mr. Ripley, The (1999)"')
('2.0', 'I Still Know What You Did Last Summer (1998)')
('2.0', 'Psycho (1998)')
('2.0', 'Psycho (1960)')
('2.0', 'Toys (1992)')
('2.0', '"Mummy, The (1999)"')
('3.0', 'From Dusk Till Dawn (1996)')
('3.0', 'Mission: Impossible (1996)')
('3.0', 'Twister (1996)')
('3.0', 'Escape to Witch Mountain (1975)')
('3.0', 'Starship Troopers (1997)')
('3.0', 'Blown Away (1994)')
('3.0', '"Shining, The (1980)"')
('3.0', 'Men in Black (a.k.a. MIB) (1997)')
('3.0', '"Rocky Horror Picture Show, The (1975)"')
('3.0', 'Clerks (1994)')
('3.0', 'Pulp Fiction (1994)')
('3.0', 'Stargate (1994)')
('3.0', 'Independence Day (a.k.a. ID4) (1996)')
('3.0', 'Sneakers (1992)')
('3.0', '"Last of the Mohicans, The (1992)"')
('3.0', "McHale's Navy (1997)")
('3.0', 'I Know What You Did Last Summer (1997)')
('3.0', 'Young Sherlock Holmes (1985)')
('3.0', "Logan's Run (1976)")
('3.0', 'Mrs. Doubtfire (1993)')
('3.0', 'Space Jam (1996)')
('3.0', "Pete's Dragon (1977)")
('3.0', 

### 5.1 Similarity

Let $x$ and $y$ be two users. We note :
* $m$ a movie
* $rated$ the set of movies rated by both users $x$ and $y$
* $ratX$ the ratings of $x$ on those movies
* $ratY$ the ratings of $y$ on those movies
We then define the similarity between users $x$ and $y$ as :


$$
simil(x,y) = \frac{\sum_{m\in rated}(rating(x,m)-mean(ratX))\ \times \ (rating(y,m)-mean(ratY))}{(variance(ratX)\ \times \ variance(ratY))^{0.5}}\times log(1+rated)
$$
    
The similarity is based on **Pearson** correlation coefficient modified to take into account the number of rated items (the $log(1+rated)$ factor avoids spurious correlations). Note that for true recommendation we set $simil(x, x) = 0$ so that our recommendation system is not polluted by auto recommendation.

We will now compute an RDD containing pairs $(x,s)$ where $x$ is a userId and $s = simil(x,1)$.

In [27]:
def avg(dic, items):
    s=0
    for i in items:
        s += dic[i]
    return s/len(items)

def safeDivide(x,y):
    if y==0:
        return 0
    return x/y


def simil(x,y):
    x = [e for e in x]
    y = [e for e in y]
    if x==y:
        return 0
    dX = dict(x)
    dY = dict(y)
    common = set(dX.keys()) & set(dY.keys())
    if len(common)<1:
        return 0
    avgX = avg(dX,common)
    avgY = avg(dY,common)
    varX = 0
    varY = 0
    res = 0 
    for c in common:
        res += (dX[c]-avgX)*(dY[c]-avgY)
        varX += (dX[c]-avgX)**2
        varY += (dY[c]-avgY)**2
    if varX*varY == 0:
        return 0
    if res < 0:
        return 0
    return math.log(1+len(common))*res / ((varX*varY)**(0.5))

In [28]:
userRatings = ratings.map(lambda x: (x[0],(x[1],float(x[2])))).groupByKey() 
user1 = userRatings.filter(lambda x:x[0]=='1').take(1)[0]
userSimil = userRatings.mapValues(lambda x: simil(x,user1[1]))
userSimil.collect() # (user, simil(user, 1))

[('1', 0),
 ('4', 0.7962921953903238),
 ('8', 1.3021968052914652),
 ('9', 1.6458361655282818),
 ('10', 0),
 ('12', 0),
 ('14', 0.6045580623351945),
 ('16', 0.6323082612198457),
 ('17', 0.13885218765886073),
 ('19', 1.5674603326251801),
 ('20', 1.4798062863831714),
 ('21', 0.4146783261088008),
 ('22', 0),
 ('24', 0.8114170851816215),
 ('26', 0.2940774430405641),
 ('29', 0),
 ('33', 0.2949751791361563),
 ('34', 0.08970947094849752),
 ('40', 0),
 ('44', 1.80629636167389),
 ('45', 1.234093512385323),
 ('48', 0),
 ('50', 0),
 ('53', 0),
 ('54', 0),
 ('56', 0.7040930522865744),
 ('57', 1.6747239181504423),
 ('60', 0),
 ('63', 1.129900956989291),
 ('64', 0.9504362702356051),
 ('66', 0.654933426678369),
 ('68', 0.14233504039670775),
 ('69', 0.83596695360366),
 ('70', 0),
 ('73', 0),
 ('74', 0),
 ('77', 0),
 ('82', 0.399866416635122),
 ('83', 0.44824063670235775),
 ('84', 0),
 ('86', 0.7112700663625092),
 ('88', 0.47821027410463074),
 ('91', 0.4563746432146068),
 ('93', 0.7068074410643237),
 ('

Now let's compute :
* $simil(1, 1)$
* $simil(1, 2)$
* $simil(1, 3)$ 
* $simil(1, 4)$ 

In [29]:
output = userSimil.filter(lambda x: x[0] in ['1','2','3','4']).sortBy(lambda x: x[0]).collect() # (user, simil(user, 1))
for (k,v) in output:
    print("{} has simil(1,{}) = {}".format(k, k, v))

1 has simil(1,1) = 0
2 has simil(1,2) = 0
3 has simil(1,3) = 0.16597864726681164
4 has simil(1,4) = 0.7962921953903238


### 5.2 Grade of a movie

The recommendation grade of a movie $m$ for user 1 will be the average of ratings by other users ponderated by similarities between users. As in section 4.6, we don’t want this list polluted by averages computed over very few values, therefore in practice we compute the following value :

$$
grade(m)= \frac{\sum_{x\in rated(x)}rating(x,m) \ \times \ simil(x,1)}{\alpha + \sum_{x\in rated(x)}simil(x,1)}
$$

Where $\alpha ≈ 0.5$ is the average value of $simil(x,1)$.

To start with, we will compute an RDD containing pairs of movie id and their grade.

In [30]:
alpha = userSimil.map(lambda x:x[1]).reduce(lambda x,y: x+y) / userSimil.count()
alpha

0.5171211853962832

In [31]:
userMovieRatings = ratings.map(lambda x: (x[0],(x[1],float(x[2])))) # (user, (movie,rating))

movieRatingSimil = userMovieRatings.join(userSimil).map(lambda x:x[1]) # (movie,rating),simil

moviePearsonWeight = movieRatingSimil.map(lambda x: (x[0][0],(x[0][1]*x[1],x[1]))) # movie, (rating*simil,simil)
moviePearsonWeight.collect()

[('21', (2.3888765861709715, 0.7962921953903238)),
 ('32', (1.5925843907806476, 0.7962921953903238)),
 ('45', (2.3888765861709715, 0.7962921953903238)),
 ('47', (1.5925843907806476, 0.7962921953903238)),
 ('52', (2.3888765861709715, 0.7962921953903238)),
 ('58', (2.3888765861709715, 0.7962921953903238)),
 ('106', (3.1851687815612952, 0.7962921953903238)),
 ('125', (3.981460976951619, 0.7962921953903238)),
 ('126', (0.7962921953903238, 0.7962921953903238)),
 ('162', (3.981460976951619, 0.7962921953903238)),
 ('171', (2.3888765861709715, 0.7962921953903238)),
 ('176', (3.981460976951619, 0.7962921953903238)),
 ('190', (1.5925843907806476, 0.7962921953903238)),
 ('215', (3.981460976951619, 0.7962921953903238)),
 ('222', (0.7962921953903238, 0.7962921953903238)),
 ('232', (3.981460976951619, 0.7962921953903238)),
 ('235', (1.5925843907806476, 0.7962921953903238)),
 ('247', (2.3888765861709715, 0.7962921953903238)),
 ('260', (3.981460976951619, 0.7962921953903238)),
 ('265', (3.981460976951

In [32]:
moviePearson = moviePearsonWeight.reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1])).mapValues(lambda x: safeDivide(x[0],alpha+x[1]))
# movieId, grade
moviePearson.sortBy(lambda x: -x[1]).take(20)

[('318', 4.536188571941084),
 ('260', 4.4313532859271465),
 ('3030', 4.428198374499755),
 ('1235', 4.424977941094166),
 ('1196', 4.4245718874500115),
 ('527', 4.400726719978458),
 ('858', 4.3966369373314516),
 ('3983', 4.392449319598867),
 ('1178', 4.3828267800709115),
 ('48516', 4.379880103638749),
 ('177593', 4.370437198151153),
 ('1136', 4.3685402874449055),
 ('6442', 4.367473266643553),
 ('912', 4.364763255353687),
 ('1208', 4.343985487387523),
 ('750', 4.332851385313492),
 ('5618', 4.332057410997013),
 ('50', 4.326183192574438),
 ('1217', 4.324759188220796),
 ('1276', 4.321940188006373)]

What by joining movie titles gives :

In [33]:
moviePearsonTitle = movies.join(moviePearson).map(lambda x: x[1]).sortBy(lambda x: -x[1])
for i in moviePearsonTitle.take(10):
        print(str(i[1])+"\t"+str(i[0]))

4.536188571941084	"Shawshank Redemption, The (1994)"
4.4313532859271465	Star Wars: Episode IV - A New Hope (1977)
4.428198374499755	Yojimbo (1961)
4.424977941094166	Harold and Maude (1971)
4.4245718874500115	Star Wars: Episode V - The Empire Strikes Back (1980)
4.400726719978458	Schindler's List (1993)
4.3966369373314516	"Godfather, The (1972)"
4.392449319598867	You Can Count on Me (2000)
4.3828267800709115	Paths of Glory (1957)
4.379880103638749	"Departed, The (2006)"


In [34]:
moviePearsonTitle.count()

9724

### 5.3 Finishing

There is still some other things we can done, like giving the best movies for user 1. To make this, we just have to remove from the previous list the movies already rated (thus seen) bu user 1.

In [35]:
moviePearsonTitle1 = moviePearsonTitle.subtractByKey(moviesUser1.map(lambda x: (x[1], '#'))) \
                                        .sortBy(lambda x: (-x[1], x[0]))
for i in moviePearsonTitle1.take(10):
    print(str(i[1])+"\t"+str(i[0]))
print("\n => User 1 has rated {} movies".format(moviesUser1.count()))
print(" => We can recommand {} movies to User 1 that he has not seen".format(moviePearsonTitle1.count()))

4.536188571941084	"Shawshank Redemption, The (1994)"
4.428198374499755	Yojimbo (1961)
4.424977941094166	Harold and Maude (1971)
4.3966369373314516	"Godfather, The (1972)"
4.392449319598867	You Can Count on Me (2000)
4.3828267800709115	Paths of Glory (1957)
4.379880103638749	"Departed, The (2006)"
4.370437198151153	"Three Billboards Outside Ebbing, Missouri (2017)"
4.367473266643553	Belle époque (1992)
4.364763255353687	Casablanca (1942)

 => User 1 has rated 232 movies
 => We can recommand 9492 movies to User 1 that he has not seen


Last, we just want to know how good is our recommendation compare with the rating of user 1.

In [81]:
user1ratingsMovieId = ratings.filter(lambda x: x[0]=="1").map(lambda x : (x[1], x[2])) # movieId, rating of user 1
moviePearsonTitle1_1 = movies.join(moviePearson.join(user1ratingsMovieId)).map(lambda x: (x[1][0],x[1][1][0],x[1][1][1])).sortBy(lambda x: -float(x[1]))

print("Simil Ratings \t User1 ratings \t Movie Title \n")
for i in moviePearsonTitle1_1.collect():
        print(str(i[1])+"\t"+str(i[2])+"\t"+str(i[0]))

moviePearsonTitle1_1.count()

Simil Ratings 	 User1 ratings 	 Movie Title 

4.4313532859271465	5.0	Star Wars: Episode IV - A New Hope (1977)
4.4245718874500115	5.0	Star Wars: Episode V - The Empire Strikes Back (1980)
4.400726719978458	5.0	Schindler's List (1993)
4.3685402874449055	5.0	Monty Python and the Holy Grail (1975)
4.343985487387523	4.0	Apocalypse Now (1979)
4.326183192574438	5.0	"Usual Suspects, The (1995)"
4.31619362873226	5.0	Fight Club (1999)
4.303758132477768	5.0	"Princess Bride, The (1987)"
4.301331618811739	5.0	"Matrix, The (1999)"
4.2762817327780205	5.0	American History X (1998)
4.273634036580454	5.0	Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
4.235833511907756	5.0	American Beauty (1999)
4.2336971355367865	5.0	Goodfellas (1990)
4.222245956832779	5.0	Star Wars: Episode VI - Return of the Jedi (1983)
4.208491338666775	5.0	Office Space (1999)
4.201122159662487	5.0	"Green Mile, The (1999)"
4.170637406114562	5.0	L.A. Confidential (1997)
4.168216918414782	5.0	Reservoir 

232