---------------------------
# Spark Lab Session - Exercises with Spark
---------------------------

## 1. Opening Spark

In [1]:
import pyspark
# import os ?

# Reuse the running SparkContext when this cell is executed again: calling the
# SparkContext constructor a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# getOrCreate only applies the configuration when it actually creates a context.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("Spark Lab Session")
)

## 2. First steps with Spark

### 2.1 First RDD

In [2]:
# Build the integers 0..2999 locally, then distribute them as an RDD.
l = [*range(3000)]
rddl = sc.parallelize(l)
# Only fetch the first 20 elements for inspection.
rddl.take(20)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

### 2.2 Computing the sum of cubes

In [3]:
# Cube every element of rddl, then look at the first 10 results.
rddc = rddl.map(lambda n: n ** 3)
rddc.take(10)

[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

In [4]:
# Add up all the cubes held in rddc; the total is the cell's displayed value.
rddc.sum()

20236502250000

Remarque : on pouvait aussi calculer cette somme avec un `reduce` (`rddc.reduce(lambda x, y: x + y)` renvoie le même résultat que `rddc.sum()`), mais cela n'apportait rien de plus ici (aspect pédagogique mis à part).

In [5]:
# reduce folds the cubes pairwise with +, so it evaluates to their sum.
rddc.reduce(lambda x, y : x+y) # the reduced value is not kept or shown: only the cell's last expression is displayed
rddc.take(10)

[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

### 2.3 Last digits of elements in C

In [6]:
# Step 1: build the RDD holding the last decimal digit
# of every cube.
lastDigits = rddc.map(lambda cube: cube % 10)
lastDigits.take(10)

[0, 1, 8, 7, 4, 5, 6, 3, 2, 9]

In [7]:
# Step 2: count how many times each digit occurs
# (pair every digit with 1, then add the 1s up per key).
countLastDigits = (
    lastDigits
    .map(lambda digit: (digit, 1))
    .reduceByKey(lambda left, right: left + right)
)
countLastDigits.take(20)

[(0, 300),
 (8, 300),
 (4, 300),
 (1, 300),
 (5, 300),
 (9, 300),
 (6, 300),
 (2, 300),
 (7, 300),
 (3, 300)]

In [8]:
# The same count can also be written as a single chain.
(
    rddc
    .map(lambda cube: (cube % 10, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[(0, 300),
 (8, 300),
 (4, 300),
 (1, 300),
 (5, 300),
 (9, 300),
 (6, 300),
 (2, 300),
 (7, 300),
 (3, 300)]

### 2.4 Digits of C

In [9]:
# Emit one (character, 1) pair per character of each cube's decimal string,
# then add the 1s up per character.
(
    rddc
    .flatMap(lambda cube: [(char, 1) for char in str(cube)])
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('4', 2762),
 ('7', 2787),
 ('6', 2713),
 ('3', 2814),
 ('0', 3127),
 ('1', 3667),
 ('8', 2639),
 ('9', 2521),
 ('2', 3294),
 ('5', 2653)]

En décomposant :

In [10]:
# Intermediate step only: the flat list of (character, 1) pairs before any
# aggregation. collect() returns every pair, so the output is long.
(
    rddc
    .flatMap(lambda cube: [(char, 1) for char in str(cube)])
    .collect()
)

[('0', 1),
 ('1', 1),
 ('8', 1),
 ('2', 1),
 ('7', 1),
 ('6', 1),
 ('4', 1),
 ('1', 1),
 ('2', 1),
 ('5', 1),
 ('2', 1),
 ('1', 1),
 ('6', 1),
 ('3', 1),
 ('4', 1),
 ('3', 1),
 ('5', 1),
 ('1', 1),
 ('2', 1),
 ('7', 1),
 ('2', 1),
 ('9', 1),
 ('1', 1),
 ('0', 1),
 ('0', 1),
 ('0', 1),
 ('1', 1),
 ('3', 1),
 ('3', 1),
 ('1', 1),
 ('1', 1),
 ('7', 1),
 ('2', 1),
 ('8', 1),
 ('2', 1),
 ('1', 1),
 ('9', 1),
 ('7', 1),
 ('2', 1),
 ('7', 1),
 ('4', 1),
 ('4', 1),
 ('3', 1),
 ('3', 1),
 ('7', 1),
 ('5', 1),
 ('4', 1),
 ('0', 1),
 ('9', 1),
 ('6', 1),
 ('4', 1),
 ('9', 1),
 ('1', 1),
 ('3', 1),
 ('5', 1),
 ('8', 1),
 ('3', 1),
 ('2', 1),
 ('6', 1),
 ('8', 1),
 ('5', 1),
 ('9', 1),
 ('8', 1),
 ('0', 1),
 ('0', 1),
 ('0', 1),
 ('9', 1),
 ('2', 1),
 ('6', 1),
 ('1', 1),
 ('1', 1),
 ('0', 1),
 ('6', 1),
 ('4', 1),
 ('8', 1),
 ('1', 1),
 ('2', 1),
 ('1', 1),
 ('6', 1),
 ('7', 1),
 ('1', 1),
 ('3', 1),
 ('8', 1),
 ('2', 1),
 ('4', 1),
 ('1', 1),
 ('5', 1),
 ('6', 1),
 ('2', 1),
 ('5', 1),
 ('1', 1),

## 3. Approximating $\pi$

To compute the value of $\pi$, you will generate the list of all pairs $(x,y)$ of integers from 0 to $K-1$. Then you will count the pairs such that $(2x+1)^2+(2y+1)^2$ is less than $(2K)^2$. The ratio between this count and the total number of pairs approximates $\pi/4$, so four times this ratio is an approximation of $\pi$. For $K=3000$ you should obtain a value close to 3.14159.

### 3.1 Step 1 : computing set of pairs


### 3.2 Step 2 : counting the pairs

### 3.3 Step 3 : computing the approximation

In [16]:
# Grid size: x and y each take the values 0 .. K-1.
K=3000
intUpToK = sc.parallelize(range(K))
# Every (x, y) combination of the integers above.
pairs = intUpToK.cartesian(intUpToK)
# Total number of pairs (denominator of the ratio printed below).
nbTotal = pairs.count()

def isOk(v):
    """Tell whether the pair ``v = (x, y)`` satisfies the disc condition.

    Returns True when (2x+1)^2 + (2y+1)^2 <= (2K)^2, where K is the
    notebook-level constant read at call time.
    """
    first, second = v
    squared_distance = (2 * first + 1) ** 2 + (2 * second + 1) ** 2
    return squared_distance <= 4 * K * K

# Number of pairs accepted by isOk.
nbOk = pairs.filter(isOk).count()
# Four times the accepted/total ratio; float() forces floating-point division
# even where `/` on two ints would truncate (Python 2).
print(4*float(nbOk)/nbTotal)
print(nbOk)

3.1415933333333332
7068585


## 4. Using the Movie Lens dataset
### 4.1 Getting the dataset