# RDD

## Read and Play 

In [1]:
# Read data from local
fruits = sc.textFile('../data/fruits.txt')
yellowThings = sc.textFile('../data/yellowthings.txt')

In [2]:
# Read data from HDFS
HDFS = sc.textFile('hdfs://url:9000/fruits.txt')

RDD actions:
1. rdd.collect() list of all items
2. rdd.count() long type, number of items in this rdd
3. rdd.first() first item
4. rdd.reduce()
5. rdd.lookup()
6. rdd.save()


RDD Transformations:
1. rdd.map() Returns a new RDD consisting of the results of applying the given function to each element of this RDD.
2. rdd.filter()
3. rdd.flatMap() Returns a new RDD formed by applying the given function to each element of this RDD and then flattening the results, so one input element can produce zero or more output elements.
4. rdd.sample()
5. rdd.groupByKey()
6. rdd.reduceByKey()
7. union()
8. join()
9. cogroup()
10. crossProduct()
11. rdd.mapValues()
12. rdd.sort()
13. rdd.partitionBy()
14. rdd.glom()

## Transformation with Function 

In [15]:
# union 
fruitsAndYellowThings = fruits.union(yellowThings)
# intersection
yellowFruits = fruits.intersection(yellowThings)
# distinct
distinctFruitsAndYellowThings = fruitsAndYellowThings.distinct()
# check
fruitsAndYellowThings.count() == yellowFruits.count() + distinctFruitsAndYellowThings.count()

True

In [8]:
# filter
k = 5
shortFruits = fruits.filter(lambda fruit: len(fruit) <= k)
print(shortFruits.collect())
fruits_with_a = fruits.filter(lambda fruit: 'a' in fruit)
print(fruits_with_a.collect())

['apple', 'grap', 'lemon']
['apple', 'banana', 'canary melon', 'grap', 'orange', 'pineapple', 'strawberry']


In [18]:
# map: produces one output element per input element. Analogy: fry the eggs from
# two boxes box by box, put them back into the same two boxes, and hand one box
# to each of 2 groups of students.
# [::-1] reads the string in reverse
fruitsReversed = fruits.map(lambda fruit: fruit[::-1])
print(fruitsReversed.collect())

['elppa', 'ananab', 'nolem yranac', 'parg', 'nomel', 'egnaro', 'elppaenip', 'yrrebwarts']


In [9]:
# flatMap: the per-element results are flattened into a single collection.
# Analogy: fry the eggs from two boxes, then pool all the fried eggs together
# (100 fried eggs) and hand them out to 100 students.
characters = fruits.flatMap(lambda fruit: list(fruit))
print(characters.collect())

['a', 'p', 'p', 'l', 'e', 'b', 'a', 'n', 'a', 'n', 'a', 'c', 'a', 'n', 'a', 'r', 'y', ' ', 'm', 'e', 'l', 'o', 'n', 'g', 'r', 'a', 'p', 'l', 'e', 'm', 'o', 'n', 'o', 'r', 'a', 'n', 'g', 'e', 'p', 'i', 'n', 'e', 'a', 'p', 'p', 'l', 'e', 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y']


In [96]:
# glom give list of elements by each partition
import sys
import random

def f(_):
    """Return a fresh random.random() value; the argument is ignored."""
    return random.random()

a = sc.parallelize(range(0,100),10)
#print(a.collect())
#print(a.glom().collect())
#print(a.map(f).glom().collect())

# Weird behavior: initially, the random numbers are in sync across all workers, but they
# get out of sync after a large number (e.g., 1000000) of random numbers have been generated.

In [97]:
# mapPartitions and mapPartitionsWithIndex
a = sc.parallelize(range(0,20),4)
print(a.glom().collect())

def f(it):
    """Return the even running sums of one partition as a list.

    Walks the partition iterator `it` while keeping a running total, and
    records the total after each element whenever that total is even.
    """
    running = 0
    evens = []
    for value in it:
        running += value
        if running % 2 != 0:
            continue
        evens.append(running)
    return evens

print(a.mapPartitions(f).collect())

def f(it):
    """Lazily yield the running sum after each element of one partition.

    Everything yielded inside the loop becomes the output of the partition.
    Because this is a generator, the output list never physically exists in
    memory, so this form is the recommended one.
    """
    total = 0
    for value in it:
        total += value
        yield total
        

print(a.mapPartitions(f).collect())

def f(index, it):
    """Yield the running sums of one partition, seeded with its index.

    `index` is the partition number passed in by mapPartitionsWithIndex and
    `it` iterates over that partition's elements; the running total starts
    at `index` instead of 0.
    """
    running = index
    for value in it:
        running += value
        yield running

print(a.mapPartitionsWithIndex(f).collect()) # it allows f have 2 parameters.

[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19]]
[0, 6, 10, 18, 26, 10, 46, 60, 48, 66]
[0, 1, 3, 6, 10, 5, 11, 18, 26, 35, 10, 21, 33, 46, 60, 15, 31, 48, 66, 85]
[0, 1, 3, 6, 10, 6, 12, 19, 27, 36, 12, 23, 35, 48, 62, 18, 34, 51, 69, 88]


## Actions 

In [20]:
# collect
fruitsArray = fruits.collect()
yellowThingsArray = yellowThings.collect()
# count
numFruits = fruits.count()
# take
first3Fruits = fruits.take(3)

In [34]:
# map reduce
letterSet = fruits.map(lambda fruit: set(fruit)).collect()
#print(letterSet)
letterSet = fruits.map(lambda fruit: set(fruit)).reduce(lambda x, y: x.union(y))
print(letterSet)

{'o', 'r', 'a', 'i', 'p', 'g', 'c', ' ', 'l', 'y', 'e', 'w', 'n', 'b', 'm', 't', 's'}


In [37]:
letterSet = fruits.flatMap(lambda fruit: list(fruit)).collect()
#print(letterSet)
letterSet = fruits.flatMap(lambda fruit: list(fruit)).distinct().collect()
print(letterSet)

['p', 'l', 'b', 'c', 'r', 'y', 'g', 'i', 's', 'a', 'e', 'n', ' ', 'm', 'o', 't', 'w']


In [98]:
# reduceByKey
numFruitsByLength = fruits.map(lambda fruit: (len(fruit), 1)).reduceByKey(lambda x, y: x + y)
print(numFruitsByLength.take(10))

[(6, 2), (12, 1), (4, 1), (10, 1), (5, 2), (9, 1)]


In [102]:
from operator import add

lines = sc.textFile('../data/course.txt')
#print(lines.take(1))
counts = lines.flatMap(lambda x: x.split()) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(add)
print(counts.sortByKey().take(20))

[('Big', 1), ('Course', 2), ('Description', 1), ('Information', 1), ('Lecture', 1), ('This', 1), ('across', 1), ('amount', 1), ('and', 3), ('as', 1), ('both', 1), ('centers.', 1), ('cloud', 1), ('commodity', 1), ('computing', 1), ('course', 1), ('data', 4), ('emerge', 1), ('enabling', 1), ('even', 1)]


In [103]:
print(counts.sortBy(lambda x: x[1], False).take(20))

[('data', 4), ('of', 3), ('and', 3), ('Course', 2), ('in', 2), ('the', 2), ('Information', 1), ('systems,', 1), ('cloud', 1), ('parallel', 1), ('as', 1), ('mining', 1), ('massive', 1), ('amount', 1), ('even', 1), ('servers', 1), ('centers.', 1), ('both', 1), ('hands-on', 1), ('this', 1)]


In [104]:
# Join simple example

products = sc.parallelize([(1, "Apple"), (2, "Orange"), (3, "TV"), (5, "Computer")])
#trans = sc.parallelize([(1, 134, "OK"), (3, 34, "OK"), (5, 162, "Error"), (1, 135, "OK"), (2, 53, "OK"), (1, 45, "OK")])
trans = sc.parallelize([(1, (134, "OK")), (3, (34, "OK")), (5, (162, "Error")), (1, (135, "OK")), (2, (53, "OK")), (1, (45, "OK"))])

print(products.join(trans).take(20))

[(1, ('Apple', (134, 'OK'))), (1, ('Apple', (135, 'OK'))), (1, ('Apple', (45, 'OK'))), (2, ('Orange', (53, 'OK'))), (3, ('TV', (34, 'OK'))), (5, ('Computer', (162, 'Error')))]


## Join vs. Broadcast Variables

In [108]:
products = sc.parallelize([(1, "Apple"), (2, "Orange"), (3, "TV"), (5, "Computer")])
trans = sc.parallelize([(1, (134, "OK")), (3, (34, "OK")), (5, (162, "Error")), (1, (135, "OK")), (2, (53, "OK")), (1, (45, "OK"))])

print(trans.join(products).take(20))


[(1, ((134, 'OK'), 'Apple')), (1, ((135, 'OK'), 'Apple')), (1, ((45, 'OK'), 'Apple')), (2, ((53, 'OK'), 'Orange')), (3, ((34, 'OK'), 'TV')), (5, ((162, 'Error'), 'Computer'))]


In [109]:
products = {1: "Apple", 2: "Orange", 3: "TV", 5: "Computer"}
trans = sc.parallelize([(1, (134, "OK")), (3, (34, "OK")), (5, (162, "Error")), (1, (135, "OK")), (2, (53, "OK")), (1, (45, "OK"))])

broadcasted_products = sc.broadcast(products)

results = trans.map(lambda x: (x[0], broadcasted_products.value[x[0]], x[1]))
print(results.take(20))
results = trans.map(lambda x: (x[0], products[x[0]], x[1]))
print(results.take(20))


[(1, 'Apple', (134, 'OK')), (3, 'TV', (34, 'OK')), (5, 'Computer', (162, 'Error')), (1, 'Apple', (135, 'OK')), (2, 'Orange', (53, 'OK')), (1, 'Apple', (45, 'OK'))]
[(1, 'Apple', (134, 'OK')), (3, 'TV', (34, 'OK')), (5, 'Computer', (162, 'Error')), (1, 'Apple', (135, 'OK')), (2, 'Orange', (53, 'OK')), (1, 'Apple', (45, 'OK'))]


## Closure

A task’s closure is those variables and methods which must be visible for the executor to perform its computations on the RDD. (functions, global variables)

In [38]:
counter = 0
rdd = sc.parallelize(range(10))

# Wrong: Don't do this!!
# increment_counter is shipped to the executors as part of the task closure, so
# the `counter` it updates is presumably a copy living in the worker process.
# The driver's `counter` is not modified: the recorded output below prints 0.
def increment_counter(x):
    global counter
    counter += x

print(rdd.collect())
rdd.foreach(increment_counter)

print(counter)    # driver-side value, still 0 in the recorded output
print(rdd.sum())  # the actual total (45), computed by Spark itself

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0
45


In [39]:
rdd = sc.parallelize(range(10))
accum = sc.accumulator(0)

def g(x):
    global accum
    accum += x

a = rdd.foreach(g)
print(accum.value)

45


In [45]:
rdd = sc.parallelize(range(10))
accum = sc.accumulator(0)

# Caution: g updates the accumulator inside a transformation (map), so the
# accumulator grows every time the mapped RDD is actually (re)computed.
def g(x):
    global accum
    accum += x
    return x * x

a = rdd.map(g)
print(accum.value) # 0: map is lazy, so g has not run yet
print(a.reduce(lambda x, y: x+y)) # 0+1+4+9+16+25+36+49+64+81, accum=45
a.cache() # only marks `a` for caching; nothing is computed by this call
tmp = a.count() # accum=45+45: `a` is computed again by this action
print(accum.value)
print(rdd.reduce(lambda x, y: x+y))

tmp = a.count() # accum stays at 90 (see output): g is not run again this time
print(accum.value)
print(rdd.reduce(lambda x, y: x+y))

0
285
90
45
90
45


# PMI

In [1]:
import psutil
lines = sc.textFile('../data/adj_noun_pairs.txt', 8)
# Convert each line into a tuple of words.
# The data is dirty: keep only the lines that split into exactly 2 words.
pairs = lines.map(lambda l: tuple(l.split())).filter(lambda p: len(p)==2)
# pairs is reused by count() and by the three aggregations below, so cache it.
pairs.cache()
# N is the total number of (adjective, noun) pairs, used to normalize the PMI.
N = pairs.count()
# Compute the frequency of each pair.
# Ignore pairs that are not frequent enough (fewer than 100 occurrences).
pair_freqs = pairs.map(lambda p: (p,1)).reduceByKey(lambda f1, f2: f1 + f2) \
                  .filter(lambda pf: pf[1] >= 100)
# Compute the frequencies of the adjectives (first word) and the nouns (second word).
a_freqs = pairs.map(lambda p: (p[0],1)).reduceByKey(lambda x,y: x+y)
n_freqs = pairs.map(lambda p: (p[1],1)).reduceByKey(lambda x,y: x+y)
# Broadcast the adjective and noun frequencies so tasks can look them up via .value.
n_dict = sc.broadcast(n_freqs.collectAsMap())   # collectAsMap() returns the pairs as a Python dict
a_dict = sc.broadcast(a_freqs.collectAsMap())

In [2]:
from math import *

# Computing the PMI for a pair.
def pmi_score(pair_freq):
    """Compute the base-2 pointwise mutual information of one word pair.

    pair_freq is ((w1, w2), f): the word pair and its frequency. The word
    frequencies are read from the broadcast variables a_dict and n_dict,
    and N is the total number of pairs.

    Returns (pmi, (w1, w2)), i.e. the score first and the pair second.
    """
    w1, w2 = pair_freq[0]
    joint = float(pair_freq[1]) * N
    independent = a_dict.value[w1] * n_dict.value[w2]
    # PMI = log2((f/N) / ((a/N) * (n/N))) = log2(f*N / (a*n)),
    # so after simplifying, N ends up in the numerator.
    return log(joint / independent, 2), (w1, w2)

In [3]:
# Computing the PMI for all pairs.
scored_pairs = pair_freqs.map(pmi_score)
# Printing the most strongly associated pairs. 
scored_pairs.top(10)

[(14.41018838546462, ('magna', 'carta')),
 (13.071365888694997, ('polish-lithuanian', 'Commonwealth')),
 (12.990597616733414, ('nitrous', 'oxide')),
 (12.64972604311254, ('latter-day', 'Saints')),
 (12.50658937509916, ('stainless', 'steel')),
 (12.482331020687814, ('pave', 'runway')),
 (12.19140721768055, ('corporal', 'punishment')),
 (12.183248694293388, ('capital', 'punishment')),
 (12.147015483562537, ('rush', 'yard')),
 (12.109945794428935, ('globular', 'cluster'))]

# PageRank

In [5]:
import re
from operator import add

def computeContribs(urls, rank):
    """Yield (url, contribution) for every neighbor URL in `urls`.

    The `rank` of the source page is divided equally among its neighbors,
    so each one receives rank / len(urls). Nothing is yielded when `urls`
    is empty.
    """
    fan_out = len(urls)
    yield from ((neighbor, rank / fan_out) for neighbor in urls)

def parseNeighbors(urls):
    """Parse one "URL neighbor-URL" line into a (url, neighbor) pair.

    The two fields may be separated by any run of whitespace (several spaces
    or a tab), which matches the column-aligned input format. Splitting on a
    single space would return an empty string as the neighbor for such lines.

    Raises IndexError if the line contains fewer than two fields.
    """
    # split() with no argument splits on whitespace runs and ignores leading
    # and trailing whitespace, so no empty fields are produced.
    parts = urls.split()
    return parts[0], parts[1]

# Loads in input file. It should be in format of:
#     URL         neighbor URL
#     URL         neighbor URL
#     URL         neighbor URL
#     ...

# The data file can be downloaded at http://www.cse.ust.hk/msbd5003/data/*
lines = sc.textFile("../data/pagerank_data.txt", 2)
# lines = sc.textFile("../data/dblp.in", 5)

# Number of PageRank update rounds performed by the loop below.
numOfIterations = 10

# Parse every line into a (url, neighbor) pair and group the neighbors by url,
# giving one (url, neighbors) element per source URL.
links = lines.map(lambda urls: parseNeighbors(urls)) \
             .groupByKey()

# Give every URL that appears as a key in links (i.e. has at least one
# outgoing link) an initial rank of 1.0.
ranks = links.mapValues(lambda neighbors: 1.0)
# Repeatedly recompute the URL ranks with the PageRank update rule.
for iteration in range(numOfIterations):
    # Calculate each URL's contributions to the ranks of the URLs it links to.
    contribs = links.join(ranks) \
                    .flatMap(lambda url_urls_rank:
                             computeContribs(url_urls_rank[1][0],
                                             url_urls_rank[1][1]))
    # After the join, each element in the RDD is of the form
    # (url, (list of neighbor urls, rank)).
    # Recompute each URL's rank from the contributions it received:
    # new rank = 0.85 * (sum of contributions) + 0.15.
    ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)
    # ranks = contribs.reduceByKey(add).map(lambda t: (t[0], t[1] * 0.85 + 0.15))

# Print up to 5 (url, rank) pairs with the highest rank.
print(ranks.top(5, lambda x: x[1]))

[('1', 1.2981882732854677), ('4', 0.9999999999999998), ('3', 0.9999999999999998), ('2', 0.7018117267145316)]


# Quiz

In [None]:
# Q1
from operator import add
lines = sc.textFile('README.md')
counts = lines.flatMap(lambda x: x.split()) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(add)
# Add one line to find the most frequent word.
counts.max(lambda x:x[1])

In [None]:
# Q2
#Modify the word count example above, 
#so that we only count the frequencies of those words consisting of 5 or more characters.
counts = lines.flatMap(lambda x: x.split()) \
              .filter(lambda x: len(x)>=5) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(add)  

In [111]:
# Q3
# What's its output? (Yes, you can just run it.)
A = sc.parallelize(range(1, 100))
t = 50
B = A.filter(lambda x: x < t)
print(B.count()) # 49
t = 10
C = B.filter(lambda x: x > t)
print(C.count()) # 0 because C = A.filter(lambda x: x < t).filter(lambda x: x > t)

49
0


In [117]:
# Q4
#The intent of the code above is to get all numbers below 50 from A and put them into B, 
#and then get all numbers above 10 from B and put them into C.  
#Fix the code so that it produces the desired behavior, by adding one line of code.  
#You are not allowed to change the existing code.
A = sc.parallelize(range(1, 100))
t = 50
B = A.filter(lambda x: x < t)
print(B.count()) # 49
B.cache()
t = 10
C = B.filter(lambda x: x > t)
print(C.count())

49
39


In [4]:
# Q5
#Modify the PMI example by sending a_dict and n_dict inside the closure. Do not use broadcast variables.
import psutil
lines = sc.textFile('../data/adj_noun_pairs.txt', 8)
pairs = lines.map(lambda l: tuple(l.split())).filter(lambda p: len(p)==2)
pairs.cache()
N = pairs.count()
a_freqs = pairs.map(lambda p: (p[0],1)).reduceByKey(lambda x,y: x+y)
n_freqs = pairs.map(lambda p: (p[1],1)).reduceByKey(lambda x,y: x+y)
pair_freqs = pairs.map(lambda p: (p,1)).reduceByKey(lambda f1, f2: f1 + f2) \
                  .filter(lambda pf: pf[1] >= 100)
a_dict = a_freqs.collectAsMap()
n_dict = n_freqs.collectAsMap()
def pmi_score(pair_freq):
    """Compute the base-2 PMI of one word pair using closure variables.

    pair_freq is ((w1, w2), f): the word pair and its frequency. The plain
    dicts a_dict and n_dict and the pair count N are read directly from the
    enclosing scope instead of from broadcast variables.

    Returns (pmi, (w1, w2)), i.e. the score first and the pair second.
    """
    w1, w2 = pair_freq[0]
    scaled_freq = float(pair_freq[1]) * N
    marginal_product = a_dict[w1] * n_dict[w2]
    # log2(f*N / (a*n)): N ends up in the numerator after simplifying
    # (f/N) / ((a/N) * (n/N)).
    pmi = log(scaled_freq / marginal_product, 2)
    return pmi, (w1, w2)
scored_pairs = pair_freqs.map(pmi_score)

In [5]:
scored_pairs.top(10)

[(14.41018838546462, ('magna', 'carta')),
 (13.071365888694997, ('polish-lithuanian', 'Commonwealth')),
 (12.990597616733414, ('nitrous', 'oxide')),
 (12.64972604311254, ('latter-day', 'Saints')),
 (12.50658937509916, ('stainless', 'steel')),
 (12.482331020687814, ('pave', 'runway')),
 (12.19140721768055, ('corporal', 'punishment')),
 (12.183248694293388, ('capital', 'punishment')),
 (12.147015483562537, ('rush', 'yard')),
 (12.109945794428935, ('globular', 'cluster'))]

In [8]:
# Q6
#The following code creates an RDD with 4 partitions: partition 0, 1, 2, and 3.
A = sc.parallelize(range(100), 4)
#For each item in the RDD, add its partition number to it, and write the results to another RDD, i.e., 
#the resulting RDD should contain:
#[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
#21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 
#41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 
#61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 
#81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]
def f(x, sets):
    """Yield every item of one partition with the partition index added.

    `x` is the partition number supplied by mapPartitionsWithIndex and
    `sets` iterates over the items of that partition.
    """
    yield from (x + item for item in sets)
print(A.mapPartitionsWithIndex(f).collect())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]
