In [22]:
student_name = "Chris Boesch" # Chris Boesch

"""
Update the student_name variable above with your name. 
Extend this notebook to solve the problems at the bottom. 
From the menue Cell -> Run All
Save your notebook. 
Download your notebook. 
Create a Github Gist of your notebook. 
Submit a link to your gist in ClassMentors. 

"""

import datetime
now = datetime.datetime.now()

message = "{} ran all at {}".format(student_name, now)
print(message)

Chris Boesch ran all at 2015-12-05 11:15:44.957667


In [2]:
from pyspark import SparkContext, SparkConf
conf = SparkConf()
sc = SparkContext(conf=conf)



RDD's are immutable. You can't change them once they are created. 

You can transform them and create new RDD's, but you can't change the original.

This property enables Spark to partition the data and move partitions around. 

Programmers can define how many many partitions there should be to an RDD. 

If you do not provide a value, one is chosen automatically. 

You transform RDDs to others RDDs. 


In [3]:
rdd = sc.parallelize([1,2,3,4,5])

#reduce(a,b)
#take(n)
#collect()
#takeOrdered(n,func())

# Make sure that the resulting list will fit into memory.
r0 = rdd.collect()
print(r0)

# The lambda code gets passed to the workers. 
#Reduce is a terminal function
r1 = rdd.reduce(lambda x, y: x*y)
print(r1)


r2 = rdd.map(lambda x: x*2).collect()

print(r2)

r3 = rdd.filter(lambda x: x%2==0).take(2)
print(r3)

rdd2 = sc.parallelize([2,2,1,1,1,1,2,2,3,3,3])
r4 = rdd2.distinct().takeOrdered(2)

r5 = rdd.flatMap(lambda x: [x,x*x, x*x*x]).collect()
print(r5)

rdd3 = sc.parallelize([5,2,3,4,1,6])
# takeOrdered() allows you to pass a function to define the sort value.
r6 = rdd3.takeOrdered(2, lambda x: -1*x)
print(r6)

[1, 2, 3, 4, 5]
120
[2, 4, 6, 8, 10]
[2, 4]
[1, 1, 1, 2, 4, 8, 3, 9, 27, 4, 16, 64, 5, 25, 125]
[6, 5]


In [4]:
# We will create a local text file for testing.
data_file = open("data.txt", "w")
data_file.write("this is a test\n")
data_file.write("to see what happens\n")
data_file.write("with text data\n")
data_file.close()


In [5]:
# textFile() will load text data as one string per line of data. 
data = sc.textFile("data.txt")
print(data.count())
print("")
print(data.collect())
print("")

# YOu can split the lines by any separator. 
words = data.map(lambda line: line.split(" ")).collect()
print(words)
print("")

flat_words = data.flatMap(lambda line: line.split(" ")) \
                 .collect()
    
print(flat_words)



3

['this is a test', 'to see what happens', 'with text data']

[['this', 'is', 'a', 'test'], ['to', 'see', 'what', 'happens'], ['with', 'text', 'data']]

['this', 'is', 'a', 'test', 'to', 'see', 'what', 'happens', 'with', 'text', 'data']


In [6]:
# You need to cache if you want to avoid duplicate loading. 
data = sc.textFile("data.txt")
#data.cache()

# Load data from file.  
linecount = data.count()
# Load data from file again if not cached. 
wordcount = data.flatMap(lambda line: line.split(" ")) \
                .count()

print("The line count was {} and the wordcount was {}." \
      .format(linecount, wordcount))


The line count was 3 and the wordcount was 11.


In [7]:
"""
reduceByKey(func)
sortByKey()
groupByKey()

"""
rdd = sc.parallelize([(1,10), (3,20),(1,30) ])
rdd.sortByKey().collect()

[(1, 10), (1, 30), (3, 20)]

In [8]:
# groupByKey()
rdd = sc.parallelize([(1,10), (3,20),(1,30) ])
# This can cause problems if all the values for a key are large. 
rdd.groupByKey().collect()

[(1, <pyspark.resultiterable.ResultIterable at 0x7f7ceaeb1f28>),
 (3, <pyspark.resultiterable.ResultIterable at 0x7f7ceaeb1c50>)]

In [9]:
# reduceByKey()
rdd = sc.parallelize([(1,10), (3,20),(1,30) ])
rdd.reduceByKey(lambda x,y: x + y).collect()

[(1, 40), (3, 20)]

In [10]:
# Count how many of each card type were present. 
rdd = sc.parallelize([("Hearts",2), 
                      ("Clubs", 8),
                      ("Hearts",3),
                      ("Hearts",5),
                      ("Clubs", 4) ])
result = rdd.map(lambda x: (x[0], 1)) \
            .reduceByKey(lambda x,y: x+y) \
            .collect()
    
for item in result: 
    message = "There were {} {}.".format(item[1], item[0])
    print(message)

There were 2 Clubs.
There were 3 Hearts.


In [11]:
rdd = sc.parallelize([("Hearts",2), 
                      ("Clubs", 8),
                      ("Hearts",3),
                      ("Hearts",5),
                      ("Clubs", 4) ])
# Put each element in an list so that you can add the lists together. 
#[1] + [2,3] = [1,2,3]
result = rdd.map(lambda x: (x[0], [x[1]]))  \
            .reduceByKey(lambda x,y: x + y) \
            .collect()
        
for item in result: 
    message = "The {} found were {}".format(item[0], item[1])
    print(message)
    


The Clubs found were [8, 4]
The Hearts found were [2, 3, 5]


In [12]:
# Sometime develoers will wrap expression in ( ) rather than use / between line breaks. 
# Either approach will work depending on which you find more readable. 

result = (rdd.map(lambda x: (x[0], [x[1]]))
            .reduceByKey(lambda x,y: x + y) 
            .collect())
        
for item in result: 
    message = "The {} found were {}".format(item[0], item[1])
    print(message)

The Clubs found were [8, 4]
The Hearts found were [2, 3, 5]


In [13]:
# You can define the number of partitions in an RDD

# Reminder: range(4) -> [0,1,2,3]
numPartitions = 5
rdd = sc.parallelize(range(100), numPartitions )
print( "The rdd has {} partitions.".format(rdd.getNumPartitions()))

# If you don't pass the number of partitions, Spark will pick it for you. 
rdd = sc.parallelize(range(100))
print( "The rdd has {} partitions.".format(rdd.getNumPartitions()))


The rdd has 5 partitions.
The rdd has 24 partitions.


In [14]:
# Here is a csv data example. 
data_file = open("data2.csv", "w")
data_file.write("1,2,3\n")
data_file.write("4,5,6\n")
data_file.write("7,8,9\n")
data_file.write("4,11,12\n")
data_file.close()

csvdata = sc.textFile("data2.csv")

# Load data from file.  
linecount = csvdata.count()
print(linecount)

# Turn the list of strings into a list of lists
rows = csvdata.map(lambda line: line.split(","))

print(rows.collect())



4
[['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9'], ['4', '11', '12']]


In [15]:
# Once we have the rdd in a list of rows format, we can do additional operations. 

# Add up the second values in each row after converting them to integers. 
result = (rows.map(lambda x: x[1]) 
             .map(lambda x: int(x)) 
             .reduce(lambda x, y: x+y) )

print(result)

26


In [16]:
# Return the sum of the 2nd and 3rd value using the first value as a key
result = (rows.map(lambda x: (x[0], int(x[1]) + int(x[2]) ) ) 
             .collect() )
print(result)

[('1', 5), ('4', 11), ('7', 17), ('4', 23)]


In [17]:
# Repeat but group by key summing the values for rows with the same 1st value. 
result = (rows.map(lambda x: (x[0], int(x[1]) + int(x[2]) ) ) 
              .reduceByKey(lambda x, y: x+y) 
              .sortByKey()
              .collect())
print(result)

[('1', 5), ('4', 34), ('7', 17)]


In [18]:
"""
Given this csv data, use Spark to answer the following questions. 
Write your code so that it would scale and work for ten million rows as well as for thew few rows provided. 

Gru has been assigning his minions to help deliver packages during the holidays. 
Gru has a csv file containing millions of records of minion deliveries. 

The csv data is in this format. 
minion, quantity, kilometers, minutes


The first column contains the unique name of the minion. 
The second column contains the number of packages delivered on a trip. 
The third column contains the distance traveled on a delivery trip. 
The fourth column contains the time in minutes needed to complete the trip. 

"""
rawdata = """Bob,5,2,3
Kevin,8,5,9
Bob,5,6,7
Stuart,4,5,6
Kevin,1,5,5
"""
data_file = open("data3.csv", "w")
data_file.write(rawdata)
data_file.close()

# Load data from file.  
data = sc.textFile("data3.csv")
print(data.collect())


['Bob,5,2,3', 'Kevin,8,5,9', 'Bob,5,6,7', 'Stuart,4,5,6', 'Kevin,1,5,5']


In [21]:
"""
Write the spark code to answer the following questions using the data RDD defined above. 

Q1 - How many minions are there records for?
Q2 - How many kilometers did each minon travel in total? 
Q3 - How many total kilometers were travelled by all minions overall? 
Q4 - What was the total kilometers traveled and total minutes traveled for each minion? 
Q5 - How many items were delivered per minute by each minion? 

"""


q1 = data.map(lambda x: x.split(",")).map(lambda x: x[0]).distinct().count()

q2 = data.map(lambda x: x.split(",")) \
         .map(lambda x: (x[0], int(x[2]))) \
         .reduceByKey(lambda x,y: x+y) \
         .collect()
            
q3 = data.map(lambda x: x.split(",")).map(lambda x: int(x[2])).sum()

q4 = (data.map(lambda x: x.split(","))
          .map(lambda x: (x[0], (int(x[2]), int(x[3]))))
          .reduceByKey(lambda x, y: (x[0]+y[0],x[1]+y[1]))
          .collect() 
     )

q5_almost = (data.map(lambda x: x.split(","))
          .map(lambda x: (x[0], (float(x[1])/ float(x[3]))))
          .reduceByKey(lambda x, y: x+y)
          .collect()
      )

q5 = (data.map(lambda x: x.split(","))
          .map(lambda x: (x[0], (float(x[1]), float(x[3]) )))
          .reduceByKey(lambda x, y: ( x[0]+y[0], x[1]+y[1] ))
          .map(lambda x: (x[0], x[1][0]/x[1][1] ))
          .collect()
      )

print("Q1 = {}".format(q1))
print("Q2 = {}".format(q2))
print("Q3 = {}".format(q3))
print("Q4 = {}".format(q4))

#Non-weighted - just adding up the packages per minute. 
# What if one row was for a lot more packages and minutes? 
print("Q5 Close but not quite = {}".format(q5_almost))

# Correctly 
print("Q5 = {}".format(q5))



Q1 = 3
Q2 = [('Kevin', 10), ('Bob', 8), ('Stuart', 5)]
Q3 = 23
Q4 = [('Kevin', (10, 14)), ('Bob', (8, 10)), ('Stuart', (5, 6))]
Q5 Close but not quite = [('Kevin', 1.0888888888888888), ('Bob', 2.380952380952381), ('Stuart', 0.6666666666666666)]
Q5 = [('Kevin', 0.6428571428571429), ('Bob', 1.0), ('Stuart', 0.6666666666666666)]
