In [78]:
# Import the needed library
from pyspark import SparkConf, SparkContext

In [79]:
# Define the context: reuse the active SparkContext if one exists, otherwise create one.
sc = SparkContext.getOrCreate()

In [80]:
# Read the text file into an RDD with one element per line; `r1` is the resulting RDD.
# Evaluating `r1` on the last line displays the RDD's description, not its contents.
r1 = sc.textFile("input.txt")
r1

input.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [81]:
# count() evaluates r1 and returns the number of lines in the file.
r1.count()

17

In [82]:
# first() returns the first element (line) of the RDD.
r1.first()


'The hair and tortoise plan a race. '

In [83]:
# take(n) returns the first n elements (lines) of the RDD as a list.
r1.take(2)

['The hair and tortoise plan a race. ',
 'All the animals come to watch the race. ']

In [84]:
# filter() builds a new RDD holding only the elements for which the predicate is true;
# here, the lines that contain the substring 'hair'.
hairlines = r1.filter(lambda text: "hair" in text)
hairlines.count()

7

In [85]:
# Keep only the lines that contain the substring 'tortoise', then count them.
tortoiselines = r1.filter(lambda text: "tortoise" in text)
tortoiselines.count()

7

In [86]:
# union() returns the elements of both RDDs; duplicates are kept, so a line that
# matches both filters appears twice in r2.
r2 = tortoiselines.union(hairlines)
r2.count()

14

In [87]:
# Print every line of the union. collect() returns all elements, so the loop does
# not depend on a hard-coded element count; r2 is small enough to bring to the driver.
for line in r2.collect():
    print(line)

The hair and tortoise plan a race. 
The tortoise is slow.
The tortoise is behind. 
The hair sees the tortoise behind. 
The tortoise walks slowly and reaches the end. 
The tortoise wins the race. 
He sees that tortoise has own the race.
The hair and tortoise plan a race. 
The hair runs fast. 
The hair is ahead. 
The hair sees the tortoise behind. 
The hair sleeps. 
The hair is still sleeping. 
The hair gets up. 


In [88]:
# intersection() returns the elements that are present in both RDDs.
# The output will not contain any duplicate elements, even if the input RDDs did.
# This method performs a shuffle internally.
r3 = tortoiselines.intersection(hairlines)
r3.count()

2

In [89]:
# Print every line of the intersection. collect() returns all elements, so the loop
# does not depend on a hard-coded element count; r3 is small enough to bring to the driver.
for line in r3.collect():
    print(line)

The hair sees the tortoise behind. 
The hair and tortoise plan a race. 


In [90]:
# parallelize() distributes a local Python collection to form an RDD.
# (When the input is a plain range, passing a range object is the recommended form.)
nums = sc.parallelize([1, 2, 3, 4])

In [91]:
# map() applies the function to every element of the RDD; collect() then returns
# the results as a Python list, so `squared` is a list rather than an RDD.
squared = nums.map(lambda value: value * value).collect()

In [92]:
# Show each squared value on its own line.
for value in squared:
    print(value)

1
4
9
16


In [93]:
# Distribute a local list of two strings to form an RDD.
lines = sc.parallelize(["hi", "ho are you"])

In [94]:
# flatMap() applies the function to every element and then flattens the results,
# so each space-separated word becomes its own element of the new RDD.
words = lines.flatMap(lambda sentence: sentence.split(" "))

In [95]:
# Number of elements (words) in the flattened RDD.
words.count()

4

In [96]:
# Print every word. collect() returns all elements, so the loop does not depend
# on a hard-coded element count; the RDD is small enough to bring to the driver.
for word in words.collect():
    print(word)

hi
ho
are
you


In [97]:
# First element of the words RDD.
words.first()

'hi'

In [98]:
# Return all elements of the words RDD as a Python list.
words.collect()

['hi', 'ho', 'are', 'you']

In [99]:
# Each element x is expanded to range(1, x); flatMap flattens those ranges into a
# single RDD of integers.
rdd = sc.parallelize([2, 3, 4])
rangeRdd = rdd.flatMap(lambda stop: range(1, stop))
rangeRdd.collect()

[1, 1, 2, 1, 2, 3]

In [100]:
# Collect the values to the driver and sort them locally with Python's sorted().
sorted(rangeRdd.collect())

[1, 1, 1, 2, 2, 3]

In [101]:
# Recreate the RDD of the numbers 1 to 4 and count its elements.
nums = sc.parallelize([1, 2, 3, 4])
nums.count()


4

In [102]:
# reduce() combines the elements of this RDD using the specified commutative and
# associative binary operator (here, addition).
# The result is bound to a new name: rebinding `nums` to the integer would shadow
# the RDD and make this cell fail if it were run a second time.
nums_sum = nums.reduce(lambda x, y: x + y)
nums_sum

10

In [103]:
# operator.add is the function form of `+`, so it can be passed to reduce()
# directly instead of writing a lambda.
from operator import add
sc.parallelize([1, 2, 3, 4, 5]).reduce(add)

15

In [104]:
# Stop the SparkContext now that the examples are finished.
sc.stop()