Let's first load some data into HDFS.

In [1]:
!hdfs dfs -mkdir -p /datasets
!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \
    -O ../datasets/shakespeare_all.txt
!hdfs dfs -put -f ../datasets/shakespeare_all.txt /datasets/shakespeare_all.txt
!hdfs dfs -put -f ../datasets/hadoop_git_readme.txt /datasets/hadoop_git_readme.txt
!hdfs dfs -ls /datasets
!hdfs dfs -rm -r /tmp

Found 2 items
-rw-r--r--   1 vagrant supergroup       1365 2016-05-10 20:06 /datasets/hadoop_git_readme.txt
-rw-r--r--   1 vagrant supergroup    5589889 2016-05-10 20:06 /datasets/shakespeare_all.txt
16/05/10 20:06:36 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /tmp


## PySpark

In [2]:
# Show the configuration of the running SparkContext as (key, value) pairs.
# NOTE(review): _conf is a private attribute of SparkContext — TODO confirm
# whether this Spark version offers a public accessor.
sc._conf.getAll()

[(u'spark.rdd.compress', u'True'),
 (u'spark.master', u'yarn-client'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.yarn.isPython', u'true'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.executor.cores', u'2'),
 (u'spark.app.name', u'PySparkShell')]

In [3]:
# The integers 0..9, turned into an RDD with parallelize().
numbers = range(10)
numbers_rdd = sc.parallelize(numbers)

# Displaying the RDD prints a description of it, not its contents.
numbers_rdd

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:423

In [4]:
# collect() returns every element of the RDD as a Python list.
numbers_rdd.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [5]:
# take(4) fetches only the first four elements.
numbers_rdd.take(4)

[0, 1, 2, 3]

In [6]:
# Read the README from HDFS and show its first line.
sc.textFile("hdfs:///datasets/hadoop_git_readme.txt").first()

u'For the latest information about Hadoop, please visit our website at:'

In [7]:
# Same file, read from the local filesystem through a file:// URI.
# NOTE(review): the absolute /home/vagrant path is tied to this VM layout,
# while the upload cell refers to the file as ../datasets/ — confirm they match.
sc.textFile("file:///home/vagrant/datasets/hadoop_git_readme.txt").first()

u'For the latest information about Hadoop, please visit our website at:'

In [8]:
# Write the RDD to HDFS; the target path becomes a directory of part files.
numbers_rdd.saveAsTextFile("hdfs:///tmp/numbers_1_10.txt")

In [9]:
# List what saveAsTextFile produced: part files plus a _SUCCESS marker.
!hdfs dfs -ls /tmp/numbers_1_10.txt

Found 5 items
-rw-r--r--   1 vagrant supergroup          0 2016-05-10 20:06 /tmp/numbers_1_10.txt/_SUCCESS
-rw-r--r--   1 vagrant supergroup          4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00000
-rw-r--r--   1 vagrant supergroup          4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00001
-rw-r--r--   1 vagrant supergroup          4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00002
-rw-r--r--   1 vagrant supergroup          8 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00003


In [10]:
# coalesce(1) first, so that the output directory holds a single part file.
numbers_rdd.coalesce(1).saveAsTextFile("hdfs:///tmp/numbers_1_10_one_file.txt")

In [11]:
# Check that only one part file was written this time.
!hdfs dfs -ls /tmp/numbers_1_10_one_file.txt

Found 2 items
-rw-r--r--   1 vagrant supergroup          0 2016-05-10 20:06 /tmp/numbers_1_10_one_file.txt/_SUCCESS
-rw-r--r--   1 vagrant supergroup         20 2016-05-10 20:06 /tmp/numbers_1_10_one_file.txt/part-00000


In [12]:
# Print the saved file: one RDD element per line.
!hdfs dfs -cat /tmp/numbers_1_10_one_file.txt/part-00000

0
1
2
3
4
5
6
7
8
9


In [13]:
numbers_rdd.saveAsTextFile("file:///tmp/numbers_1_10.txt")

In [14]:
# List the local output directory written by the previous cell.
!ls /tmp/numbers_1_10.txt

part-00000  part-00001	part-00002  part-00003	_SUCCESS


In [15]:
def sq(x):
    """Return x raised to the second power."""
    squared = x ** 2
    return squared

# Apply the named function sq to every element and collect the results.
numbers_rdd.map(sq).collect()

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [16]:
# Same squaring as with sq, written inline as a lambda.
numbers_rdd.map(lambda n: n ** 2).collect()

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [17]:
# Sum of squares: combine the squared values pairwise with reduce().
numbers_rdd.map(lambda n: n ** 2).reduce(lambda left, right: left + right)

285

In [18]:
# Sum of squares again, using the built-in sum() action.
numbers_rdd.map(lambda n: n ** 2).sum()

285

In [19]:
def tag(x):
    """Label x by parity: 'even' when divisible by two, otherwise 'odd'."""
    if x % 2 == 0:
        return "even"
    return "odd"
        
    
# Pair every number with its parity label: (label, number).
numbers_rdd.map(lambda n: (tag(n), n)).collect()

[('even', 0),
 ('odd', 1),
 ('even', 2),
 ('odd', 3),
 ('even', 4),
 ('odd', 5),
 ('even', 6),
 ('odd', 7),
 ('even', 8),
 ('odd', 9)]

In [20]:
# Sum the numbers separately for each parity label.
(numbers_rdd
 .map(lambda n: (tag(n), n))
 .reduceByKey(lambda left, right: left + right)
 .collect())

[('even', 20), ('odd', 25)]

In [21]:
def emit_feats(line):
    """Turn one line of text into a list of (feature, count) pairs.

    The pairs are the character count, the whitespace-separated word count,
    and a constant 1 so that summing by key yields the number of lines.
    """
    n_chars = len(line)
    n_words = len(line.split())
    return [("chars", n_chars), ("words", n_words), ("lines", 1)]

# Total up the per-line features over the whole README into one dict.
print (sc.textFile("/datasets/hadoop_git_readme.txt")
 .flatMap(emit_feats)
 .reduceByKey(lambda left, right: left + right)
 .collectAsMap())

{'chars': 1335, 'lines': 31, 'words': 179}


In [22]:
import re
# A word is a run of word characters and apostrophes (keeps "don't" whole).
WORD_RE = re.compile(r"[\w']+")

# Find the single most frequent word in the Shakespeare corpus.
print (sc.textFile("/datasets/shakespeare_all.txt")
 .flatMap(lambda line: WORD_RE.findall(line))    # line -> words
 .map(lambda word: (word.lower(), 1))            # word -> (word, 1)
 .reduceByKey(lambda a,b: a+b)                   # (word, total)
 # Swap each pair to (total, word). Index the pair rather than unpacking it
 # in the parameter list: `lambda (k,v): ...` is a syntax error on Python 3.
 .map(lambda kv: (kv[1], kv[0]))
 .takeOrdered(1, key = lambda x: -x[0]))         # largest total first

[(27801, u'the')]


In [23]:
# Same word count, emitting the (word, 1) pairs straight from flatMap and
# ranking on the count field instead of swapping each pair.
print (sc.textFile("/datasets/shakespeare_all.txt")
 .flatMap(lambda text: [(token.lower(), 1) for token in WORD_RE.findall(text)])
 .reduceByKey(lambda left, right: left + right)
 .takeOrdered(1, key = lambda pair: -pair[1]))

[(u'the', 27801)]
