In [1]:
import sys
sys.version

'3.6.1 |Anaconda 4.4.0 (64-bit)| (default, May 11 2017, 13:09:58) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
sc

In [6]:
rdd = sc.parallelize(range(0, 1000))

Ensure sys.version and sc.pythonVer are reasonably compatible. In our example, we're working with Python 3.6.

In [7]:
sc.pythonVer

'3.6'

The easiest sanity check is to make sure we can do a simple reduce.

In [8]:
rdd.reduce(lambda x, y: x + y)

499500

Now we're going to be more ambitious and make sure that our tasks are actually running on multiple nodes. We'll use our friend ``socket.gethostname()`` to do this magic.

In [9]:
import socket


What you see here may vary. Cobalt allocates whatever nodes it wants. So your output will likely differ.

In [10]:
socket.gethostname()

'cc040'

In [11]:
rdd

PythonRDD[4] at RDD at PythonRDD.scala:48

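Here we map each element to a one-element list containing the hostname of the node that processes it. We use a list to make it easy to do a reduce of list + list, which will eventually be turned into a set. This is not efficient but allows us to use the associative list concatenation operator, which plays nicely with map/reduce thinking. (Concatenation is not commutative, but the order of the hostnames does not matter here because the result ends up in a set.)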

This is a demo of how to run PySpark using Python notebooks based on notes prepared by Shilpika, Venkat, and George.

See https://docs.google.com/document/d/1PHGLAbDOZzdiCnoWA6PUOTzt7Lrc_UGq5CZmQj1qW5A/edit?usp=sharing for details.

In [12]:
# Replace every element with a one-element list holding the hostname of the machine
# that evaluates the lambda; the element's own value is irrelevant, so it is ignored.
rdd_hostnames = rdd.map(lambda _element: [socket.gethostname()])

In [13]:
hosts_used = rdd_hostnames.reduce(lambda x, y: x + y)

We use the Python set to take all list items and remove duplicates.

In [14]:
unique_hosts = set(hosts_used)


In [15]:
print(unique_hosts)

{'cc040', 'cc118', 'cc100', 'cc117'}


Functional programming idioms behind the map/reduce concept

In [16]:
# The integers 0 through 9 and, via map(), the square of each of them.
integers = range(10)
squares = list(map(lambda n: n ** 2, integers))

In [17]:
print(squares)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [18]:
even_squares = list(filter(lambda x: x % 2 == 0, squares))
print(even_squares)

[0, 4, 16, 36, 64]


In [19]:
from functools import reduce
int_sum = reduce(lambda x, y: x + y, integers)
print(int_sum)

45


Python supports many ideas from functional programming beyond map, reduce, and filter. Sometimes these are scattered among the standard-library modules ``functools`` and ``itertools``.

In [20]:
def int_generator():
    """Yield the integers 0, 1, 2, ... without end.

    The generator never stops on its own; the consumer has to bound it,
    for example with ``itertools.takewhile``.
    """
    current = 0
    while True:
        yield current
        current = current + 1

In [21]:
from itertools import takewhile

# A fresh generator that yields 0, 1, 2, ... for as long as it is consumed.
infinite_integers = int_generator()
# takewhile() is lazy: nothing is pulled from the generator until ints_to_100 is
# iterated. It then yields values while x < 100 holds, i.e. 0 through 99.
ints_to_100 = takewhile(lambda x: x < 100, infinite_integers)


In [23]:
import os
import os.path


In [26]:
dirname = os.path.join("/scratch", "100000")
print(dirname)

/scratch/100000


In [27]:
# exist_ok=True keeps this cell re-runnable: a bare os.makedirs() raises
# FileExistsError once the directory has been created by an earlier run.
os.makedirs(dirname, exist_ok=True)

In [28]:
os.listdir("/scratch")

['100000', 'lost+found']

In [43]:
def create_dir_id(id, base="/scratch"):
    """Create the directory ``<base>/<id>`` if needed and return its path.

    Parameters:
        id: Identifier used as the directory name; converted with ``str()``.
        base: Parent directory in which the directory is created.
            Defaults to ``"/scratch"``.

    Returns:
        The path of the directory as a string.

    Raises:
        OSError: If the directory cannot be created, e.g. the path already
            exists but is not a directory.
    """
    # The parameter name `id` shadows the builtin; it is kept so that existing
    # callers (including keyword callers) continue to work, but it is no longer rebound.
    dirname = os.path.join(base, str(id))
    # exist_ok=True replaces a separate exists() check followed by makedirs():
    # with that pattern, two tasks on the same node could both see the path as
    # missing and the slower one would fail with FileExistsError.
    os.makedirs(dirname, exist_ok=True)
    return dirname

In [44]:
create_dir_id(1000000)

'/scratch/1000000'

In [45]:
os.listdir("/scratch")


['542',
 '311',
 '461',
 '55',
 '553',
 '377',
 '807',
 '626',
 '637',
 '132',
 '472',
 '632',
 '306',
 '297',
 '474',
 '543',
 '52',
 '383',
 '561',
 '470',
 '221',
 '641',
 '552',
 '644',
 '135',
 '219',
 '305',
 '217',
 '710',
 '45',
 '882',
 '208',
 '891',
 '42',
 '386',
 '476',
 '793',
 '640',
 '973',
 '888',
 '627',
 '714',
 '961',
 '880',
 '223',
 '963',
 '884',
 '41',
 '391',
 '709',
 '642',
 '209',
 '43',
 '798',
 '465',
 '60',
 '878',
 '128',
 '137',
 '376',
 '393',
 '725',
 '559',
 '213',
 '549',
 '375',
 '643',
 '803',
 '978',
 '716',
 '811',
 '894',
 '390',
 '469',
 '293',
 '974',
 '958',
 '797',
 '131',
 '877',
 '303',
 '629',
 '291',
 '129',
 '806',
 '143',
 '61',
 '210',
 '968',
 '555',
 '726',
 '140',
 '385',
 '130',
 '890',
 '876',
 '473',
 '127',
 '967',
 '51',
 '394',
 '388',
 '975',
 '708',
 '301',
 '125',
 '464',
 '545',
 '796',
 '100000',
 '554',
 '141',
 '56',
 '308',
 '541',
 '384',
 '46',
 '50',
 '809',
 '639',
 '970',
 '550',
 '959',
 '546',
 '225',
 '295',
 

In [48]:
# Turn every element into the path returned by create_dir_id() and mark the
# resulting RDD for caching.
rdd2 = rdd.map(create_dir_id)
rdd2.cache()

# Prefix each path with the hostname of the machine that evaluates the lambda. Each
# item is a one-element list so that reduce() can combine them with list + list.
rdd3 = rdd2.map(lambda dir_path: ["%s:%s" % (socket.gethostname(), dir_path)])
result = rdd3.reduce(lambda left, right: left + right)

In [49]:
result

['cc118:/scratch/0',
 'cc118:/scratch/1',
 'cc118:/scratch/2',
 'cc118:/scratch/3',
 'cc118:/scratch/4',
 'cc118:/scratch/5',
 'cc118:/scratch/6',
 'cc118:/scratch/7',
 'cc118:/scratch/8',
 'cc118:/scratch/9',
 'cc118:/scratch/10',
 'cc118:/scratch/11',
 'cc118:/scratch/12',
 'cc118:/scratch/13',
 'cc118:/scratch/14',
 'cc118:/scratch/15',
 'cc118:/scratch/16',
 'cc118:/scratch/17',
 'cc118:/scratch/18',
 'cc118:/scratch/19',
 'cc117:/scratch/20',
 'cc117:/scratch/21',
 'cc117:/scratch/22',
 'cc117:/scratch/23',
 'cc117:/scratch/24',
 'cc117:/scratch/25',
 'cc117:/scratch/26',
 'cc117:/scratch/27',
 'cc117:/scratch/28',
 'cc117:/scratch/29',
 'cc117:/scratch/30',
 'cc117:/scratch/31',
 'cc117:/scratch/32',
 'cc117:/scratch/33',
 'cc117:/scratch/34',
 'cc117:/scratch/35',
 'cc117:/scratch/36',
 'cc117:/scratch/37',
 'cc117:/scratch/38',
 'cc117:/scratch/39',
 'cc117:/scratch/40',
 'cc040:/scratch/41',
 'cc040:/scratch/42',
 'cc040:/scratch/43',
 'cc040:/scratch/44',
 'cc040:/scratch/45'

In [55]:
def touch_file(pathname):
    """Create an empty ``data.txt`` inside the directory ``pathname``.

    Behaves like the ``touch`` command: the file is created if it is missing,
    and an existing file keeps its contents but gets its timestamps refreshed.

    Parameters:
        pathname: Path of an existing directory.

    Returns:
        The path of the ``data.txt`` file as a string.

    Raises:
        OSError: If the file cannot be created, e.g. ``pathname`` does not exist.
    """
    datafile = os.path.join(pathname, "data.txt")
    # Pure-Python touch instead of os.system("touch ..."): no shell is involved,
    # so paths containing spaces or shell metacharacters are handled correctly
    # and failures raise instead of being hidden in an ignored exit status.
    with open(datafile, "a"):
        os.utime(datafile, None)
    return datafile


In [56]:
touch_file("/scratch/1000000")

'/scratch/1000000/data.txt'

In [57]:
os.listdir("/scratch/1000000")

['data.txt']

In [58]:
rdd4 = rdd2.map(lambda pathname: [ touch_file(pathname) ])


In [59]:
result = rdd4.reduce(lambda x, y: x + y)
result

['/scratch/0/data.txt',
 '/scratch/1/data.txt',
 '/scratch/2/data.txt',
 '/scratch/3/data.txt',
 '/scratch/4/data.txt',
 '/scratch/5/data.txt',
 '/scratch/6/data.txt',
 '/scratch/7/data.txt',
 '/scratch/8/data.txt',
 '/scratch/9/data.txt',
 '/scratch/10/data.txt',
 '/scratch/11/data.txt',
 '/scratch/12/data.txt',
 '/scratch/13/data.txt',
 '/scratch/14/data.txt',
 '/scratch/15/data.txt',
 '/scratch/16/data.txt',
 '/scratch/17/data.txt',
 '/scratch/18/data.txt',
 '/scratch/19/data.txt',
 '/scratch/20/data.txt',
 '/scratch/21/data.txt',
 '/scratch/22/data.txt',
 '/scratch/23/data.txt',
 '/scratch/24/data.txt',
 '/scratch/25/data.txt',
 '/scratch/26/data.txt',
 '/scratch/27/data.txt',
 '/scratch/28/data.txt',
 '/scratch/29/data.txt',
 '/scratch/30/data.txt',
 '/scratch/31/data.txt',
 '/scratch/32/data.txt',
 '/scratch/33/data.txt',
 '/scratch/34/data.txt',
 '/scratch/35/data.txt',
 '/scratch/36/data.txt',
 '/scratch/37/data.txt',
 '/scratch/38/data.txt',
 '/scratch/39/data.txt',
 '/scratch

We can verify that the operation worked by running `os.listdir()` on the driver to see which files and directories were created by the executor running on this same node.

In [60]:
os.listdir("/scratch")

['542',
 '311',
 '461',
 '55',
 '553',
 '377',
 '807',
 '626',
 '637',
 '132',
 '472',
 '632',
 '306',
 '297',
 '474',
 '543',
 '52',
 '383',
 '561',
 '470',
 '221',
 '641',
 '552',
 '644',
 '135',
 '219',
 '305',
 '217',
 '710',
 '45',
 '882',
 '208',
 '891',
 '42',
 '386',
 '476',
 '793',
 '640',
 '973',
 '888',
 '627',
 '714',
 '961',
 '880',
 '223',
 '963',
 '884',
 '41',
 '391',
 '709',
 '642',
 '209',
 '43',
 '798',
 '465',
 '60',
 '878',
 '128',
 '137',
 '376',
 '393',
 '725',
 '559',
 '213',
 '549',
 '375',
 '643',
 '803',
 '978',
 '716',
 '811',
 '894',
 '390',
 '469',
 '293',
 '974',
 '958',
 '797',
 '131',
 '877',
 '303',
 '629',
 '291',
 '129',
 '806',
 '143',
 '61',
 '210',
 '968',
 '555',
 '726',
 '140',
 '385',
 '130',
 '890',
 '876',
 '473',
 '127',
 '967',
 '51',
 '394',
 '388',
 '975',
 '708',
 '301',
 '125',
 '464',
 '545',
 '796',
 '100000',
 '554',
 '141',
 '56',
 '308',
 '541',
 '384',
 '46',
 '50',
 '809',
 '639',
 '970',
 '550',
 '959',
 '546',
 '225',
 '295',
 

In [61]:
os.listdir("/scratch/386")

['data.txt']