In [1]:
import pyspark

In [2]:
from pyspark import SparkConf, SparkContext, SparkFiles

## SparkConf

In [2]:
pyspark.SparkConf?

[0;31mInit signature:[0m [0mpyspark[0m[0;34m.[0m[0mSparkConf[0m[0;34m([0m[0mloadDefaults[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0m_jvm[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0m_jconf[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Configuration for a Spark application. Used to set various Spark
parameters as key-value pairs.

Most of the time, you would create a SparkConf object with
C{SparkConf()}, which will load values from C{spark.*} Java system
properties as well. In this case, any parameters you set directly on
the C{SparkConf} object take priority over system properties.

For unit tests, you can also call C{SparkConf(false)} to skip
loading external settings and get the same configuration no matter
what the system properties are.

All setter methods in this class support chaining. For example,
you can write C{conf.setMaster("local").setAppName("My app")}.

.. note:: Once a SparkConf object is passed to Spark, it is cloned
    and can no longer be modified by the user.

The most commonly used `SparkConf` methods are:

- `set(key, value)` — sets a configuration property.
- `setMaster(value)` — sets the master URL.
- `setAppName(value)` — sets the application name.
- `get(key, defaultValue=None)` — returns the configuration value for a key.
- `setSparkHome(value)` — sets the Spark installation path on worker nodes.

In [3]:
# Build the SparkConf object that will be passed to Apache Spark.
# 'local' is used as the master here; a cluster master URL such as
# spark://master:7077 could be given instead.
conf = (
    pyspark.SparkConf()
    .setAppName('test')
    .setMaster('local')
)

In [4]:
# Create the SparkContext from `conf`, or reuse the one that is already
# active. Calling the SparkContext constructor a second time in the same
# kernel (e.g. when this cell is re-run) raises
# "Cannot run multiple SparkContexts at once"; getOrCreate avoids that.
# Note: `conf` is only applied when a new context is actually created.
sc = SparkContext.getOrCreate(conf=conf)

## SparkFiles

In [6]:
# Register a file with Spark so that it is made available to this job.
file_name = "test_file"
sc.addFile(path=file_name)

In [8]:
# Resolve the absolute path of the file registered through sc.addFile().
# Reuse `file_name` instead of repeating the literal so the lookup cannot
# drift apart from the name that was actually added.
SparkFiles.get(file_name)

'/tmp/spark-5cbd022d-96d8-40a8-9ce9-18b30287bee6/userFiles-6ba940ba-ae16-478a-951b-78978f5a3183/test_file'

In [9]:
# Directory that holds the files added through sc.addFile(); the path
# returned by SparkFiles.get() above lives directly inside it.
SparkFiles.getRootDirectory()

'/tmp/spark-5cbd022d-96d8-40a8-9ce9-18b30287bee6/userFiles-6ba940ba-ae16-478a-951b-78978f5a3183'

## Serializers

Serialization is used for performance tuning on Apache Spark.  
All data that is sent over the network or written to the disk or persisted in the memory should be serialized.

- `MarshalSerializer`: serializes objects using Python’s `marshal` module. It is faster than `PickleSerializer`, but supports fewer data types.

- `PickleSerializer`: serializes objects using Python’s `pickle` module. It supports nearly any Python object, but may not be as fast as more specialized serializers.

In [None]:
# Stop the running context before the serializer demo further down creates
# a new SparkContext; only one SparkContext can be active at a time.
sc.stop()

In [10]:
pyspark.PickleSerializer?

[0;31mInit signature:[0m [0mpyspark[0m[0;34m.[0m[0mPickleSerializer[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Serializes objects using Python's pickle serializer:

    http://docs.python.org/2/library/pickle.html

This serializer supports nearly any Python object, but may
not be as fast as more specialized serializers.
[0;31mFile:[0m           /usr/local/spark/python/pyspark/serializers.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     CloudPickleSerializer


In [11]:
pyspark.MarshalSerializer?

[0;31mInit signature:[0m [0mpyspark[0m[0;34m.[0m[0mMarshalSerializer[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Serializes objects using Python's Marshal serializer:

    http://docs.python.org/2/library/marshal.html

This serializer is faster than PickleSerializer but supports fewer datatypes.
[0;31mFile:[0m           /usr/local/spark/python/pyspark/serializers.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     


In [1]:
from pyspark import SparkContext
from pyspark.serializers import MarshalSerializer

# Start a context on the 'local' master, named 'serializer app', that uses
# MarshalSerializer instead of the default serializer.
sc = SparkContext(
    master='local',
    appName='serializer app',
    serializer=MarshalSerializer(),
)

In [2]:
# Distribute the integers 0..999, square each one, and fetch the first ten
# results back to the driver.
numbers = sc.parallelize(list(range(1000)))
squares = numbers.map(lambda value: value ** 2)
squares.take(10)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [3]:
# Shut down the SparkContext created for the serializer demo.
# NOTE(review): sc.stop() returns None, so the saved output 'local' shown
# below this cell looks stale (possibly left over from an earlier
# `sc.master` expression) — confirm and re-run the notebook.
sc.stop()

'local'