In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session running locally; "local[*]" asks for one
# worker thread per available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# Smoke test: distribute a small list as an RDD and add its elements up.
(
    spark.sparkContext
    .parallelize([1, 2, 3, 4])
    .sum()
)

10

In [3]:
from pyspark.sql import Row

In [19]:
# Show the built-in documentation for Row (field access and construction).
help(Row)

Help on class Row in module pyspark.sql.types:

class Row(__builtin__.tuple)
 |  A row in L{DataFrame}.
 |  The fields in it can be accessed:
 |  
 |  * like attributes (``row.key``)
 |  * like dictionary values (``row[key]``)
 |  
 |  ``key in row`` will search through row keys.
 |  
 |  Row can be used to create a row object by using named arguments,
 |  the fields will be sorted by names. It is not allowed to omit
 |  a named argument to represent the value is None or missing. This should be
 |  explicitly set to None in this case.
 |  
 |  >>> row = Row(name="Alice", age=11)
 |  >>> row
 |  Row(age=11, name='Alice')
 |  >>> row['name'], row['age']
 |  ('Alice', 11)
 |  >>> row.name, row.age
 |  ('Alice', 11)
 |  >>> 'name' in row
 |  True
 |  >>> 'wrong_key' in row
 |  False
 |  
 |  Row also can be used to create another Row like class, then it
 |  could be used to create Row objects, such as
 |  
 |  >>> Person = Row("name", "age")
 |  >>> Person
 |  <Row(name, age)>
 |  >>> 'nam

In [34]:
# Build a four-row toy DataFrame: id is the position in the list and the same
# word is stored in both name1 and name2 ("abc" appears twice on purpose).
rows = [
    Row(id=position, name1=word, name2=word)
    for position, word in enumerate(["abc", "something", "abc", "excellent"])
]
df = spark.createDataFrame(rows)

In [35]:
# Print the schema Spark inferred from the Row objects.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name1: string (nullable = true)
 |-- name2: string (nullable = true)



In [36]:
# Display the DataFrame contents as a text table.
df.show()

+---+---------+---------+
| id|    name1|    name2|
+---+---------+---------+
|  0|      abc|      abc|
|  1|something|something|
|  2|      abc|      abc|
|  3|excellent|excellent|
+---+---------+---------+



In [38]:
# Count the distinct (name1, name2) pairs; the duplicated "abc" row collapses.
(
    df
    .select("name1", "name2")
    .distinct()
    .count()
)

3

In [42]:
from pyspark.sql.functions import expr

In [44]:
# Add a boolean "flag" column computed from a SQL expression on name1.
(
    df
    .withColumn("flag", expr("name1 == 'abc'"))
    .show()
)

+---+---------+---------+-----+
| id|    name1|    name2| flag|
+---+---------+---------+-----+
|  0|      abc|      abc| true|
|  1|something|something|false|
|  2|      abc|      abc| true|
|  3|excellent|excellent|false|
+---+---------+---------+-----+



### https://docs.python.org/3.5/library/struct.html

### https://stackoverflow.com/questions/17958347/how-can-i-convert-a-python-urandom-to-a-string

In [71]:
import struct
import os
# Draw a random signed 64-bit integer from the OS entropy source to use as a seed.
# "<q" is always exactly 8 bytes. The native "l" format is only 4 bytes on
# Windows and 32-bit platforms, where unpacking 8 random bytes raises struct.error.
urandom_long = struct.unpack("<q", os.urandom(8))[0]
# Sample the ids 0..999 with replacement at a 1% fraction, seeded with the
# random value drawn above, and display the sampled ids.
(
    spark.range(1000)
    .sample(withReplacement=True, fraction=0.01, seed=urandom_long)
    .show()
)

+---+
| id|
+---+
| 18|
| 52|
|392|
|453|
|532|
|757|
|921|
|957|
|986|
+---+



In [86]:
# Row counts of a three-way random split (seed 4). The weights sum to 1.25;
# per the PySpark docs randomSplit normalizes weights that do not sum to 1.
[
    part.count()
    for part in spark.range(100).randomSplit([0.75, 0.25, 0.25], 4)
]

[60, 21, 19]

In [87]:
# Keep the three splits (same weights and seed as the counting cell) for reuse.
df1, df2, df3 = spark.range(100).randomSplit(
    weights=[0.75, 0.25, 0.25],
    seed=4,
)

In [90]:
# Combined size of the first two splits, then the size of the third; together
# they should account for all 100 rows of spark.range(100).
print(df1.union(df2).count())
print(df3.count())

81
19


In [93]:
# Sort by name1, using id to order rows that share the same name1.
df.sort("name1", "id").show()

+---+---------+---------+
| id|    name1|    name2|
+---+---------+---------+
|  0|      abc|      abc|
|  2|      abc|      abc|
|  3|excellent|excellent|
|  1|something|something|
+---+---------+---------+



In [95]:
from pyspark.sql.functions import col
# Same ordering as the sort("name1", "id") cell, spelled with orderBy and
# Column objects instead of column-name strings.
(
    df
    .orderBy(col("name1"), col("id"))
    .show()
)

+---+---------+---------+
| id|    name1|    name2|
+---+---------+---------+
|  0|      abc|      abc|
|  2|      abc|      abc|
|  3|excellent|excellent|
|  1|something|something|
+---+---------+---------+



['__class__',
 '__delattr__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

### https://txt.arboreus.com/2013/03/13/pretty-print-tables-in-python.html

In [96]:
from tabulate import tabulate

In [97]:
# Minimal tabulate call: a list of rows and no header.
print(tabulate([
    ["spam", 1],
    ["eggs", 42],
]))

----  --
spam   1
eggs  42
----  --


In [99]:
# Same rows as before, now with a header row supplied as the second argument.
print(tabulate(
    [["spam", 1], ["eggs", 42]],
    headers=["item", "quantity"],
))

item      quantity
------  ----------
spam             1
eggs            42


In [101]:
# Pretty-print the DataFrame: collected Row objects are the table rows and
# the column names are the header.
print(tabulate(
    df.collect(),
    headers=df.columns,
))

  id  name1      name2
----  ---------  ---------
   0  abc        abc
   1  something  something
   2  abc        abc
   3  excellent  excellent


In [110]:
from pyspark.sql.functions import asc, desc
# First table: rows sorted by name1 inside each partition only, so the
# collected result is not globally ordered.
print(tabulate(
    df.sortWithinPartitions(asc("name1")).collect(),
    headers=df.columns,
))
print("")
# Second table: a global sort by name1 for comparison.
print(tabulate(
    df.sort(asc("name1")).collect(),
    headers=df.columns,
))

  id  name1      name2
----  ---------  ---------
   0  abc        abc
   1  something  something
   2  abc        abc
   3  excellent  excellent

  id  name1      name2
----  ---------  ---------
   0  abc        abc
   2  abc        abc
   3  excellent  excellent
   1  something  something


In [115]:
# Global descending sort of 0..999; show() prints only the first 20 rows.
(
    spark.range(1000)
    .sort(desc("id"))
    .show()
)

+---+
| id|
+---+
|999|
|998|
|997|
|996|
|995|
|994|
|993|
|992|
|991|
|990|
|989|
|988|
|987|
|986|
|985|
|984|
|983|
|982|
|981|
|980|
+---+
only showing top 20 rows



In [118]:
# Descending sort inside each partition, then take three rows. The result
# starts at 249 rather than 999 because the ordering is per partition, not global.
(
    spark.range(1000)
    .sortWithinPartitions(expr("-id"))
    .limit(3)
    .show()
)

+---+
| id|
+---+
|249|
|248|
|247|
+---+



In [119]:
# Show the documentation for DataFrame.repartition (hash partitioning by
# a partition count and/or columns).
help(df.repartition)

Help on method repartition in module pyspark.sql.dataframe:

repartition(self, numPartitions, *cols) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The
    resulting DataFrame is hash partitioned.
    
    ``numPartitions`` can be an int to specify the target number of partitions or a Column.
    If it is a Column, it will be used as the first partitioning column. If not specified,
    the default number of partitions is used.
    
    .. versionchanged:: 1.6
       Added optional arguments to specify the partitioning columns. Also made numPartitions
       optional if partitioning columns are specified.
    
    >>> df.repartition(10).rdd.getNumPartitions()
    10
    >>> data = df.union(df).repartition("age")
    >>> data.show()
    +---+-----+
    |age| name|
    +---+-----+
    |  5|  Bob|
    |  5|  Bob|
    |  2|Alice|
    |  2|Alice|
    +---+-----+
    >>> data = data.repartition(7, "age

In [120]:
# List every attribute and method available on the DataFrame object.
dir(df)

['__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_jcols',
 '_jdf',
 '_jmap',
 '_jseq',
 '_lazy_rdd',
 '_sc',
 '_schema',
 '_sort_cols',
 'agg',
 'alias',
 'approxQuantile',
 'cache',
 'checkpoint',
 'coalesce',
 'collect',
 'columns',
 'corr',
 'count',
 'cov',
 'createGlobalTempView',
 'createOrReplaceGlobalTempView',
 'createOrReplaceTempView',
 'createTempView',
 'crossJoin',
 'crosstab',
 'cube',
 'describe',
 'distinct',
 'drop',
 'dropDuplicates',
 'drop_duplicates',
 'dropna',
 'dtypes',
 'explain',
 'fillna',
 'filter',
 'first',
 'foreach',
 'foreachPartition',
 'freqItems',
 'groupBy',
 'groupby',
 'head',
 'hint',
 'intersect',
 'isLocal',
 'isStreaming',
 'is_cached',
 'join',
 'limit',
 'na',
 'orderBy',
 'persist',
 'prin

## https://stackoverflow.com/questions/35973590/pyspark-partioning-data-using-partitionby#35973860
Some resources claim the number of partitions should be around twice as large as the number of available cores. On the other hand, a single partition typically shouldn't contain more than 128MB, and a single shuffle block cannot be larger than 2GB (see SPARK-6235).

In [154]:
from pprint import pprint
# Hash-partition the ids 0..24 by the expression (id % 2) + 1, coalesce down
# to 4 partitions, then collect the ids grouped by the partition they ended up in.
result = (
    spark.range(25)
    .repartition(expr("(id % 2) + 1"))
    .coalesce(4)
    .rdd
    .map(lambda row: row.id)
    .glom()
    .collect()
)
pprint(result)

[[],
 [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24],
 [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23],
 []]


In [156]:
# Pair RDD of (key, value) tuples; key "a" appears twice, key "b" once.
x = spark.sparkContext.parallelize([("a", 1), ("b", 1), ("a", 2)])
def a(a):
    """Wrap a single value in a new one-element list.

    Passed as the first (createCombiner) argument of ``combineByKey`` below.
    """
    initial_combiner = [a]
    return initial_combiner

def b(a, b):
    """Append the value ``b`` to the list ``a`` in place and return that same list.

    Passed as the second (mergeValue) argument of ``combineByKey`` below.
    """
    a.append(b)
    return a

def c(a, b):
    """Extend the list ``a`` in place with the items of ``b`` and return ``a``.

    Passed as the third (mergeCombiners) argument of ``combineByKey`` below.
    """
    a.extend(b)
    return a

# Group the values of each key into a list: `a` starts a list from the first
# value, `b` adds a further value to it, `c` merges two partial lists.
(
    x
    .combineByKey(createCombiner=a, mergeValue=b, mergeCombiners=c)
    .collect()
)

[('a', [1, 2]), ('b', [1])]