## Repartition and Coalesce

In [2]:
from pyspark.sql import SparkSession

In [17]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [12]:
# Create (or reuse) the Spark session for this notebook, running in local mode.
# The enclosing parentheses already let the chain span several lines, so no
# backslash continuations are needed.
spark = (
    SparkSession.builder
    .appName("RepartitionandCoalesce")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/18 17:27:44 WARN Utils: Your hostname, Shrees-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.183.253.103 instead (on interface en0)
25/11/18 17:27:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/18 17:27:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [14]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [21]:
# Read the flight summary CSV: the first line is treated as the header and
# column types are inferred from the data.
flight_df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferschema", True)
    .load("2010-summary.csv")
)

# Preview the first rows of the loaded DataFrame.
flight_df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [23]:
# Total number of rows in the loaded DataFrame.
flight_df.count()

255

## Repartition

In [31]:
# Number of partitions backing the DataFrame as loaded, before any repartitioning.
flight_df.rdd.getNumPartitions()

1

In [35]:
# Redistribute flight_df into exactly 4 partitions.
partitioned_flight_df = flight_df.repartition(numPartitions=4)

In [43]:
# Tag every row with the id of the partition it lives in, then count the rows
# per partition to see how the 4-way repartition distributed the data.
(
    partitioned_flight_df
    .withColumn("partitionId", spark_partition_id())
    .groupBy("partitionId")
    .count()
    .show()
)

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   63|
|          1|   64|
|          2|   64|
|          3|   64|
+-----------+-----+



In [45]:
# Repartition by column: request 300 partitions, with rows assigned to
# partitions according to their ORIGIN_COUNTRY_NAME value.
partition_on_col = flight_df.repartition(
    300,
    "ORIGIN_COUNTRY_NAME",
)

In [47]:
# Number of partitions of the column-repartitioned DataFrame.
partition_on_col.rdd.getNumPartitions()

300

In [51]:
# Count the rows that landed in each partition after repartitioning by
# ORIGIN_COUNTRY_NAME. Only partitions that actually hold rows appear in the
# result, since the grouping is over the rows themselves.
#
# The result is sorted by partitionId because the row order of a grouped
# aggregation is not guaranteed and show() prints only the first 20 rows;
# without the sort, which partitions get displayed could differ between runs.
(
    partition_on_col
    .withColumn("partitionId", spark_partition_id())
    .groupBy("partitionId")
    .count()
    .orderBy("partitionId")
    .show()
)

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|    1|
|          2|    1|
|          7|    1|
|         10|    1|
|         13|    1|
|         15|    2|
|         16|    2|
|         19|    1|
|         21|    1|
|         22|    1|
|         28|    1|
|         31|    1|
|         39|    1|
|         42|    1|
|         43|    1|
|         44|    1|
|         45|    2|
|         48|    1|
|         53|    1|
|         54|    1|
+-----------+-----+
only showing top 20 rows


## Coalesce

In [57]:
# Starting point for the coalesce demo: flight_df spread over 8 partitions.
# NOTE: despite its name, coalesce_df is produced by repartition(8), not by
# coalesce(); it is the input that the later cells shrink.
coalesce_df = flight_df.repartition(numPartitions=8)

In [61]:
# Rows per partition for the 8-partition DataFrame.
(
    coalesce_df
    .withColumn("partitionId", spark_partition_id())
    .groupBy("partitionId")
    .count()
    .show()
)

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   32|
|          1|   31|
|          2|   32|
|          3|   32|
|          4|   32|
|          5|   32|
|          6|   32|
|          7|   32|
+-----------+-----+



In [63]:
# Shrink the 8-partition DataFrame to 3 partitions using coalesce().
three_coalesce_df = coalesce_df.coalesce(numPartitions=3)

In [65]:
# Rows per partition after coalesce(3), for comparison with repartition(3).
(
    three_coalesce_df
    .withColumn("partitionId", spark_partition_id())
    .groupBy("partitionId")
    .count()
    .show()
)

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   64|
|          1|   95|
|          2|   96|
+-----------+-----+



In [69]:
# Shrink the same 8-partition DataFrame to 3 partitions using repartition()
# instead of coalesce(), so the two approaches can be compared.
three_repartition_df = coalesce_df.repartition(numPartitions=3)

In [71]:
# Rows per partition after repartition(3), for comparison with coalesce(3).
(
    three_repartition_df
    .withColumn("partitionId", spark_partition_id())
    .groupBy("partitionId")
    .count()
    .show()
)

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   85|
|          1|   85|
|          2|   85|
+-----------+-----+

