
# Partitioning and Bucketing

#### While writing the dataframe to disc

In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("DF_Write").getOrCreate()     

In [0]:
df = spark.read.format("csv")\
                .option("header","true")\
                .option("inferschema", "true")\
                .option("mode","PERMISSIVE")\
                .load("/FileStore/schnario/emp_data2.csv")

df.show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  1|  Manish| 26| 75000|  INDIA|     m|
|  2|  Nikita| 23|100000|    USA|     f|
|  3|  Pritam| 22|150000|  INDIA|     m|
|  4|Prantosh| 17|200000|  JAPAN|     m|
|  5|  Vikash| 31|300000|    USA|     m|
|  6|   Rahul| 55|300000|  INDIA|     m|
|  7|    Raju| 67|540000|    USA|     m|
|  8| Praveen| 28| 70000|  JAPAN|     m|
|  9|     Dev| 32|150000|  JAPAN|     m|
| 10|  Sherin| 16| 25000| RUSSIA|     f|
| 11|    Ragu| 12| 35000|  INDIA|     f|
| 12|   Sweta| 43|200000|  INDIA|     f|
| 13| Raushan| 48|650000|    USA|     m|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|
| 15| Prakash| 52|750000|  INDIA|     m|
+---+--------+---+------+-------+------+



In [0]:
# Partitioning data while saving

df.write.format("csv")\
        .option("header","true")\
        .option("mode","overwrite")\
        .option("path","/FileStore/schnario/partition_by_address/")\
        .partitionBy("address")\
        .save()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2088056966590499>:3[0m
[1;32m      1[0m [38;5;66;03m# Partitioning data while saving[39;00m
[0;32m----> 3[0m [43mdf[49m[38;5;241;43m.[39;49m[43mwrite[49m[38;5;241;43m.[39;49m[43mformat[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mcsv[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m\[49m
[1;32m      4[0m [43m        [49m[38;5;241;43m.[39;49m[43moption[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mheader[39;49m[38;5;124;43m"[39;49m[43m,[49m[38;5;124;43m"[39;49m[38;5;124;43mtrue[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m\[49m
[1;32m      5[0m [43m        [49m[38;5;241;43m.[39;49m[43moption[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mmode[39;49m[38;5;124;43m"[39;49m[43m,[49m[38;5;124;43m"[39;49m[38;5;124;43moverwrite[39;49m[38

In [0]:
dbutils.fs.ls("/FileStore/schnario/partition_by_address")

Out[4]: [FileInfo(path='dbfs:/FileStore/schnario/partition_by_address/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1694750169000),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address/address=INDIA/', name='address=INDIA/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address/address=JAPAN/', name='address=JAPAN/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address/address=RUSSIA/', name='address=RUSSIA/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address/address=USA/', name='address=USA/', size=0, modificationTime=0)]

In [0]:
# Partitioning data using id column (It will create lots of small file)
# It will create problem in our data because of cardinility 
# This is the reason we use bucketing  insted of partitioning on some data

df.write.format("csv")\
        .option("header","true")\
        .option("mode","overwrite")\
        .option("path","/FileStore/schnario/partition_by_id/")\
        .partitionBy("id")\
        .save()

In [0]:
dbutils.fs.ls("/FileStore/schnario/partition_by_id/")

Out[5]: [FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1694750360000),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/id=1/', name='id=1/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/id=10/', name='id=10/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/id=11/', name='id=11/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/id=12/', name='id=12/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/id=13/', name='id=13/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/id=14/', name='id=14/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/id=15/', name='id=15/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_id/id=2/', name='id=2/', size=0, modificatio

In [0]:
# Partitioning data by using two columns

df.write.format("csv")\
        .option("header","true")\
        .option("mode","overwrite")\
        .option("path","/FileStore/schnario/partition_by_address_gender/")\
        .partitionBy("address","gender")\
        .save()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2088056966590506>:3[0m
[1;32m      1[0m [38;5;66;03m# Partitioning data by using two columns[39;00m
[0;32m----> 3[0m [43mdf[49m[38;5;241;43m.[39;49m[43mwrite[49m[38;5;241;43m.[39;49m[43mformat[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mcsv[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m\[49m
[1;32m      4[0m [43m        [49m[38;5;241;43m.[39;49m[43moption[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mheader[39;49m[38;5;124;43m"[39;49m[43m,[49m[38;5;124;43m"[39;49m[38;5;124;43mtrue[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m\[49m
[1;32m      5[0m [43m        [49m[38;5;241;43m.[39;49m[43moption[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mmode[39;49m[38;5;124;43m"[39;49m[43m,[49m[38;5;124;43m"[39;49m[38;5;124;43moverwrite[39

In [0]:
dbutils.fs.ls("/FileStore/schnario/partition_by_address_gender/")

Out[7]: [FileInfo(path='dbfs:/FileStore/schnario/partition_by_address_gender/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1694750433000),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address_gender/address=INDIA/', name='address=INDIA/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address_gender/address=JAPAN/', name='address=JAPAN/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address_gender/address=RUSSIA/', name='address=RUSSIA/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address_gender/address=USA/', name='address=USA/', size=0, modificationTime=0)]

In [0]:
dbutils.fs.ls("/FileStore/schnario/partition_by_address_gender/address=INDIA/")

Out[13]: [FileInfo(path='dbfs:/FileStore/schnario/partition_by_address_gender/address=INDIA/gender=f/', name='gender=f/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/schnario/partition_by_address_gender/address=INDIA/gender=m/', name='gender=m/', size=0, modificationTime=0)]

In [0]:
df.show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  1|  Manish| 26| 75000|  INDIA|     m|
|  2|  Nikita| 23|100000|    USA|     f|
|  3|  Pritam| 22|150000|  INDIA|     m|
|  4|Prantosh| 17|200000|  JAPAN|     m|
|  5|  Vikash| 31|300000|    USA|     m|
|  6|   Rahul| 55|300000|  INDIA|     m|
|  7|    Raju| 67|540000|    USA|     m|
|  8| Praveen| 28| 70000|  JAPAN|     m|
|  9|     Dev| 32|150000|  JAPAN|     m|
| 10|  Sherin| 16| 25000| RUSSIA|     f|
| 11|    Ragu| 12| 35000|  INDIA|     f|
| 12|   Sweta| 43|200000|  INDIA|     f|
| 13| Raushan| 48|650000|    USA|     m|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|
| 15| Prakash| 52|750000|  INDIA|     m|
+---+--------+---+------+-------+------+



In [0]:
# Storing data in a bucket

df.write.format("csv")\
    .option("header","true")\
    .option("mode","overwrite")\
    .option("path","/FileStore/schnario/bucket_by_id/")\
    .bucketBy(3,"id")\
    .saveAsTable("bucket_by_id_table")

In [0]:
dbutils.fs.ls("/FileStore/schnario/bucket_by_id/")

Out[8]: [FileInfo(path='dbfs:/FileStore/schnario/bucket_by_id/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1694751014000),
 FileInfo(path='dbfs:/FileStore/schnario/bucket_by_id/_committed_8009383254104127274', name='_committed_8009383254104127274', size=306, modificationTime=1694751013000),
 FileInfo(path='dbfs:/FileStore/schnario/bucket_by_id/_started_8009383254104127274', name='_started_8009383254104127274', size=0, modificationTime=1694751013000),
 FileInfo(path='dbfs:/FileStore/schnario/bucket_by_id/part-00000-tid-8009383254104127274-8cfdb991-c5fc-4884-8c32-307a1584fe45-21-1_00000.c000.csv', name='part-00000-tid-8009383254104127274-8cfdb991-c5fc-4884-8c32-307a1584fe45-21-1_00000.c000.csv', size=270, modificationTime=1694751013000),
 FileInfo(path='dbfs:/FileStore/schnario/bucket_by_id/part-00000-tid-8009383254104127274-8cfdb991-c5fc-4884-8c32-307a1584fe45-21-2_00001.c000.csv', name='part-00000-tid-8009383254104127274-8cfdb991-c5fc-4884-8c32-307a1584fe45-21-2_00001.c000.csv'