# PySpark Read CSV File into DataFrame

In [0]:
# Plain CSV read with no options: columns come back auto-named (_c0, _c1, ...),
# every column is typed as string, and the header line is kept as a data row.
df = spark.read.csv(
    "dbfs:/FileStore/simple_zipcodes.csv"
)
df.printSchema()
df.show(4)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)

+------------+-------+-------------------+-------+-----+
|         _c0|    _c1|                _c2|    _c3|  _c4|
+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
|           1|     US|        PARC PARQUE|    704|   PR|
|           2|     US|PASEO COSTA DEL SUR|    704|   PR|
|          10|     US|       BDA SAN LUIS|    709|   PR|
+------------+-------+-------------------+-------+-----+
only showing top 4 rows



In [0]:
# Same file loaded through the generic format(...).load(...) reader API;
# still no options, so the columns remain _c0.._c4 strings.
csv_reader = spark.read.format("csv")
df = csv_reader.load("dbfs:/FileStore/simple_zipcodes.csv")
df.show(5)
df.printSchema()

+------------+-------+-------------------+-------+-----+
|         _c0|    _c1|                _c2|    _c3|  _c4|
+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
|           1|     US|        PARC PARQUE|    704|   PR|
|           2|     US|PASEO COSTA DEL SUR|    704|   PR|
|          10|     US|       BDA SAN LUIS|    709|   PR|
|       49347|     US|               HOLT|  32564|   FL|
+------------+-------+-------------------+-------+-----+
only showing top 5 rows

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



### Using Header Record For Column Names

In [0]:
# header=True makes the reader take column names from the first line
# instead of treating that line as data.
df2 = (
    spark.read
    .option("header", True)
    .csv("dbfs:/FileStore/simple_zipcodes.csv")
)
df2.printSchema()
df2.show(5)

root
 |-- RecordNumber: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- State: string (nullable = true)

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
+------------+-------+-------------------+-------+-----+
|           1|     US|        PARC PARQUE|    704|   PR|
|           2|     US|PASEO COSTA DEL SUR|    704|   PR|
|          10|     US|       BDA SAN LUIS|    709|   PR|
|       49347|     US|               HOLT|  32564|   FL|
|       49348|     US|          HOMOSASSA|  34487|   FL|
+------------+-------+-------------------+-------+-----+
only showing top 5 rows



In [0]:
# Number of data rows in df2 (the header line was consumed as column names).
df2.count()

Out[25]: 20

### Read Multiple CSV Files

In [0]:
# csv() accepts a list of paths, so several files load into one DataFrame.
file_path1 = "dbfs:/FileStore/zipFolder/simple_zipcodes.csv"
file_path2 = "dbfs:/FileStore/zipFolder/simple_zipcodes1.csv"
csv_paths = [file_path1, file_path2]

header_reader = spark.read.option("header", True)
df3 = header_reader.csv(csv_paths)

df3.show(5)
df3.printSchema()

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
+------------+-------+-------------------+-------+-----+
|           1|     UK|        PARC PARQUE|    704|   PR|
|           2|     UK|PASEO COSTA DEL SUR|    704|   PR|
|          10|     UK|       BDA SAN LUIS|    709|   PR|
|       49347|     UK|               HOLT|  32564|   FL|
|       49348|     UK|          HOMOSASSA|  34487|   FL|
+------------+-------+-------------------+-------+-----+
only showing top 5 rows

root
 |-- RecordNumber: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- State: string (nullable = true)



In [0]:
# Total number of data rows across both input files.
df3.count()

Out[35]: 40

### Read all CSV Files in a Directory

In [0]:
# Passing a folder path instead of a file path loads the CSV files
# inside that folder into a single DataFrame.
df4 = (
    spark.read
    .option("header", True)
    .csv("dbfs:/FileStore/zipFolder")
)
df4.printSchema()
df4.show(5)

root
 |-- RecordNumber: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- State: string (nullable = true)

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
+------------+-------+-------------------+-------+-----+
|           1|     UK|        PARC PARQUE|    704|   PR|
|           2|     UK|PASEO COSTA DEL SUR|    704|   PR|
|          10|     UK|       BDA SAN LUIS|    709|   PR|
|       49347|     UK|               HOLT|  32564|   FL|
|       49348|     UK|          HOMOSASSA|  34487|   FL|
+------------+-------+-------------------+-------+-----+
only showing top 5 rows



### Options While Reading CSV File

In [0]:
# options(...) takes reader settings as keyword arguments; delimiter sets the
# field separator. No header option is given, so columns stay _c0.._c4.
df5 = (
    spark.read
    .options(delimiter=',')
    .csv("dbfs:/FileStore/zipFolder/simple_zipcodes.csv")
)
df5.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



In [0]:
# Several options in one options(...) call: take names from the header line and
# let Spark infer column types from the data (RecordNumber and Zipcode become
# integers). The same settings can also be supplied one at a time by chaining
# .option("header", True).option("inferSchema", True).options(delimiter=',').
df6 = (
    spark.read
    .options(header=True, inferSchema=True, delimiter=',')
    .csv("dbfs:/FileStore/zipFolder/simple_zipcodes.csv")
)
df6.printSchema()

root
 |-- RecordNumber: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- State: string (nullable = true)



### Reading CSV files with a user-specified custom schema

In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

# (column name, Spark type) pairs in file order; each field is declared nullable.
zipcode_columns = [
    ("RecordNumber", IntegerType()),
    ("Country", StringType()),
    ("City", StringType()),
    ("Zipcode", IntegerType()),
    ("State", StringType()),
]
schema_1 = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in zipcode_columns]
)

# Supplying an explicit schema fixes the column types up front instead of
# leaving every column as string.
csv_with_header = spark.read.format('csv').option("header", True)
df_1 = csv_with_header.schema(schema_1).load("dbfs:/FileStore/zipFolder/simple_zipcodes.csv")
df_1.printSchema()

root
 |-- RecordNumber: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- State: string (nullable = true)



# Write PySpark DataFrame to CSV file

In [0]:
# Write the header-aware DataFrame (df2). `df` was read without header=True, so
# its columns are _c0.._c4 and the real header line is an ordinary data row;
# writing it with header=True would emit a bogus "_c0,_c1,..." header line
# followed by the real header as data. df2 is the same file parsed with
# proper column names, so the written header line is correct.
# NOTE: no save mode is set, so this write raises an error if the target
# path already exists.
df2.write.option("header",True).csv("dbfs:/FileStore/zipFolder/zipedFol/")

In [0]:
# Write with a custom field delimiter ("|") and a header line.
# Uses df2 rather than df: df was read without header=True, so its columns are
# named _c0.._c4 and its first row is the real header; writing it with
# header=True would produce a wrong header line. df2 carries the real column names.
# NOTE: no save mode is set, so this write raises an error if the target
# path already exists.
df2.write.options(header=True,delimiter="|").csv("dbfs:/FileStore/zipFolder/zipedFol_2/")

## Saving modes:
overwrite – Replaces any data that already exists at the target path.

append – Adds the new data to what already exists at the target path.

ignore – Silently skips the write when the target path already exists.

error (alias errorifexists) – The default mode; the write fails with an error when the target path already exists.

In [0]:
# Overwrite mode: data already present at the target path is replaced
# instead of causing the write to fail.
overwrite_writer = df.write.mode("overwrite")
overwrite_writer.csv("dbfs:/FileStore/zipFolder/zipedFol_3/")

In [0]:
# Same overwrite write expressed through the generic format(...).save(...) API.
(
    df.write
    .format("csv")
    .mode("overwrite")
    .save("dbfs:/FileStore/zipFolder/zipedFol_4")
)