## Reading CSV files using Spark

In [49]:
from pyspark.sql import SparkSession

In [51]:
# Create (or reuse, if one already exists) the SparkSession for this notebook.
# The surrounding parentheses already allow the chain to span lines, so no
# backslash continuations are needed.
spark = (
    SparkSession.builder
    .appName("Reading CSV files")  # application name given to this session
    .master("local[*]")            # master URL passed to the builder
    .getOrCreate()
)

In [53]:
# Bare expression on the last line of the cell: shows the SparkSession as the cell output.
spark

In [55]:
# Load the flight summary CSV. All reader options are passed in one call:
#   header      -> "true"
#   inferschema -> "true"
#   mode        -> "FAILFAST"
flight_df = (
    spark.read.format("csv")
    .options(header="true", inferschema="true", mode="FAILFAST")
    .load("2010-summary.csv")
)

In [57]:
# Preview the first five rows of the loaded DataFrame.
flight_df.show(n=5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows


In [59]:
# Print the column names, types and nullability of flight_df.
flight_df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



## Schema in Spark

In [68]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the flight summary file: two string columns and one
# integer column, all declared nullable.
my_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", IntegerType(), True),
])

# Read the same CSV with the explicit schema instead of schema inference.
# The "inferschema" option is deliberately NOT set here: when a schema is
# supplied via .schema(...), that schema is what the reader uses, so asking for
# inference as well is redundant and misleading about where the types come from.
flight_df_schema = (
    spark.read.format("csv")
    .option("header", "true")      # first line of the file is the header row
    .schema(my_schema)
    .option("mode", "PERMISSIVE")  # parse mode for malformed records
    .load("2010-summary.csv")
)

In [72]:
# Preview the first five rows of the DataFrame read with the explicit schema.
flight_df_schema.show(n=5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows
