In [None]:
from pyspark.sql import SparkSession

# Build a local Spark session, or reuse one that already exists in this kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [None]:
# Echo the session object so the notebook displays its representation.
spark

In [None]:
# Baseline CSV read: no reader options are set, so Spark's defaults apply.
customer_df = spark.read.load(
    "./Input/2_Customer_Master_Data.csv",
    format="csv",
)

In [None]:
# Inspect the column names and types Spark assigned when no reader options were set.
customer_df.printSchema()

In [None]:
# Preview the rows produced by the option-less read.
customer_df.show()

In [None]:
# Same customer file, this time read with the header option switched on.
customer_header_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("./Input/2_Customer_Master_Data.csv")
)

In [None]:
# Inspect the schema of the read that used header=True.
customer_header_df.printSchema()

In [None]:
# Preview the rows of the read that used header=True.
customer_header_df.show()

In [None]:
# Customer file read with both the header and inferSchema options enabled.
customer_header_schema_df = spark.read.load(
    "./Input/2_Customer_Master_Data.csv",
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Inspect the schema of the read that used header=True and inferSchema=True.
customer_header_schema_df.printSchema()

In [None]:
# Preview the rows of the read that used header=True and inferSchema=True.
customer_header_schema_df.show()

In [None]:
# Explicit schema for the customer file, written as a DDL string of
# comma-separated "column_name type" pairs.
# NOTE(review): only CustomerID is declared as integer; confirm against the
# data that the remaining columns are meant to stay as strings.
customer_schema = """
                    CustomerID integer,
                    CustomerName string,
                    CustomerBucket string,
                    CustomerRegion string,
                    LessThanFullTruckLoad string,
                    YearOnYearVolumeIncrease string
                """

In [None]:
# Re-read the customer file with the hand-written customer_schema applied.
# This rebinds customer_header_schema_df.
customer_header_schema_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(customer_schema)
    .load("./Input/2_Customer_Master_Data.csv")
)

In [None]:
# Inspect the schema after applying the explicit customer_schema DDL string.
customer_header_schema_df.printSchema()

In [None]:
# Preview the rows of the read that used the explicit customer_schema.
customer_header_schema_df.show()

In [None]:
# Mode
# https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option

# 1. PERMISSIVE  - Default - Bad records are placed into a column named _corrupt_record.
                #   We can rename the column using the option ( columnNameOfCorruptRecord )
# 2. DROPMALFORMED - will drop bad records while reading
# 3. FAILFAST - the job will fail if there are any bad records.
                # It will fail as soon as it hits the first bad record

# Note : Mode will work only in cases where a schema has been specified

In [None]:
# Read the sales file that contains bad records, with the header option only
# (no explicit schema).
sales_csv_df = spark.read.load(
    "./Input/1_sales_orders_bad_records.csv",
    format="csv",
    header=True,
)

In [None]:
# Preview the sales file as read with header=True and no explicit schema.
sales_csv_df.show()

In [None]:
# Explicit schema for the sales orders file, written as a DDL string of
# comma-separated "column_name type" pairs.
# NOTE(review): the date columns (OrderDate, ShipDate) are declared as string;
# confirm whether that is intentional.
sales_schema = """SalesOrder string, 
                    OrderDate string, 
                    CustomerID integer, 
                    City string, 
                    Country string, 
                    ProductID integer, 
                    Price integer, 
                    Qty_Sold integer, 
                    Qty_Sold_Units string, 
                    Amount integer, 
                    Amount_Currency string, 
                    ShipDate string,
                    OrderStatus string
                    """

In [None]:
# Re-read the sales file with the explicit sales_schema applied.
# This rebinds sales_csv_df.
sales_csv_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(sales_schema)
    .load("./Input/1_sales_orders_bad_records.csv")
)

In [None]:
# Preview the sales file as read with the explicit sales_schema.
sales_csv_df.show()

In [None]:
# Sales schema with one extra trailing string column, bad_record.
# The PERMISSIVE read below names this column in columnNameOfCorruptRecord.
sales_schema_new = """SalesOrder string, 
                    OrderDate string, 
                    CustomerID integer, 
                    City string, 
                    Country string, 
                    ProductID integer, 
                    Price integer, 
                    Qty_Sold integer, 
                    Qty_Sold_Units string, 
                    Amount integer, 
                    Amount_Currency string, 
                    ShipDate string,
                    OrderStatus string,
                    bad_record string
                    """

In [None]:
# PERMISSIVE read: the corrupt-record column is named bad_record, matching the
# extra column declared in sales_schema_new.
sales_csv_c_df1 = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(sales_schema_new)
    .option("mode", "PERMISSIVE")
    .option("columnNameOfCorruptRecord", "bad_record")
    .load("./Input/1_sales_orders_bad_records.csv")
)

In [None]:
# Inspect the schema of the PERMISSIVE read (built from sales_schema_new, which includes bad_record).
sales_csv_c_df1.printSchema()

In [None]:
# Preview the PERMISSIVE read, including the bad_record column.
sales_csv_c_df1.show()

In [None]:
# Rebinds sales_schema_new to a schema WITHOUT the bad_record column, for the
# DROPMALFORMED and FAILFAST reads below.
# NOTE(review): this reuses the name of the earlier schema that included
# bad_record, so the PERMISSIVE cell only works if the earlier definition is
# re-run first. Consider a distinct name.
sales_schema_new = """SalesOrder string, 
                    OrderDate string, 
                    CustomerID integer, 
                    City string, 
                    Country string, 
                    ProductID integer, 
                    Price integer, 
                    Qty_Sold integer, 
                    Qty_Sold_Units string, 
                    Amount integer, 
                    Amount_Currency string, 
                    ShipDate string,
                    OrderStatus string
                    """

In [None]:
# DROPMALFORMED read: same file and schema, with the mode option set to
# DROPMALFORMED. This rebinds sales_csv_c_df1.
sales_csv_c_df1 = spark.read.load(
    "./Input/1_sales_orders_bad_records.csv",
    format="csv",
    schema=sales_schema_new,
    header=True,
    mode="DROPMALFORMED",
)

In [None]:
# Inspect the schema of the DROPMALFORMED read.
sales_csv_c_df1.printSchema()

In [None]:
# Preview the rows of the DROPMALFORMED read.
sales_csv_c_df1.show()

In [None]:
# FAILFAST read: same file and schema, with the mode option set to FAILFAST.
# This rebinds sales_csv_c_df1.
sales_csv_c_df1 = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(sales_schema_new)
    .option("mode", "FAILFAST")
    .load("./Input/1_sales_orders_bad_records.csv")
)

In [None]:
# The DataFrame shown here was read in FAILFAST mode, so Spark is expected to
# raise once it meets a malformed record while show() reads the file.
# Catch the error and print a short summary instead of letting it propagate,
# so that Restart & Run All can continue to the cells below.
try:
    sales_csv_c_df1.show()
except Exception as failfast_error:  # exact exception class differs across PySpark versions
    print(f"FAILFAST aborted the read: {type(failfast_error).__name__}")
    # Print only the head of the message; the full JVM stack trace is very long.
    print("\n".join(str(failfast_error).splitlines()[:5]))

In [None]:
# Reader options collected in one dictionary so they can be passed to the
# reader together.
options = dict(
    header="true",
    inferSchema="true",
    mode="PERMISSIVE",
)

In [None]:
# Read the sales file with every reader option supplied from the options
# dictionary. This rebinds sales_csv_c_df1.
sales_csv_c_df1 = (
    spark.read
    .format("csv")
    .options(**options)
    .load("./Input/1_sales_orders_bad_records.csv")
)

In [None]:
# Preview the read that was configured through the options dictionary.
sales_csv_c_df1.show()