In [None]:
from pyspark.sql import SparkSession

# Obtain a SparkSession (reusing an existing one if present) with master
# "local[*]" and application name "Create Dataframe".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [None]:
# Echo the SparkSession as the cell's last expression so the notebook shows it.
spark

In [None]:
import os

# Path of the customer CSV relative to the working directory. This must be
# the same path the reader cells pass to `.load(...)`, otherwise the
# absolute path printed below points at a different location.
relative_path = "./Input/2_Customer_Master_Data.csv"

# Resolve against the current working directory to show which file on disk
# the relative path refers to, and whether it is actually there.
absolute_path = os.path.abspath(relative_path)
print("Absolute Path:", absolute_path)
print("File exists:", os.path.exists(absolute_path))

In [None]:
# Read the customer CSV with no reader options set (no header, no schema).
customer_df = (
    spark.read
    .format("csv")
    .load("./Input/2_Customer_Master_Data.csv")
)

In [None]:
# Inspect the schema Spark assigned when the CSV was read without any
# header or schema options.
customer_df.printSchema()

In [None]:
# Preview the DataFrame; show()'s defaults are spelled out here
# (first 20 rows, long cell values truncated).
customer_df.show(n=20, truncate=True)

In [None]:
# Re-read the customer CSV, this time using the first line as the header and
# asking Spark to infer the column types from the data.
customer_df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("./Input/2_Customer_Master_Data.csv")
)

In [None]:
# Inspect the schema produced by the header + inferSchema read.
customer_df.printSchema()

In [None]:
# Preview the rows of the header + inferSchema read
# (show()'s defaults written out: 20 rows, truncated values).
customer_df.show(n=20, truncate=True)

In [None]:
# Explicit schema for the customer CSV, written as a DDL-style string of
# "name type" pairs. It is passed to `.schema(...)` when the file is re-read,
# replacing schema inference.
customer_schema = """
                    CustomerID integer,
                    CustomerName string,
                    CustomerBucket string,
                    CustomerRegion string,
                    LessThanFullTruckLoad string,
                    YearOnYearVolumeIncrease string
                """

In [None]:
# Re-read the customer CSV with the explicitly declared schema instead of
# inferring types; the first line is still treated as the header.
customer_df = (
    spark.read
    .format("csv")
    .schema(customer_schema)
    .option("header", True)
    .load("./Input/2_Customer_Master_Data.csv")
)

In [None]:
# Inspect the schema of the DataFrame read with the explicit customer_schema.
customer_df.printSchema()

In [None]:
# Preview the rows of the explicit-schema read
# (show()'s defaults written out: 20 rows, truncated values).
customer_df.show(n=20, truncate=True)

In [None]:
# Mode
# https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option

# 1. PERMISSIVE  - Default. Bad records are stored in a column named _corrupt_record.
                #   We can rename that column using the option ( columnNameOfCorruptRecord ).
# 2. DROPMALFORMED - Bad records are dropped while reading.
# 3. FAILFAST - The job fails if there are any bad records.
                # It fails as soon as it hits the first bad record.

# Note : Mode works only in the case where a schema has been specified.

In [None]:
# Explicit DDL-style schema for the sales-orders CSV. The trailing
# `bad_records` string column is the one named via the
# `columnNameOfCorruptRecord` option in the PERMISSIVE read that uses this
# schema.
sales_schema = """SalesOrder string, 
                    OrderDate string, 
                    CustomerID integer, 
                    City string, 
                    Country string, 
                    ProductID integer, 
                    Price integer, 
                    Qty_Sold integer, 
                    Qty_Sold_Units string, 
                    Amount integer, 
                    Amount_Currency string, 
                    ShipDate string,
                    OrderStatus string,
                    bad_records string
                    """

In [None]:
# PERMISSIVE read of the sales file with the explicit schema; the corrupt
# record column is named "bad_records" to match the schema's last column.
sales_bad_df = (
    spark.read
    .format("csv")
    .schema(sales_schema)
    .options(
        header=True,
        mode="PERMISSIVE",
        columnNameOfCorruptRecord="bad_records",
    )
    .load("./Input/1_sales_orders_bad_records.csv")
)

In [None]:
# Preview the PERMISSIVE read, including the bad_records column
# (show()'s defaults written out: 20 rows, truncated values).
sales_bad_df.show(n=20, truncate=True)

In [None]:
# Keep only the rows whose bad_records column is populated and print them
# without truncating the cell values.
corrupt_rows = sales_bad_df.filter("bad_records is not null")
corrupt_rows.show(truncate=False)

In [None]:
# Explicit DDL-style schema for the sales-orders CSV, redefined here without
# a corrupt-record column; the DROPMALFORMED and FAILFAST reads use it.
sales_schema = """SalesOrder string, 
                    OrderDate string, 
                    CustomerID integer, 
                    City string, 
                    Country string, 
                    ProductID integer, 
                    Price integer, 
                    Qty_Sold integer, 
                    Qty_Sold_Units string, 
                    Amount integer, 
                    Amount_Currency string, 
                    ShipDate string,
                    OrderStatus string
                    """

In [None]:
# DROPMALFORMED read of the sales file with the explicit schema.
sales_bad_df = (
    spark.read
    .format("csv")
    .schema(sales_schema)
    .options(header=True, mode="DROPMALFORMED")
    .load("./Input/1_sales_orders_bad_records.csv")
)

In [None]:
# Preview the DROPMALFORMED read
# (show()'s defaults written out: 20 rows, truncated values).
sales_bad_df.show(n=20, truncate=True)

In [None]:
# FAILFAST read of the sales file with the explicit schema.
sales_bad_df = (
    spark.read
    .format("csv")
    .schema(sales_schema)
    .options(header=True, mode="FAILFAST")
    .load("./Input/1_sales_orders_bad_records.csv")
)

In [None]:
# With mode FAILFAST the read raises once a malformed record is parsed, which
# happens when this action runs. The failure is the point of the demo, so it
# is caught and displayed instead of halting a Restart & Run All before the
# remaining cells execute.
try:
    sales_bad_df.show()
except Exception as exc:  # concrete exception class depends on the PySpark version
    print(f"FAILFAST read aborted: {type(exc).__name__}")
    # Spark error text can include a long JVM stack trace; show only the start.
    print(str(exc)[:1000])

In [None]:
# Reader options collected in one mapping so they can be applied together
# with `.options(**csv_options)`.
csv_options = dict(
    header="true",
    inferSchema="true",
    sep=",",
)

In [None]:
# Read the sales file again, supplying all reader options from csv_options.
sales_bad_df = (
    spark.read
    .format("csv")
    .options(**csv_options)
    .load("./Input/1_sales_orders_bad_records.csv")
)

In [None]:
# Preview the read driven by csv_options
# (show()'s defaults written out: 20 rows, truncated values).
sales_bad_df.show(n=20, truncate=True)