In [1]:
# Create Spark Session
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Factor of cores") \
    .master("local[*]") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "1G") \
    .config("spark.driver.memory", "4G") \
    .getOrCreate()
spark

In [None]:
spark

In [2]:
sales_schema = """SalesOrder string, 
                    OrderDate string, 
                    CustomerName string, 
                    City string, 
                    Country string, 
                    Product string, 
                    Price string, 
                    Qty_Sold string, 
                    Qty_Sold_Units string, 
                    Amount integer, 
                    Amount_Currency string, 
                    ShipDate string"""

In [4]:
sales_df = spark.read.format("csv").option("header",True).schema(sales_schema).load("./Input/1_sales_orders.csv")

In [5]:
sales_df.show()

+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+
|SalesOrder| OrderDate|CustomerName|     City|Country|Product|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|  ShipDate|
+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+
|     SO101|08-01-2024|           1|Hyderabad|  India|      1| 3000|       4|        pieces| 12000|            INR|18-01-2024|
|     SO102|19-01-2024|           2|   Mumbai|  India|      2| 1500|       5|        pieces|  7500|            INR|29-01-2024|
|     SO103|01-01-2024|           3|     Pune|  India|      3| 1500|      10|        pieces| 15000|            INR|11-01-2024|
|     SO104|30-03-2024|           2|     Pune|  India|      4|10000|       8|        pieces| 80000|            INR|09-04-2024|
|     SO105|14-03-2024|           4|Ahmedabad|  India|      5| 8000|      10|        pieces| 80000|            

In [6]:
sales_df.createTempView("sales_view")

In [7]:
spark.sql(""" select *, Amount * 0.10 as amountaftertax 
            from sales_view """).show()

+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+--------------+
|SalesOrder| OrderDate|CustomerName|     City|Country|Product|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|  ShipDate|amountaftertax|
+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+--------------+
|     SO101|08-01-2024|           1|Hyderabad|  India|      1| 3000|       4|        pieces| 12000|            INR|18-01-2024|       1200.00|
|     SO102|19-01-2024|           2|   Mumbai|  India|      2| 1500|       5|        pieces|  7500|            INR|29-01-2024|        750.00|
|     SO103|01-01-2024|           3|     Pune|  India|      3| 1500|      10|        pieces| 15000|            INR|11-01-2024|       1500.00|
|     SO104|30-03-2024|           2|     Pune|  India|      4|10000|       8|        pieces| 80000|            INR|09-04-2024|       8000.00|
|     

In [9]:
from pyspark.sql.functions import expr
sales_df.withColumn("Amountaftertax", expr("Amount * 0.10") ).show()

+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+--------------+
|SalesOrder| OrderDate|CustomerName|     City|Country|Product|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|  ShipDate|Amountaftertax|
+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+--------------+
|     SO101|08-01-2024|           1|Hyderabad|  India|      1| 3000|       4|        pieces| 12000|            INR|18-01-2024|       1200.00|
|     SO102|19-01-2024|           2|   Mumbai|  India|      2| 1500|       5|        pieces|  7500|            INR|29-01-2024|        750.00|
|     SO103|01-01-2024|           3|     Pune|  India|      3| 1500|      10|        pieces| 15000|            INR|11-01-2024|       1500.00|
|     SO104|30-03-2024|           2|     Pune|  India|      4|10000|       8|        pieces| 80000|            INR|09-04-2024|       8000.00|
|     

In [10]:
def sales_after_tax(amount):
    return (amount + (amount * 0.10))

In [15]:
import time 
def sales_after_tax(amount):
    time.sleep(10)
    return (amount + (amount * 0.10))

In [11]:
from pyspark.sql.functions import udf

sales_tax_udf = udf(sales_after_tax)

In [12]:


# from pyspark.sql.functions import expr
# sales_df.withColumn("Amountaftertax", expr("Amount * 0.10") ).show()
sales_df.withColumn("Amountaftertax",sales_tax_udf("Amount")).show()

+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+--------------+
|SalesOrder| OrderDate|CustomerName|     City|Country|Product|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|  ShipDate|Amountaftertax|
+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+--------------+
|     SO101|08-01-2024|           1|Hyderabad|  India|      1| 3000|       4|        pieces| 12000|            INR|18-01-2024|       13200.0|
|     SO102|19-01-2024|           2|   Mumbai|  India|      2| 1500|       5|        pieces|  7500|            INR|29-01-2024|        8250.0|
|     SO103|01-01-2024|           3|     Pune|  India|      3| 1500|      10|        pieces| 15000|            INR|11-01-2024|       16500.0|
|     SO104|30-03-2024|           2|     Pune|  India|      4|10000|       8|        pieces| 80000|            INR|09-04-2024|       88000.0|
|     

In [16]:
# spark sql
from pyspark.sql.functions import udf

spark.udf.register("_fn_cal_amt_after_tax",sales_after_tax, "double")


<function __main__.sales_after_tax(amount)>

In [17]:
spark.sql(""" select *, _fn_cal_amt_after_tax(Amount) as amountaftertax 
            from sales_view """).show()

+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+--------------+
|SalesOrder| OrderDate|CustomerName|     City|Country|Product|Price|Qty_Sold|Qty_Sold_Units|Amount|Amount_Currency|  ShipDate|amountaftertax|
+----------+----------+------------+---------+-------+-------+-----+--------+--------------+------+---------------+----------+--------------+
|     SO101|08-01-2024|           1|Hyderabad|  India|      1| 3000|       4|        pieces| 12000|            INR|18-01-2024|       13200.0|
|     SO102|19-01-2024|           2|   Mumbai|  India|      2| 1500|       5|        pieces|  7500|            INR|29-01-2024|        8250.0|
|     SO103|01-01-2024|           3|     Pune|  India|      3| 1500|      10|        pieces| 15000|            INR|11-01-2024|       16500.0|
|     SO104|30-03-2024|           2|     Pune|  India|      4|10000|       8|        pieces| 80000|            INR|09-04-2024|       88000.0|
|     

In [None]:
sales_df.createOrReplaceTempView("sales_view")

In [None]:
spark.sql("select * from sales_view").show(truncate=False)

In [None]:
spark.sql("select *,sales_tax_function(Amount) as AmountAfterTax from sales_view").show(truncate=False)

In [18]:
spark.stop()