In [92]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

In [2]:
spark = SparkSession.builder.appName("deo").getOrCreate()

In [3]:
spark

# **DataFrame Creation and Inspection**

In [28]:
# 1.1 Load the CSV using Pandas, PySpark, and Dask.

df = spark.read.csv("/content/drive/MyDrive/Datasets/Copy of Sales_Dataset__500_Records_.csv", inferSchema=True, header=True)

In [29]:
# 1.2 Display the first 5 and last 5 records.
df.show(5)
spark.createDataFrame(df.tail(5)).show()

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [30]:
# 1.3 Print schema and check data types.
df.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- ProductCategory: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- DeliveryStatus: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- City: string (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- CustomerSince: date (nullable = true)



# **Selection, Renaming, and Filtering**

In [31]:
# 2.1 Select only OrderID , CustomerName , and Amount
df.select(["OrderID", "CustomerName", "Amount"]).show()

+-------+------------------+------+
|OrderID|      CustomerName|Amount|
+-------+------------------+------+
|   2824|     Donald Walker|783.04|
|   7912|      Brandon Hall| 905.0|
|   4611|      Donald Booth|657.96|
|   3547|    Phillip Garcia|606.89|
|   8527|      Valerie Gray| 77.87|
|   4150|       Amber Perez|352.37|
|   5554|        Roy Martin|148.33|
|   2169|    Carolyn Daniel| 14.09|
|   6313|       Patty Perez| 79.83|
|   6155|Jonathan Wilkerson|882.68|
|   9830|       Kevin Hurst|870.55|
|   9085| Anthony Rodriguez|921.73|
|   2040|     Kyle Mcdonald|327.52|
|   6573|    Jeffrey Chavez|676.02|
|   2743|  Elizabeth Fowler| 47.06|
|   9837|     Tammy Sellers| 46.15|
|   6038|     David Bradley|348.51|
|   3060|       John Pierce|362.09|
|   4295|   Jennifer Powers|684.26|
|   5061|    George Chapman|251.89|
+-------+------------------+------+
only showing top 20 rows



In [32]:
# 2.2 Rename Amount to OrderAmount
df = df.withColumnRenamed("Amount", "OrderAmount")
df.show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   8527|      Valerie Gray|           Toys|    

In [33]:
# 2.3 Filter orders where Amount > 500.
df.filter(df.OrderAmount > 500).show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   6155|Jonathan Wilkerson|        Fashion|    

In [34]:
# 2.4 Filter orders from a specific city using .query() or .filter().
df.filter(df.City == 'Lammouth').show()

+-------+---------------+---------------+-----------+----------+--------------+--------+--------+-----------+-------------+
|OrderID|   CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|    City|PaymentMode|CustomerSince|
+-------+---------------+---------------+-----------+----------+--------------+--------+--------+-----------+-------------+
|   4295|Jennifer Powers|    Electronics|     684.26|2024-03-19|     Cancelled|    0.21|Lammouth|        UPI|   2020-12-06|
+-------+---------------+---------------+-----------+----------+--------------+--------+--------+-----------+-------------+



# **Data Manipulation**

In [35]:
# 3.1 Drop CustomerSince column.
df.drop("CustomerSince").show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|
|   8527|      Valerie Gray|           Toys|      77.87|2024-08-04|     Delivered|    0.17|        Mariastad|       Cash|
|   4150|       Amber Pe

In [37]:
# 3.2 Add a new column FinalAmount = Amount - (Amount * Discount)
df = df.withColumn("FinalAmount", F.round(df["OrderAmount"] - (df["OrderAmount"] * df["Discount"]), 2))
df.show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|FinalAmount|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|     665.58|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|     877.85|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|     651.38|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|   

In [39]:
# 3.3 Sort by FinalAmount descending.
df.sort("FinalAmount", ascending=False).show()

+-------+-----------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+
|OrderID|     CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|FinalAmount|
+-------+-----------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+
|   5573|   Jordan Frazier|          Books|     981.05|2025-03-19|     Cancelled|    0.02|      Sheilaville|       Cash|   2021-07-12|     961.43|
|   8474|      Heidi Brown|    Electronics|     968.91|2023-11-23|     Cancelled|    0.02|       Riverafort|       Cash|   2023-03-19|     949.53|
|   8889|      Karen Garza|          Books|      998.3|2024-10-17|     Cancelled|    0.06|       Johnsonton|Credit Card|   2020-12-17|      938.4|
|   2127|     Jaclyn Moore|      Groceries|     933.32|2025-03-11|      Returned|    0.01|      Cherylhaven|       Cas

In [42]:
# 3.4 Replace all “Cancelled” status with “Order Cancelled”.
df.replace("Cancelled", "Order Cancelled", subset="DeliveryStatus").show()

+-------+------------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-------------+-----------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate| DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|FinalAmount|
+-------+------------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-------------+-----------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|       Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|     665.58|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|Order Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|     877.85|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|       Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|     651.38|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|       Returned|    0.15| West Melaniev

# **Aggregations and GroupBy**

In [43]:
# 4.1 Count of orders by DeliveryStatus.
df.groupBy("DeliveryStatus").agg(F.count("OrderID").alias("OrderCounts")).show()

+--------------+-----------+
|DeliveryStatus|OrderCounts|
+--------------+-----------+
|      Returned|        117|
|     Cancelled|        149|
|     Delivered|        119|
|       Pending|        115|
+--------------+-----------+



In [46]:
# 4.2 Average Amount by ProductCategory.
df.groupBy("ProductCategory").agg(F.round(F.avg("OrderAmount"), 2).alias("Avergage")).show()

+---------------+--------+
|ProductCategory|Avergage|
+---------------+--------+
|        Fashion|  500.63|
|      Groceries|  459.52|
|    Electronics|  551.75|
|          Books|   568.6|
|           Toys|  534.28|
+---------------+--------+



In [47]:
# 4.3 Group by City and show total sales.
df.groupby("City").agg(F.round(F.sum("OrderAmount"), 2).alias("Total Sales")).show()

+----------------+-----------+
|            City|Total Sales|
+----------------+-----------+
|     Ramseymouth|     761.06|
|East Edwardshire|     291.26|
|      Thomasberg|     882.68|
|     Laurenville|     383.26|
| South Colinstad|     786.27|
|    Lake Douglas|     975.09|
|   Williamsmouth|      10.78|
|      Gordonport|     514.99|
|  West Dawnmouth|       12.8|
|        Seanbury|     814.39|
|     Sheilaville|     981.05|
|       Mollybury|     222.02|
|       Lisaville|      45.69|
| Lake Jerrymouth|     404.01|
|       Perezfort|     917.55|
|Port Nicoleshire|     133.78|
|  South Samantha|     229.46|
|     Port Willie|     788.13|
|     Waltersfort|     552.81|
|       Youngbury|     372.95|
+----------------+-----------+
only showing top 20 rows



# **Null Handling and Update**

In [56]:
# 5.1 Intentionally inject nulls in City column and handle them using fillna() , dropna()
intentional_data = [
    Row(OrderID=3001, CustomerName="Tharun", ProductCategory="Electronics", OrderAmount=999.99, OrderDate="2025-01-01", DeliveryStatus="Delivered", Discount=0.1, City=None, PaymentMode="UPI", CustomerSince="2019-03-10", FinalAmount=899.99),
    Row(OrderID=3002, CustomerName="Atithya", ProductCategory="Clothing", OrderAmount=459.50, OrderDate="2025-01-02", DeliveryStatus="Pending", Discount=0.05, City=None, PaymentMode="Cash", CustomerSince="2020-07-22", FinalAmount=436.53),
    Row(OrderID=3003, CustomerName="Lara Tyber", ProductCategory="Books", OrderAmount=120.00, OrderDate="2025-01-03", DeliveryStatus="Delivered", Discount=0.0, City=None, PaymentMode="Net Banking", CustomerSince="2018-12-05", FinalAmount=120.00),
    Row(OrderID=3004, CustomerName="Eren", ProductCategory="Toys", OrderAmount=340.00, OrderDate="2025-01-04", DeliveryStatus="Cancelled", Discount=0.2, City=None, PaymentMode="Credit Card", CustomerSince="2021-01-01", FinalAmount=272.00),
    Row(OrderID=3005, CustomerName="AK", ProductCategory="Home", OrderAmount=800.00, OrderDate="2025-01-05", DeliveryStatus="Delivered", Discount=0.1, City=None, PaymentMode="Debit Card", CustomerSince="2022-06-18", FinalAmount=720.00),
]
# intentional_df = spark.createDataFrame(intentional_data)

# df = df.union(intentional_df)

df.show(5)

df.fillna("Chennai").show(5)

df.dropna().show(5)

+-------+--------------+---------------+-----------+----------+--------------+--------+----------------+-----------+-------------+-----------+
|OrderID|  CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|FinalAmount|
+-------+--------------+---------------+-----------+----------+--------------+--------+----------------+-----------+-------------+-----------+
|   2824| Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|     665.58|
|   7912|  Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|     877.85|
|   4611|  Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|     651.38|
|   3547|Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|     515.86|

In [62]:
#5.2 Use .when().otherwise() in PySpark to tag high-value customers ( Amount > 800 ).
df.withColumn("Label", F.when(df.OrderAmount > 800, "High-Value Customers").otherwise("-1")).show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+--------------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|FinalAmount|               Label|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+--------------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|     665.58|                  -1|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|     877.85|High-Value Customers|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|     651.38|        

# **Date & Time Functions**

In [63]:
# 6.1 Extract year and month from OrderDate .
df.select("OrderDate", F.year("OrderDate").alias("Year"), F.month("OrderDate").alias("Month")).show()

+----------+----+-----+
| OrderDate|Year|Month|
+----------+----+-----+
|2024-12-26|2024|   12|
|2024-09-12|2024|    9|
|2025-01-12|2025|    1|
|2024-03-24|2024|    3|
|2024-08-04|2024|    8|
|2024-01-13|2024|    1|
|2024-03-04|2024|    3|
|2023-10-07|2023|   10|
|2023-06-27|2023|    6|
|2024-10-14|2024|   10|
|2024-04-08|2024|    4|
|2024-10-02|2024|   10|
|2023-12-15|2023|   12|
|2024-12-14|2024|   12|
|2024-09-18|2024|    9|
|2024-09-10|2024|    9|
|2023-08-03|2023|    8|
|2023-12-25|2023|   12|
|2024-03-19|2024|    3|
|2023-11-28|2023|   11|
+----------+----+-----+
only showing top 20 rows



In [66]:
# 6.2 Calculate customer loyalty in years = today - CustomerSince .
df.withColumn("Customer Loyalty", F.round((F.date_diff(F.current_date(), df["CustomerSince"])/365))).show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+----------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|FinalAmount|Customer Loyalty|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+----------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|     665.58|             5.0|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|     877.85|             3.0|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|     651.38|             4.0|
|   3547| 

# **Joins and Unions**

In [68]:
# 7.1 Create a second DataFrame with city-wise region mapping.
data = [
    Row(City="Lake Roberto", Region="East"),
    Row(City="Port Jesseville", Region="West"),
    Row(City="Grayside", Region="South"),
    Row(City="Lake Joseph", Region="North"),
    Row(City="Mariastad", Region="West")
]

second_df = spark.createDataFrame(data)
second_df.show()

+---------------+------+
|           City|Region|
+---------------+------+
|   Lake Roberto|  East|
|Port Jesseville|  West|
|       Grayside| South|
|    Lake Joseph| North|
|      Mariastad|  West|
+---------------+------+



In [70]:
# 7.2 Perform inner and left joins with the main dataset.
df.join(second_df, how="inner", on="City").show()
df.join(second_df, how="left", on="city").show()

+---------------+-------+--------------+---------------+-----------+----------+--------------+--------+-----------+-------------+-----------+------+
|           City|OrderID|  CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|FinalAmount|Region|
+---------------+-------+--------------+---------------+-----------+----------+--------------+--------+-----------+-------------+-----------+------+
|Port Jesseville|   4150|   Amber Perez|          Books|     352.37|2024-01-13|     Cancelled|    0.24|       Cash|   2022-01-13|      267.8|  West|
|   Lake Roberto|   4611|  Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Wallet|   2021-08-07|     651.38|  East|
|    Lake Joseph|   5554|    Roy Martin|           Toys|     148.33|2024-03-04|     Cancelled|    0.27|       Cash|   2023-04-29|     108.28| North|
|       Grayside|   2169|Carolyn Daniel|    Electronics|      14.09|2023-10-07|     Delivered|    0.25|Cre

In [71]:
# 7.3 Union two datasets: e.g., orders from 2023 and 2024.
df_2023 = df.filter(F.year(df.OrderDate) == 2023)
df_2024 = df.filter(F.year(df.OrderDate) == 2024)

df_2023.union(df_2024).show()

+-------+----------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+
|OrderID|    CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|FinalAmount|
+-------+----------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+
|   2169|  Carolyn Daniel|    Electronics|      14.09|2023-10-07|     Delivered|    0.25|         Grayside|Credit Card|   2021-05-09|      10.57|
|   6313|     Patty Perez|      Groceries|      79.83|2023-06-27|     Cancelled|    0.12|      Richardland|Credit Card|   2021-04-25|      70.25|
|   2040|   Kyle Mcdonald|           Toys|     327.52|2023-12-15|      Returned|    0.06|Lake Jenniferside|     Wallet|   2021-07-21|     307.87|
|   6038|   David Bradley|        Fashion|     348.51|2023-08-03|      Returned|    0.23|    Lake Toddland|        UPI|   20

# **Complex JSON Simulation**

In [81]:
# 8.1 Convert each order to a JSON string and load it back into a DataFrame.
jso = df.toJSON()
jso.saveAsTextFile("jso.json")
dfJSON = spark.read.json(jso)
dfJSON.show()

+-----------------+------------------+-------------+--------------+--------+-----------+-----------+----------+-------+-----------+---------------+
|             City|      CustomerName|CustomerSince|DeliveryStatus|Discount|FinalAmount|OrderAmount| OrderDate|OrderID|PaymentMode|ProductCategory|
+-----------------+------------------+-------------+--------------+--------+-----------+-----------+----------+-------+-----------+---------------+
|     Lake Joyside|     Donald Walker|   2020-10-15|      Returned|    0.15|     665.58|     783.04|2024-12-26|   2824|Credit Card|          Books|
|    New Jamesside|      Brandon Hall|   2022-03-15|     Cancelled|    0.03|     877.85|      905.0|2024-09-12|   7912|     Wallet|      Groceries|
|     Lake Roberto|      Donald Booth|   2021-08-07|      Returned|    0.01|     651.38|     657.96|2025-01-12|   4611|     Wallet|        Fashion|
| West Melanieview|    Phillip Garcia|   2020-08-08|      Returned|    0.15|     515.86|     606.89|2024-03-24| 

In [85]:
# 8.2 Access nested fields using explode() and get_json_object().
print(F.get_json_object("/content/jso.json/part-00000", "$.CustomerName"))
print(F.explode(r"/content/jso.json/part-00000"))

jf = spark.read.json(r"/content/jso.json/part-00000")
jf.show()

Column<'get_json_object(/content/jso.json/part-00000, $.CustomerName)'>
Column<'explode(/content/jso.json/part-00000)'>
+-----------------+------------------+-------------+--------------+--------+-----------+-----------+----------+-------+-----------+---------------+
|             City|      CustomerName|CustomerSince|DeliveryStatus|Discount|FinalAmount|OrderAmount| OrderDate|OrderID|PaymentMode|ProductCategory|
+-----------------+------------------+-------------+--------------+--------+-----------+-----------+----------+-------+-----------+---------------+
|     Lake Joyside|     Donald Walker|   2020-10-15|      Returned|    0.15|     665.58|     783.04|2024-12-26|   2824|Credit Card|          Books|
|    New Jamesside|      Brandon Hall|   2022-03-15|     Cancelled|    0.03|     877.85|      905.0|2024-09-12|   7912|     Wallet|      Groceries|
|     Lake Roberto|      Donald Booth|   2021-08-07|      Returned|    0.01|     651.38|     657.96|2025-01-12|   4611|     Wallet|        F

# **Applying Functions**

In [95]:
# 9.1 Create a function to tag orders: “Big”, “Medium”, “Small” based on Amount.

def tagger(amt, avg):
    if amt == avg:
        return "Medium"
    elif amt > avg:
        return "Big"
    else:
        return "Small"

In [98]:
# 9.2 Apply it using .apply() in Pandas, and UDF in PySpark.
func_1 = F.udf(tagger, StringType())

df.withColumn("tags", func_1(df['OrderAmount'], F.lit(300))).show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+-----+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|FinalAmount| tags|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+-----------+-----+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|     665.58|  Big|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|     877.85|  Big|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|     651.38|  Big|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Ret