<a href="https://colab.research.google.com/github/Subramaniya-pillai/data_engineering/blob/main/sales_set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install pandas pyspark dask




In [14]:
from pyspark.sql import SparkSession

# Location of the sales CSV; change this one constant if the file lives elsewhere.
SALES_CSV_PATH = '/content/drive/MyDrive/Sales_Dataset__500_Records_.csv'

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName("SalesData").getOrCreate()

# Load CSV: the first row is the header and column types are inferred from the data.
df_spark = spark.read.csv(SALES_CSV_PATH, header=True, inferSchema=True)

# First 5 records
df_spark.show(5)

# Last 5 records: DataFrame.tail(n) returns the final n rows as a list of Row objects.
# (Sorting by OrderID descending would show the five largest IDs instead, which are
# not the last rows of the file.) Rebuild a small DataFrame to keep the table display.
spark.createDataFrame(df_spark.tail(5), schema=df_spark.schema).show()

# Schema
df_spark.printSchema()


+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

# **2. Selection, Renaming, and Filtering**

In [23]:
# `col` is needed by the filters below; importing it here keeps the cell runnable
# on a fresh kernel instead of relying on an import made in a later cell.
from pyspark.sql.functions import col

# Keep only the identifying columns and expose Amount under the name OrderAmount.
df_selected = df_spark.select("OrderID", "CustomerName", "Amount")
df_renamed = df_selected.withColumnRenamed("Amount", "OrderAmount")
df_renamed.show()  # Shows the renamed DataFrame

# Orders whose Amount is strictly greater than 500.
df_filtered_amount = df_spark.filter(col("Amount") > 500)
df_filtered_amount.show()  # Show filtered result

# Orders whose City equals "New York" exactly.
df_filtered_city = df_spark.filter(col("City") == "New York")
df_filtered_city.show()  # Show filtered result


+-------+------------------+-----------+
|OrderID|      CustomerName|OrderAmount|
+-------+------------------+-----------+
|   2824|     Donald Walker|     783.04|
|   7912|      Brandon Hall|      905.0|
|   4611|      Donald Booth|     657.96|
|   3547|    Phillip Garcia|     606.89|
|   8527|      Valerie Gray|      77.87|
|   4150|       Amber Perez|     352.37|
|   5554|        Roy Martin|     148.33|
|   2169|    Carolyn Daniel|      14.09|
|   6313|       Patty Perez|      79.83|
|   6155|Jonathan Wilkerson|     882.68|
|   9830|       Kevin Hurst|     870.55|
|   9085| Anthony Rodriguez|     921.73|
|   2040|     Kyle Mcdonald|     327.52|
|   6573|    Jeffrey Chavez|     676.02|
|   2743|  Elizabeth Fowler|      47.06|
|   9837|     Tammy Sellers|      46.15|
|   6038|     David Bradley|     348.51|
|   3060|       John Pierce|     362.09|
|   4295|   Jennifer Powers|     684.26|
|   5061|    George Chapman|     251.89|
+-------+------------------+-----------+
only showing top

# **3. Data Manipulation**

In [24]:
from pyspark.sql.functions import col, expr, when

# Column handles reused by the expressions below.
amount_col = col("Amount")
discount_col = col("Discount")
status_col = col("DeliveryStatus")

# FinalAmount is what remains of Amount after subtracting the discounted share (Amount * Discount).
df_spark = df_spark.withColumn("FinalAmount", amount_col - (amount_col * discount_col))

# Largest FinalAmount first.
df_sorted = df_spark.orderBy("FinalAmount", ascending=False)
df_sorted.select("OrderID", "Amount", "Discount", "FinalAmount").show()

# Relabel "Cancelled" as "Order Cancelled"; every other status passes through unchanged.
relabelled_status = when(status_col == "Cancelled", "Order Cancelled").otherwise(status_col)
df_updated = df_sorted.withColumn("DeliveryStatus", relabelled_status)

# Show updated delivery statuses
df_updated.select("OrderID", "DeliveryStatus").show()


+-------+------+--------+-----------------+
|OrderID|Amount|Discount|      FinalAmount|
+-------+------+--------+-----------------+
|   5573|981.05|    0.02|          961.429|
|   8474|968.91|    0.02|         949.5318|
|   8889| 998.3|    0.06|938.4019999999999|
|   2127|933.32|    0.01|         923.9868|
|   9806|993.17|    0.07|         923.6481|
|   5593|961.35|    0.05|         913.2825|
|   2120|948.84|    0.04|         910.8864|
|   5949|918.14|    0.01|908.9585999999999|
|   1422| 973.2|    0.07|          905.076|
|   2904|922.29|    0.02|         903.8442|
|   7566|899.31|     0.0|           899.31|
|   7511|932.21|    0.04|         894.9216|
|   9085|921.73|    0.03|894.0781000000001|
|   1436|978.96|    0.09|         890.8536|
|   6008|903.71|    0.02|         885.6358|
|   9834|944.55|    0.07|878.4314999999999|
|   8253|998.21|    0.12|         878.4248|
|   7912| 905.0|    0.03|           877.85|
|   1654|903.78|    0.03|         876.6666|
|   9239|897.41|    0.04|       

# **4. Aggregations and GroupBy**

In [17]:
# How many orders fall under each DeliveryStatus value.
orders_per_status = df_spark.groupBy("DeliveryStatus").count()
orders_per_status.show()

# Mean of Amount within each ProductCategory.
avg_amount_per_category = df_spark.groupBy("ProductCategory").avg("Amount")
avg_amount_per_category.show()

# Sum of Amount (total sales) within each City.
total_sales_per_city = df_spark.groupBy("City").sum("Amount")
total_sales_per_city.show()


+---------------+-----+
| DeliveryStatus|count|
+---------------+-----+
|       Returned|  117|
|      Delivered|  119|
|Order Cancelled|  149|
|        Pending|  115|
+---------------+-----+

+---------------+------------------+
|ProductCategory|       avg(Amount)|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+-----------+
|            City|sum(Amount)|
+----------------+-----------+
|     Ramseymouth|     761.06|
|East Edwardshire|     291.26|
|      Thomasberg|     882.68|
|     Laurenville|     383.26|
| South Colinstad|     786.27|
|    Lake Douglas|     975.09|
|   Williamsmouth|      10.78|
|      Gordonport|     514.99|
|  West Dawnmouth|       12.8|
|        Seanbury|     814.39|
|     Sheilaville|     981.05|
|       Mollybury|     222.02|
|       

# **5. Null Handling & Update**

In [25]:
from pyspark.sql.functions import when, col, lit

# Set City to null on every row whose OrderID is below 10; other rows keep their City.
city_or_null = when(col("OrderID") < 10, lit(None)).otherwise(col("City"))
df_with_nulls = df_spark.withColumn("City", city_or_null)

# Option A: keep every row and substitute "Unknown" wherever City is null.
df_filled = df_with_nulls.na.fill({"City": "Unknown"})
df_filled.select("OrderID", "City").show()

# Option B: starting from the un-filled frame, discard the rows whose City is null.
df_dropped = df_with_nulls.na.drop(subset=["City"])
df_dropped.select("OrderID", "City").show()

# Label each row "High-Value" when Amount is above 800, otherwise "Regular".
customer_tag = when(col("Amount") > 800, "High-Value").otherwise("Regular")
df_tagged = df_filled.withColumn("CustomerTag", customer_tag)
df_tagged.select("OrderID", "Amount", "CustomerTag").show()


+-------+-----------------+
|OrderID|             City|
+-------+-----------------+
|   2824|     Lake Joyside|
|   7912|    New Jamesside|
|   4611|     Lake Roberto|
|   3547| West Melanieview|
|   8527|        Mariastad|
|   4150|  Port Jesseville|
|   5554|      Lake Joseph|
|   2169|         Grayside|
|   6313|      Richardland|
|   6155|       Thomasberg|
|   9830|      Jeffreyberg|
|   9085|        Port Erin|
|   2040|Lake Jenniferside|
|   6573|      Teresaburgh|
|   2743|   East Nathaniel|
|   9837|       Tracyville|
|   6038|    Lake Toddland|
|   3060|       Brandtside|
|   4295|         Lammouth|
|   5061|       North Chad|
+-------+-----------------+
only showing top 20 rows

+-------+-----------------+
|OrderID|             City|
+-------+-----------------+
|   2824|     Lake Joyside|
|   7912|    New Jamesside|
|   4611|     Lake Roberto|
|   3547| West Melanieview|
|   8527|        Mariastad|
|   4150|  Port Jesseville|
|   5554|      Lake Joseph|
|   2169|         Gray

# **6. Date & Time Functions**


In [26]:
from pyspark.sql.functions import to_date, year, month, current_date, datediff, months_between, col

# Convert OrderDate and CustomerSince to proper date columns
df_spark = df_spark.withColumn("OrderDate", to_date(col("OrderDate"))) \
                   .withColumn("CustomerSince", to_date(col("CustomerSince")))

# Extract year and month from OrderDate
df_spark = df_spark.withColumn("Year", year(col("OrderDate"))) \
                   .withColumn("Month", month(col("OrderDate")))

# LoyaltyYears = whole years elapsed between CustomerSince and today.
# months_between is calendar-aware; dividing a day count by 365 ignores leap days and
# can report an extra year just before an anniversary (e.g. 1460 days spanning a
# leap day is one day short of four years, yet 1460 / 365 == 4).
df_spark = df_spark.withColumn(
    "LoyaltyYears",
    (months_between(current_date(), col("CustomerSince")) / 12).cast("int"),
)

# Show key output
df_spark.select("CustomerName", "CustomerSince", "LoyaltyYears", "Year", "Month").show()


+------------------+-------------+------------+----+-----+
|      CustomerName|CustomerSince|LoyaltyYears|Year|Month|
+------------------+-------------+------------+----+-----+
|     Donald Walker|   2020-10-15|           4|2024|   12|
|      Brandon Hall|   2022-03-15|           3|2024|    9|
|      Donald Booth|   2021-08-07|           3|2025|    1|
|    Phillip Garcia|   2020-08-08|           4|2024|    3|
|      Valerie Gray|   2022-11-15|           2|2024|    8|
|       Amber Perez|   2022-01-13|           3|2024|    1|
|        Roy Martin|   2023-04-29|           2|2024|    3|
|    Carolyn Daniel|   2021-05-09|           4|2023|   10|
|       Patty Perez|   2021-04-25|           4|2023|    6|
|Jonathan Wilkerson|   2021-06-20|           3|2024|   10|
|       Kevin Hurst|   2022-08-02|           2|2024|    4|
| Anthony Rodriguez|   2022-12-15|           2|2024|   10|
|     Kyle Mcdonald|   2021-07-21|           3|2023|   12|
|    Jeffrey Chavez|   2022-07-30|           2|2024|   1

# **7. Joins and Unions**

In [20]:
# Small lookup table that maps a City to its Region.
region_data = [
    ("New York", "East"),
    ("Los Angeles", "West"),
    ("Chicago", "Central"),
]
region_df = spark.createDataFrame(region_data, schema=["City", "Region"])

# Inner join keeps only orders whose City is in the lookup;
# left join keeps every order and leaves Region null when there is no match.
df_inner = df_spark.join(region_df, "City", "inner")
df_left = df_spark.join(region_df, "City", "left")

# Split the orders by Year and stack the 2023 and 2024 slices back together,
# matching columns by name.
df_2023 = df_spark.where(df_spark["Year"] == 2023)
df_2024 = df_spark.where(df_spark["Year"] == 2024)
df_union = df_2023.unionByName(df_2024)


# **8. Complex JSON Simulation**

In [21]:
from pyspark.sql.functions import to_json, struct, from_json, schema_of_json

# Pack every column of a row into one struct and serialise it as a JSON string.
row_struct = struct(*df_spark.columns)
df_json = df_spark.withColumn("order_json", to_json(row_struct))

# Take the JSON of the first row as the sample from which the schema is inferred.
first_row = df_json.select("order_json").first()
sample_json = first_row["order_json"]
inferred_schema = schema_of_json(sample_json)

# Decode the JSON string back into a struct column named "parsed".
parsed_struct = from_json(col("order_json"), inferred_schema)
df_parsed = df_json.withColumn("parsed", parsed_struct)

# Read individual fields out of the parsed struct.
parsed_fields = ["parsed.OrderID", "parsed.CustomerName", "parsed.Amount"]
df_parsed.select(*parsed_fields).show(5)


+-------+--------------+------+
|OrderID|  CustomerName|Amount|
+-------+--------------+------+
|   2824| Donald Walker|783.04|
|   7912|  Brandon Hall| 905.0|
|   4611|  Donald Booth|657.96|
|   3547|Phillip Garcia|606.89|
|   8527|  Valerie Gray| 77.87|
+-------+--------------+------+
only showing top 5 rows



# **9. Applying Functions**

In [27]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Define UDF for order tagging
def tag_order(amount, big_threshold=800, medium_threshold=400):
    """Classify an order amount into a size label.

    Args:
        amount: Order amount, or None when the value is missing.
        big_threshold: Amounts strictly greater than this are "Big".
            Defaults to 800.
        medium_threshold: Amounts strictly greater than this (and not "Big")
            are "Medium". Defaults to 400.

    Returns:
        "Unknown" when amount is None, otherwise "Big", "Medium" or "Small".
    """
    # A missing amount cannot be compared, so it gets its own label.
    if amount is None:
        return "Unknown"
    if amount > big_threshold:
        return "Big"
    if amount > medium_threshold:
        return "Medium"
    return "Small"

# Wrap tag_order as a Spark UDF whose return type is a string.
tag_order_udf = udf(tag_order, returnType=StringType())

# Add an OrderTag column by passing each row's Amount through the UDF.
df_spark = df_spark.withColumn("OrderTag", tag_order_udf("Amount"))

# Preview the tag next to the amount it was derived from.
order_tag_preview = df_spark.select("OrderID", "Amount", "OrderTag")
order_tag_preview.show()


+-------+------+--------+
|OrderID|Amount|OrderTag|
+-------+------+--------+
|   2824|783.04|  Medium|
|   7912| 905.0|     Big|
|   4611|657.96|  Medium|
|   3547|606.89|  Medium|
|   8527| 77.87|   Small|
|   4150|352.37|   Small|
|   5554|148.33|   Small|
|   2169| 14.09|   Small|
|   6313| 79.83|   Small|
|   6155|882.68|     Big|
|   9830|870.55|     Big|
|   9085|921.73|     Big|
|   2040|327.52|   Small|
|   6573|676.02|  Medium|
|   2743| 47.06|   Small|
|   9837| 46.15|   Small|
|   6038|348.51|   Small|
|   3060|362.09|   Small|
|   4295|684.26|  Medium|
|   5061|251.89|   Small|
+-------+------+--------+
only showing top 20 rows

