In [91]:
from google.colab import drive
from pyspark.sql import SparkSession

# Make the Google Drive folder that holds the dataset visible to this runtime.
drive.mount('/content/drive')

# Start (or reuse) the Spark session and load the sales CSV. The first line of
# the file is used as the header and column types are inferred from the data.
spark = SparkSession.builder.appName("SalesData").getOrCreate()
df = spark.read.csv("/content/drive/MyDrive/Sales_dataset.csv", header=True, inferSchema=True)

# Display the first 5 records.
df.show(5)

# Display the last 5 records.
# DataFrame.tail(n) (Spark 3.0+) brings the final n rows to the driver as a
# list of Row objects, so no separate count() / RDD index pass is needed.
for row in df.tail(5):
    print(row)

# Print the inferred schema.
df.printSchema()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2

In [46]:
from pyspark.sql.functions import col

# Show only the OrderID, CustomerName and Amount columns.
order_summary = df.select('OrderID', 'CustomerName', 'Amount')
order_summary.show()

# Rename Amount to OrderAmount; every later reference uses the new name.
df = df.withColumnRenamed('Amount', 'OrderAmount')

# Orders whose amount is greater than 500.
is_large_order = col('OrderAmount') > 500
df.where(is_large_order).show()

# Orders placed from one specific city.
is_target_city = col('City') == 'West Elizabeth'
df.where(is_target_city).show()


+-------+------------------+------+
|OrderID|      CustomerName|Amount|
+-------+------------------+------+
|   2824|     Donald Walker|783.04|
|   7912|      Brandon Hall| 905.0|
|   4611|      Donald Booth|657.96|
|   3547|    Phillip Garcia|606.89|
|   8527|      Valerie Gray| 77.87|
|   4150|       Amber Perez|352.37|
|   5554|        Roy Martin|148.33|
|   2169|    Carolyn Daniel| 14.09|
|   6313|       Patty Perez| 79.83|
|   6155|Jonathan Wilkerson|882.68|
|   9830|       Kevin Hurst|870.55|
|   9085| Anthony Rodriguez|921.73|
|   2040|     Kyle Mcdonald|327.52|
|   6573|    Jeffrey Chavez|676.02|
|   2743|  Elizabeth Fowler| 47.06|
|   9837|     Tammy Sellers| 46.15|
|   6038|     David Bradley|348.51|
|   3060|       John Pierce|362.09|
|   4295|   Jennifer Powers|684.26|
|   5061|    George Chapman|251.89|
+-------+------------------+------+
only showing top 20 rows

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-

In [48]:
from pyspark.sql.functions import expr

# Remove the CustomerSince column from the working frame.
df = df.drop('CustomerSince')

# FinalAmount = OrderAmount minus the discounted part (OrderAmount * Discount).
discount_value = col("OrderAmount") * col("Discount")
df = df.withColumn("FinalAmount", col("OrderAmount") - discount_value)
df.show()

# Order the rows by FinalAmount, largest first.
df = df.sort(col("FinalAmount").desc())

# Relabel the "Cancelled" delivery status as "Order Cancelled"; every other
# status is passed through unchanged by the ELSE branch.
status_relabel = expr("CASE WHEN DeliveryStatus = 'Cancelled' THEN 'Order Cancelled' ELSE DeliveryStatus END")
df = df.withColumn("DeliveryStatus", status_relabel)
df.show()


+-------+-----------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-----------------+
|OrderID|     CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|      FinalAmount|
+-------+-----------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-----------------+
|   5573|   Jordan Frazier|          Books|     981.05|2025-03-19|  Order Cancel|    0.02|      Sheilaville|       Cash|          961.429|
|   8474|      Heidi Brown|    Electronics|     968.91|2023-11-23|  Order Cancel|    0.02|       Riverafort|       Cash|         949.5318|
|   8889|      Karen Garza|          Books|      998.3|2024-10-17|  Order Cancel|    0.06|       Johnsonton|Credit Card|938.4019999999999|
|   2127|     Jaclyn Moore|      Groceries|     933.32|2025-03-11|      Returned|    0.01|      Cherylhaven|       Cash|         923.9868|
|   9806|    Samantha Gill|

In [49]:
# Number of orders in each DeliveryStatus group.
# NOTE: this import rebinds the name `sum` to the Spark aggregate function for
# the rest of the notebook session.
from pyspark.sql.functions import count,avg,sum

orders_by_status = df.groupBy("DeliveryStatus")
order_count = count("*").alias("OrderCount")
orders_by_status.agg(order_count).show()


+--------------+----------+
|DeliveryStatus|OrderCount|
+--------------+----------+
|      Returned|       117|
|  Order Cancel|       149|
|     Delivered|       119|
|       Pending|       115|
+--------------+----------+



In [29]:
# Average order amount for each ProductCategory.
average_amount = avg("OrderAmount").alias("AverageAmount")
orders_by_category = df.groupBy("ProductCategory")
orders_by_category.agg(average_amount).show()


+---------------+------------------+
|ProductCategory|     AverageAmount|
+---------------+------------------+
|        Fashion|500.63082352941205|
|      Groceries|459.51786407767014|
|    Electronics| 551.7450000000002|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+



In [30]:
# Total sales (sum of OrderAmount) for each City.
total_sales = sum("OrderAmount").alias("TotalSales")
orders_by_city = df.groupBy("City")
orders_by_city.agg(total_sales).show()


+----------------+----------+
|            City|TotalSales|
+----------------+----------+
|     Ramseymouth|    761.06|
|East Edwardshire|    291.26|
|    Lake Douglas|    975.09|
|      Thomasberg|    882.68|
| South Colinstad|    786.27|
|     Laurenville|    383.26|
|        Seanbury|    814.39|
|      Gordonport|    514.99|
|  West Dawnmouth|      12.8|
|   Williamsmouth|     10.78|
|     Sheilaville|    981.05|
|       Mollybury|    222.02|
|       Perezfort|    917.55|
| Lake Jerrymouth|    404.01|
|       Lisaville|     45.69|
|     Port Willie|    788.13|
|  South Samantha|    229.46|
|Port Nicoleshire|    133.78|
|Lake Rebeccabury|    891.66|
|      Valdezberg|    424.96|
+----------------+----------+
only showing top 20 rows



In [39]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import when

# Introduce nulls to work with: City becomes null for every order whose
# OrderAmount is below 100; all other rows keep their City.
is_small_order = col("OrderAmount") < 100
city_or_null = when(is_small_order, lit(None)).otherwise(col("City"))
df = df.withColumn("City", city_or_null)

# Variant 1: replace the null cities with the placeholder "Unknown".
df_filled = df.na.fill({"City": "Unknown"})

# Variant 2: discard the rows whose City is null.
df_dropped = df.na.drop(subset=["City"])

In [37]:
# Tag high-value customers with when().otherwise(): an order above 800 is
# labelled "High-Value", anything else "Regular".
is_high_value = col("OrderAmount") > 800
customer_type = when(is_high_value, "High-Value").otherwise("Regular")
df = df.withColumn("CustomerType", customer_type)

tagged_view = df.select("CustomerName", "OrderAmount", "CustomerType")
tagged_view.show()


+-----------------+-----------+------------+
|     CustomerName|OrderAmount|CustomerType|
+-----------------+-----------+------------+
|   Jordan Frazier|     981.05|  High-Value|
|      Heidi Brown|     968.91|  High-Value|
|      Karen Garza|      998.3|  High-Value|
|     Jaclyn Moore|     933.32|  High-Value|
|    Samantha Gill|     993.17|  High-Value|
|   Kristy Johnson|     961.35|  High-Value|
| Alejandra Santos|     948.84|  High-Value|
|Dr. Michael Evans|     918.14|  High-Value|
|    Hunter Kramer|      973.2|  High-Value|
|   Michelle Burns|     922.29|  High-Value|
|    Jesus Houston|     899.31|  High-Value|
|      Regina Diaz|     932.21|  High-Value|
|Anthony Rodriguez|     921.73|  High-Value|
|     James Brooks|     978.96|  High-Value|
|  Brittany Knight|     903.71|  High-Value|
|   Timothy Duncan|     944.55|  High-Value|
|  Kevin Patterson|     998.21|  High-Value|
|     Brandon Hall|      905.0|  High-Value|
|   Frank Williams|     903.78|  High-Value|
|    Tracy

In [42]:
# Derive the calendar year and month of each order from OrderDate.
from pyspark.sql.functions import year,month

df = df.withColumn("OrderYear", year("OrderDate"))
df = df.withColumn("OrderMonth", month("OrderDate"))

date_parts = df.select("OrderDate", "OrderYear", "OrderMonth")
date_parts.show()


+----------+---------+----------+
| OrderDate|OrderYear|OrderMonth|
+----------+---------+----------+
|2025-03-19|     2025|         3|
|2023-11-23|     2023|        11|
|2024-10-17|     2024|        10|
|2025-03-11|     2025|         3|
|2024-11-12|     2024|        11|
|2025-05-24|     2025|         5|
|2024-05-09|     2024|         5|
|2023-07-29|     2023|         7|
|2024-09-02|     2024|         9|
|2025-05-01|     2025|         5|
|2024-11-19|     2024|        11|
|2025-02-06|     2025|         2|
|2024-10-02|     2024|        10|
|2024-03-28|     2024|         3|
|2025-02-19|     2025|         2|
|2024-06-04|     2024|         6|
|2024-08-17|     2024|         8|
|2024-09-12|     2024|         9|
|2025-05-27|     2025|         5|
|2025-02-14|     2025|         2|
+----------+---------+----------+
only showing top 20 rows



In [80]:
# Customer loyalty in whole years = (today - CustomerSince).
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, col, current_date, months_between

spark = SparkSession.builder.appName("SalesDateTimeAnalysis").getOrCreate()

# Re-read the CSV so that the CustomerSince column is available in df.
# This replaces df with the freshly loaded data.
sales_reader = spark.read.option("header", True).option("inferSchema", True)
df = sales_reader.csv("/content/drive/MyDrive/Sales_dataset.csv")

# Months elapsed since CustomerSince, converted to years; the cast to int
# drops the fractional part.
months_as_customer = months_between(current_date(), col("CustomerSince"))
loyalty_years = (months_as_customer / 12).cast("int")
df = df.withColumn("LoyaltyYears", loyalty_years)

loyalty_view = df.select("OrderID", "CustomerName", "CustomerSince", "LoyaltyYears")
loyalty_view.show(truncate=False)


+-------+------------------+-------------+------------+
|OrderID|CustomerName      |CustomerSince|LoyaltyYears|
+-------+------------------+-------------+------------+
|2824   |Donald Walker     |2020-10-15   |4           |
|7912   |Brandon Hall      |2022-03-15   |3           |
|4611   |Donald Booth      |2021-08-07   |3           |
|3547   |Phillip Garcia    |2020-08-08   |4           |
|8527   |Valerie Gray      |2022-11-15   |2           |
|4150   |Amber Perez       |2022-01-13   |3           |
|5554   |Roy Martin        |2023-04-29   |2           |
|2169   |Carolyn Daniel    |2021-05-09   |4           |
|6313   |Patty Perez       |2021-04-25   |4           |
|6155   |Jonathan Wilkerson|2021-06-20   |3           |
|9830   |Kevin Hurst       |2022-08-02   |2           |
|9085   |Anthony Rodriguez |2022-12-15   |2           |
|2040   |Kyle Mcdonald     |2021-07-21   |3           |
|6573   |Jeffrey Chavez    |2022-07-30   |2           |
|2743   |Elizabeth Fowler  |2021-02-07   |4     

In [87]:
# 7. Joins and Unions:
# Create a second DataFrame with city-wise region mapping.
from pyspark.sql.functions import when, col

# Build the lookup from the DISTINCT cities so it holds exactly one row per
# city. Deriving it from every order row would repeat a city once per order
# and multiply the matching rows when the lookup is joined back to the orders.
#
# The region is taken from a compass word in the city name. The patterns are
# case-insensitive and use word boundaries, so "West Robertville" maps to
# "West" while "Westtown" does not match. The first matching branch wins;
# cities with no compass word are labelled "Unknown".
region_df = (
    df.select("City")
    .distinct()
    .withColumn(
        "Region",
        when(col("City").rlike("(?i)\\bwest\\b"), "West")
        .when(col("City").rlike("(?i)\\beast\\b"), "East")
        .when(col("City").rlike("(?i)\\bnorth\\b"), "North")
        .when(col("City").rlike("(?i)\\bsouth\\b"), "South")
        .otherwise("Unknown"),
    )
)

region_df.select("City", "Region").show(truncate=False)



+------------------+-------+
|City              |Region |
+------------------+-------+
|Lake Joseph       |Unknown|
|Westtown          |Unknown|
|Frenchburgh       |Unknown|
|Lisaview          |Unknown|
|Crosbyton         |Unknown|
|Staceyborough     |Unknown|
|South Daniel      |South  |
|Phillipston       |Unknown|
|Port Robertfort   |Unknown|
|North Richardton  |North  |
|Millerton         |Unknown|
|South Andrea      |South  |
|North Jamesborough|North  |
|West Robertville  |West   |
|Williamview       |Unknown|
|Scottmouth        |Unknown|
|Leeberg           |Unknown|
|Lake Marc         |Unknown|
|Bushchester       |Unknown|
|Jennifershire     |Unknown|
+------------------+-------+
only showing top 20 rows



In [90]:
# Perform inner and left joins with the main dataset.
from pyspark.sql.functions import col

# Alias the DataFrames so each side of the join can be addressed by name.
df_alias = df.alias("o")                # o = orders
region_df_alias = region_df.alias("r")  # r = region

# region_df is derived from df, so df_alias.City and region_df_alias.City refer
# to the same underlying column and the comparison is ambiguous. Referring to
# the columns through their aliases states explicitly which side is meant.
join_condition = col("o.City") == col("r.City")
output_columns = ["o.OrderID", "o.CustomerName", "o.City", "r.Region"]

print("INNER Join")
inner_join_df = df_alias.join(region_df_alias, join_condition, "inner")
inner_join_df.select(*output_columns).show()

print("LEFT Join")
left_join_df = df_alias.join(region_df_alias, join_condition, "left")
left_join_df.select(*output_columns).show()

INNER Join
+-------+------------------+-----------------+-------+
|OrderID|      CustomerName|             City| Region|
+-------+------------------+-----------------+-------+
|   2824|     Donald Walker|     Lake Joyside|Unknown|
|   7912|      Brandon Hall|    New Jamesside|Unknown|
|   4611|      Donald Booth|     Lake Roberto|Unknown|
|   3547|    Phillip Garcia| West Melanieview|   West|
|   8527|      Valerie Gray|        Mariastad|Unknown|
|   4150|       Amber Perez|  Port Jesseville|Unknown|
|   5554|        Roy Martin|      Lake Joseph|Unknown|
|   2169|    Carolyn Daniel|         Grayside|Unknown|
|   6313|       Patty Perez|      Richardland|Unknown|
|   6155|Jonathan Wilkerson|       Thomasberg|Unknown|
|   9830|       Kevin Hurst|      Jeffreyberg|Unknown|
|   9085| Anthony Rodriguez|        Port Erin|Unknown|
|   2040|     Kyle Mcdonald|Lake Jenniferside|Unknown|
|   6573|    Jeffrey Chavez|      Teresaburgh|Unknown|
|   2743|  Elizabeth Fowler|   East Nathaniel|   East|

In [71]:
# Union two datasets: the orders placed in 2023 and the orders placed in 2024.
order_year = year(col("OrderDate"))
orders_2023 = df.where(order_year == 2023)
orders_2024 = df.where(order_year == 2024)

# unionByName matches columns by name rather than by position.
union_df = orders_2023.unionByName(orders_2024)

union_preview = union_df.select("OrderID", "CustomerName", "OrderDate")
union_preview.show(10)

+-------+----------------+----------+
|OrderID|    CustomerName| OrderDate|
+-------+----------------+----------+
|   2169|  Carolyn Daniel|2023-10-07|
|   6313|     Patty Perez|2023-06-27|
|   2040|   Kyle Mcdonald|2023-12-15|
|   6038|   David Bradley|2023-08-03|
|   3060|     John Pierce|2023-12-25|
|   5061|  George Chapman|2023-11-28|
|   1964|    Taylor Heath|2023-07-28|
|   2612|Nicholas Mcbride|2023-08-26|
|   1828|  Kimberly Smith|2023-08-14|
|   4502|Stephanie Martin|2023-06-04|
+-------+----------------+----------+
only showing top 10 rows



In [92]:
# 8. Complex JSON Simulation
# Serialise each order to a JSON string, then parse that string back into a
# struct column.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, to_json, from_json, get_json_object

# Pack every column of the row into one struct and render it as JSON text.
order_struct = struct(*[col(name) for name in df.columns])
df_json = df.withColumn("order_json", to_json(order_struct))

df_json.select("OrderID", "order_json").show(5, truncate=False)

# Parse the JSON text back, reusing the schema of df to type the fields.
order_schema = df.schema
parsed_order = from_json(col("order_json"), order_schema)
parsed_df = df_json.withColumn("parsed_json", parsed_order)

parsed_df.select("OrderID", "parsed_json").show(5, truncate=False)


+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|OrderID|order_json                                                                                                                                                                                                                                     |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2824   |{"OrderID":2824,"CustomerName":"Donald Walker","ProductCategory":"Books","Amount":783.04,"OrderDate":"2024-12-26","DeliveryStatus":"Returned","Discount":0.15,"City":"Lake Joyside","PaymentMode":"Credit Card","CustomerSince":"2020-10-15"}  |


In [67]:
# Access nested fields using get_json_object().
from pyspark.sql.functions import col, get_json_object

# The JSON text lives in the "order_json" column of parsed_df. The previous
# name "json_order" does not exist in that frame and fails on a fresh run.
order_json_col = col("order_json")

parsed_df.select(
    get_json_object(order_json_col, "$.CustomerName").alias("CustomerName"),
    get_json_object(order_json_col, "$.Amount").alias("Amount"),
).show(5)


+--------------+------+
|  CustomerName|Amount|
+--------------+------+
| Donald Walker|783.04|
|  Brandon Hall| 905.0|
|  Donald Booth|657.96|
|Phillip Garcia|606.89|
|  Valerie Gray| 77.87|
+--------------+------+
only showing top 5 rows



In [66]:
#9. Applying Functions:
# Create a function to tag orders: “Big”, “Medium”, “Small” based on Amount.
# Apply it using UDF in PySpark.
def tag_order(amount):
    """Classify an order amount as "Big", "Medium" or "Small".

    Args:
        amount: The order amount as a number, or None when the value is
            missing.

    Returns:
        "Big" when amount >= 800, "Medium" when 500 <= amount < 800,
        "Small" for anything lower, and None when amount is None.
    """
    # A missing amount cannot be compared with a number (None >= 800 raises
    # TypeError), so a null input yields a null tag instead of an error.
    if amount is None:
        return None
    if amount >= 800:
        return "Big"
    elif amount >= 500:
        return "Medium"
    else:
        return "Small"

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the plain Python function as a Spark UDF that returns a string.
tag_udf = udf(tag_order, returnType=StringType())

# Add the size tag computed from the Amount column to every order.
order_size = tag_udf(col("Amount"))
df = df.withColumn("OrderSize", order_size)

size_view = df.select("OrderID", "Amount", "OrderSize")
size_view.show()

+-------+------+---------+
|OrderID|Amount|OrderSize|
+-------+------+---------+
|   2824|783.04|   Medium|
|   7912| 905.0|      Big|
|   4611|657.96|   Medium|
|   3547|606.89|   Medium|
|   8527| 77.87|    Small|
|   4150|352.37|    Small|
|   5554|148.33|    Small|
|   2169| 14.09|    Small|
|   6313| 79.83|    Small|
|   6155|882.68|      Big|
|   9830|870.55|      Big|
|   9085|921.73|      Big|
|   2040|327.52|    Small|
|   6573|676.02|   Medium|
|   2743| 47.06|    Small|
|   9837| 46.15|    Small|
|   6038|348.51|    Small|
|   3060|362.09|    Small|
|   4295|684.26|   Medium|
|   5061|251.89|    Small|
+-------+------+---------+
only showing top 20 rows

