In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Create the Spark session used by every cell below; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("SalesDataAnalysis")
    .getOrCreate()
)


1. Data Ingestion & Exploration

In [18]:
# Mount Google Drive so the CSV inputs under MyDrive can be read by Spark.
from google.colab import drive

drive.mount('/content/drive')

# Read both files with the first row as the header and let Spark infer the
# column types from the data.
customers_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/customers.csv")
)
orders_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/orders.csv")
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Print the schema and a preview of the rows for each source DataFrame,
# customers first and orders second.
for frame in (customers_df, orders_df):
    frame.printSchema()
    frame.show()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)

In [6]:
# Report the number of rows in each DataFrame, one labelled line per frame.
for label, frame in (("Total Customers:", customers_df), ("Total Orders:", orders_df)):
    print(label, frame.count())


Total Customers: 5
Total Orders: 7


In [7]:
# Show every city value that occurs in the customer data, without repeats.
unique_cities_df = customers_df.select("City").distinct()
unique_cities_df.show()

+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



2. DataFrame Transformations

In [8]:
# Add the per-order amount (Price * Quantity) and the calendar year of the
# order date in a single chained expression.
orders_df = (
    orders_df
    .withColumn("TotalAmount", col("Price") * col("Quantity"))
    .withColumn("OrderYear", year(col("OrderDate")))
)

# Display only the orders whose total amount is above 10000.
high_value_orders_df = orders_df.where(col("TotalAmount") > 10000)
high_value_orders_df.show()

# Remove the Email column; customers_df no longer carries it after this cell.
customers_df = customers_df.drop("Email")

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+



3. Handling Nulls & Conditionals

In [9]:
# Keep the existing city where one is present; otherwise use "Unknown".
has_city = col("City").isNotNull()
customers_df = customers_df.withColumn(
    "City", when(has_city, col("City")).otherwise("Unknown")
)

# Customers who signed up before 2022-01-01 are "Loyal"; every other row
# (including rows where the comparison is not true) is "New".
signed_up_before_2022 = col("SignupDate") < "2022-01-01"
customers_df = customers_df.withColumn(
    "CustomerType", when(signed_up_before_2022, "Loyal").otherwise("New")
)

# Orders below 5000 are "Low"; every other row is "High".
is_low_amount = col("TotalAmount") < 5000
orders_df = orders_df.withColumn(
    "OrderType", when(is_low_amount, "Low").otherwise("High")
)



4. Joins & Aggregations

In [11]:
# Attach customer attributes to each order; the inner join keeps only orders
# whose CustomerID also exists in customers_df.
joined_df = orders_df.join(customers_df, on="CustomerID", how="inner")

# Order count and summed revenue per city.
# Note: `sum` and `count` here are the Spark column functions brought in by
# the star import, not the Python builtins.
city_metrics = [
    count("OrderID").alias("TotalOrders"),
    sum("TotalAmount").alias("TotalRevenue"),
]
joined_df.groupBy("City").agg(*city_metrics).show()


+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|Bangalore|          1|      3500.0|
|  Chennai|          1|      2500.0|
|   Mumbai|          2|    101200.0|
|    Delhi|          2|     50000.0|
|Hyderabad|          1|      5000.0|
+---------+-----------+------------+



In [12]:
# Total spend per customer, then the three highest spenders.
spend_per_customer_df = joined_df.groupBy("CustomerID").agg(
    sum("TotalAmount").alias("TotalSpend")
)
top_spenders_df = spend_per_customer_df.orderBy(col("TotalSpend").desc()).limit(3)
top_spenders_df.show()

+----------+----------+
|CustomerID|TotalSpend|
+----------+----------+
|       101|  101200.0|
|       102|   50000.0|
|       104|    5000.0|
+----------+----------+



In [13]:
# Sum of ordered quantities for each product category.
units_by_category_df = orders_df.groupBy("Category").agg(
    sum("Quantity").alias("TotalProductsSold")
)
units_by_category_df.show()

+-----------+-----------------+
|   Category|TotalProductsSold|
+-----------+-----------------+
| Stationery|                5|
|Electronics|                5|
|  Furniture|                1|
| Appliances|                1|
+-----------+-----------------+



5. Spark SQL Tasks

In [14]:
# Make sure the `sales` database exists and select it as the current database.
for statement in ("CREATE DATABASE IF NOT EXISTS sales", "USE sales"):
    spark.sql(statement)

# Persist both DataFrames as tables, replacing any previous contents.
for frame, table_name in (
    (customers_df, "sales.customers"),
    (orders_df, "sales.orders"),
):
    frame.write.mode("overwrite").saveAsTable(table_name)

In [15]:
# Orders joined to their customers, restricted to customers in Delhi.
delhi_orders_query = "SELECT * FROM sales.orders o JOIN sales.customers c ON o.CustomerID = c.CustomerID WHERE c.City = 'Delhi'"
spark.sql(delhi_orders_query).show()

# Average order value (Price * Quantity) per category.
avg_order_value_query = "SELECT Category, AVG(Price * Quantity) AS AvgOrderValue FROM sales.orders GROUP BY Category"
spark.sql(avg_order_value_query).show()

# (Re)create a view with the summed order amount per month number, then show it.
# The view name is unqualified, so it is resolved against the current database.
monthly_view_ddl = "CREATE OR REPLACE VIEW monthly_orders AS SELECT MONTH(OrderDate) AS Month, SUM(Price * Quantity) AS TotalAmount FROM sales.orders GROUP BY MONTH(OrderDate)"
spark.sql(monthly_view_ddl)
spark.sql("SELECT * FROM monthly_orders").show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+----------+----+-----+----------+------------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|CustomerID|Name| City|SignupDate|CustomerType|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+----------+----+-----+----------+------------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|       102|Neha|Delhi|2023-01-15|         New|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|     High|       102|Neha|Delhi|2023-01-15|         New|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+----------+----+-----+----------+------------+

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Elec

6. String & Date Functions

In [19]:
# The transformations cell earlier in this notebook drops "Email" from
# customers_df, so on a fresh top-to-bottom run the column is no longer there
# and col("Email") cannot be resolved. Re-attach it from the source CSV when
# it is missing; when it is still present this step is skipped.
# NOTE(review): assumes CustomerID is unique in customers.csv, otherwise the
# join would duplicate customer rows - confirm against the data.
if "Email" not in customers_df.columns:
    customer_emails_df = spark.read.csv(
        "/content/drive/MyDrive/customers.csv", header=True, inferSchema=True
    ).select("CustomerID", "Email")
    customers_df = customers_df.join(customer_emails_df, on="CustomerID", how="left")

# Mask the local part of the address: keep its first character and replace
# everything else up to the "@" with "***" (ali@gmail.com -> a***@gmail.com).
# The pattern is anchored at the start and consumes any non-"@" characters, so
# one-character local parts and local parts containing ".", "+" or "-" are
# masked as well.
customers_df = customers_df.withColumn(
    "MaskedEmail", regexp_replace(col("Email"), r"^(.)[^@]*@", "$1***@")
)

# "<Name> from <City>" label for each customer.
customers_df = customers_df.withColumn(
    "NameCity", concat_ws(" from ", col("Name"), col("City"))
)

# Number of days between the signup date and the current date.
customers_df = customers_df.withColumn(
    "CustomerAgeDays", datediff(current_date(), col("SignupDate"))
)

# Full month name of the order date (pattern "MMMM").
orders_df = orders_df.withColumn(
    "OrderMonthName", date_format(col("OrderDate"), "MMMM")
)


7. UDFs and Complex Logic

In [20]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def tag_customer(spend):
    """Map a customer's total spend to a tier label.

    Args:
        spend: Total spend as a number, or None when no amount is available.

    Returns:
        "Gold" when spend is above 50000, "Silver" when it is from 10000 up to
        and including 50000, "Bronze" for anything lower, and None when spend
        is None.
    """
    # A null aggregate reaches the UDF as None; comparing None with a number
    # raises TypeError in Python 3 and would fail the whole Spark job.
    if spend is None:
        return None
    if spend > 50000:
        return "Gold"
    elif spend >= 10000:
        return "Silver"
    else:
        return "Bronze"
# Wrap tag_customer as a Spark UDF that returns a string column.
tag_customer_udf = udf(tag_customer, returnType=StringType())

# Total spend per customer, labelled with the tier computed by the UDF.
spend_df = (
    joined_df
    .groupBy("CustomerID")
    .agg(sum("TotalAmount").alias("TotalSpend"))
)
spend_df = spend_df.withColumn("Tier", tag_customer_udf(col("TotalSpend")))
spend_df.show()
def shorten_product(name):
    """Return the first three characters of `name` followed by "...".

    Returns None when `name` is None or an empty string. Names of three
    characters or fewer still get the "..." suffix appended.
    """
    if not name:
        return None
    prefix = name[:3]
    return prefix + "..."

# Wrap shorten_product as a string-returning Spark UDF and apply it to the
# Product column to build ShortProduct.
shorten_udf = udf(shorten_product, returnType=StringType())
short_product_col = shorten_udf(col("Product"))
orders_df = orders_df.withColumn("ShortProduct", short_product_col)



+----------+----------+------+
|CustomerID|TotalSpend|  Tier|
+----------+----------+------+
|       101|  101200.0|  Gold|
|       103|    3500.0|Bronze|
|       102|   50000.0|Silver|
|       105|    2500.0|Bronze|
|       104|    5000.0|Bronze|
+----------+----------+------+



8. Parquet & Views

In [21]:
# Section 8: Parquet round-trip and a global temporary view.

# Write the joined data to Drive as Parquet (replacing any earlier copy),
# read it back, and print the schema of what was read.
joined_parquet_path = "/content/drive/MyDrive/joined_sales.parquet"
joined_df.write.mode("overwrite").parquet(joined_parquet_path)
parquet_df = spark.read.parquet(joined_parquet_path)
parquet_df.printSchema()

# Register the reloaded data as a global temp view and query revenue per city.
# Global temp views are addressed through the `global_temp` database.
parquet_df.createOrReplaceGlobalTempView("global_sales_view")
revenue_by_city_query = "SELECT City, SUM(TotalAmount) AS Revenue FROM global_temp.global_sales_view GROUP BY City"
spark.sql(revenue_by_city_query).show()


root
 |-- CustomerID: integer (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)

+---------+--------+
|     City| Revenue|
+---------+--------+
|Bangalore|  3500.0|
|  Chennai|  2500.0|
|   Mumbai|101200.0|
|    Delhi| 50000.0|
|Hyderabad|  5000.0|
+---------+--------+

