In [0]:
## 1. Load the JSON data:
# Load the Product_data.json file into a DataFrame.
# An explicit schema is passed to the reader so the column types are declared
# here rather than inferred from the file.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("ProductID", IntegerType(), True),
    StructField("ProductName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Price", IntegerType(), True),
    StructField("Stock", IntegerType(), True)
])
df = spark.read.json("file:/Workspace/Shared/Product_data.json", schema=schema)
# Display the first 10 rows and inspect the schema.
# show() without an argument prints up to 20 rows, so the limit is given explicitly.
df.show(10)
df.printSchema()

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
|      103| Desk Chair|  Furniture|  150|   60|
|      104|    Monitor|Electronics|  300|   45|
|      105|       Desk|  Furniture|  350|   25|
+---------+-----------+-----------+-----+-----+

root
 |-- ProductID: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Stock: integer (nullable = true)



In [0]:
## 2. Data Cleaning:
# Keep only the products that have at least 30 units in stock
# (i.e. drop every row whose Stock is below 30).
cleaned_df = df.where(df["Stock"] >= 30)
cleaned_df.show()
# From the cleaned rows, select the ones in the "Electronics" category.
Electronics_df = cleaned_df.where(
    cleaned_df["Category"] == "Electronics"
)
Electronics_df.show()

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
|      103| Desk Chair|  Furniture|  150|   60|
|      104|    Monitor|Electronics|  300|   45|
+---------+-----------+-----------+-----+-----+

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
|      104|    Monitor|Electronics|  300|   45|
+---------+-----------+-----------+-----+-----+



In [0]:
## 3. Data Aggregation:
# Calculate the total stock for products in the "Furniture" category.
# The filter condition uses df's own Category column so the predicate and the
# DataFrame being filtered share the same lineage. The aggregation runs on the
# unfiltered df, so rows removed from cleaned_df still count towards the total.
Total_stocks = df.filter(df.Category == "Furniture").agg({"Stock": "sum"})
Total_stocks.show()
# Find the average price of all products in the dataset.
avg_price = df.agg({"Price": "avg"})
avg_price.show()

+----------+
|sum(Stock)|
+----------+
|        85|
+----------+

+----------+
|avg(Price)|
+----------+
|     560.0|
+----------+



In [0]:
## 4. Write the Data to JSON:
# Save the cleaned and aggregated data as JSON output.
# mode("overwrite") replaces any output left by an earlier run; with the default
# save mode the write raises an error when the target path already exists, which
# makes the notebook fail on a second "Run All".
cleaned_df.write.mode("overwrite").json("cleaned_data.json")
Total_stocks.write.mode("overwrite").json("total_stock.json")
avg_price.write.mode("overwrite").json("avg_price.json")