In [1]:
# Install PySpark
!pip install -q pyspark

# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, count, min, max
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Create (or reuse) the Spark session that drives every step below.
spark = SparkSession.builder.appName("BigDataNoUpload").getOrCreate()

# All work happens inside try/finally so the session is always stopped,
# even when one of the DataFrame operations raises.
try:
    # Sample data: one (category, price) tuple per row.
    data = [
        ("Books", 150),
        ("Grocery", 300),
        ("Books", 200),
        ("Grocery", 250),
        ("Fashion", 400),
    ]

    # Explicit schema so column names and types are fixed rather than inferred.
    schema = StructType([
        StructField("category", StringType(), True),
        StructField("price", IntegerType(), True),
    ])

    # Create DataFrame
    df = spark.createDataFrame(data, schema=schema)

    # Show schema and data
    print("📄 Dataset Schema:")
    df.printSchema()
    print("\n📊 Sample Data:")
    df.show()

    # Count records per category
    print("\n🔢 Count by Category:")
    df.groupBy("category").agg(count("*").alias("Total")).show()

    # Average price over the whole dataset
    print("\n💰 Average Price:")
    df.select(avg("price").alias("Average Price")).show()

    # Min and Max price.
    # NOTE: `min` and `max` here are the pyspark.sql.functions aggregates
    # imported at the top of the file; they shadow the Python builtins.
    print("\n📈 Min and Max Price:")
    df.select(
        min("price").alias("Min Price"),
        max("price").alias("Max Price"),
    ).show()
finally:
    # Stop Spark session
    spark.stop()

📄 Dataset Schema:
root
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)


📊 Sample Data:
+--------+-----+
|category|price|
+--------+-----+
|   Books|  150|
| Grocery|  300|
|   Books|  200|
| Grocery|  250|
| Fashion|  400|
+--------+-----+


🔢 Count by Category:
+--------+-----+
|category|Total|
+--------+-----+
| Grocery|    2|
|   Books|    2|
| Fashion|    1|
+--------+-----+


💰 Average Price:
+-------------+
|Average Price|
+-------------+
|        260.0|
+-------------+


📈 Min and Max Price:
+---------+---------+
|Min Price|Max Price|
+---------+---------+
|      150|      400|
+---------+---------+

