# Superstore PySpark Assignment
This notebook answers Questions 9–14 from the Module 1 Assignment using PySpark and the Superstore dataset.

In [None]:
# Install Spark and Java (only needed on Colab)
!apt-get install openjdk-17-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, max as spark_max, count, expr
from pyspark.sql.types import DoubleType

# Build the session used by every cell below; getOrCreate() hands back the
# already-running session when this cell is executed again.
session_builder = SparkSession.builder.appName("SuperstoreAssignment")
spark = session_builder.getOrCreate()

In [None]:
# Upload the Superstore CSV manually.
# Opens Colab's file picker; the next cell reads "Superstore.csv" from the
# working directory, so the uploaded file must carry exactly that name.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the uploaded CSV (first row is the header, column types are inferred)
# and force the two measures used below to double. try_cast is used so that a
# value that cannot be parsed as a number becomes NULL rather than an error.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Superstore.csv")
    .withColumn("Sales", expr("try_cast(Sales as double)"))
    .withColumn("Profit", expr("try_cast(Profit as double)"))
)

### Q9: Who is the customer that generated the most sales?

In [None]:
# Total Sales per customer, largest total first; only the top row is shown.
top_customer = (
    df.groupBy("Customer Name")
    .agg({"Sales": "sum"})  # produces a column named "sum(Sales)"
    .sort("sum(Sales)", ascending=False)
)
top_customer.show(1)

### Q10: Highest average sales per transaction for any city

In [None]:
# Mean Sales over the rows of each City, highest mean first.
avg_sales = (
    df.groupBy("City")
    .agg(avg("Sales").alias("avg_sales"))
    .sort("avg_sales", ascending=False)
)
avg_sales.show(1)

### Q11: Highest total profit for any item

In [None]:
# Q11 asks for the highest *total* profit of any item, so Profit has to be
# summed per Item. (Taking the per-item maximum would instead report the single
# most profitable row.) `expr` is used for the aggregate because it is already
# imported from pyspark.sql.functions at the top of the notebook.
profit_by_item = (
    df.groupBy("Item")
    .agg(expr("sum(Profit)").alias("total_profit"))
    .orderBy("total_profit", ascending=False)
)
profit_by_item.show(1)

### Q12: Largest number of transactions for any combination of state and customer segment

In [None]:
# Count rows with a non-null Order ID for each (State, Customer Segment) pair
# and rank the pairs from most to fewest.
transactions = (
    df.groupBy("State", "Customer Segment")
    .agg(count("Order ID").alias("transaction_count"))
    .sort("transaction_count", ascending=False)
)
transactions.show(1)

### Q13: Highest average profit for any city-state combination

In [None]:
# Expose the DataFrame to Spark SQL under the name "superstore".
df.createOrReplaceTempView("superstore")

# Average Profit per (City, State) pair; the query itself keeps only the best pair.
top_city_state_query = """
SELECT City, State, AVG(Profit) as avg_profit
FROM superstore
GROUP BY City, State
ORDER BY avg_profit DESC
LIMIT 1
"""
result = spark.sql(top_city_state_query)
result.show(1)