Question 1:
Write PySpark code to create a DataFrame from a list of (name, age, city) tuples and display it.
Example data: [("Alice", 25, "NYC"), ("Bob", 30, "LA"), ("Charlie", 35, "Chicago")]

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Reuse the active Spark session if one exists; otherwise start one named "practice".
spark = SparkSession.builder.appName("practice").getOrCreate()

# Each row is a (name, age, city) tuple, in the same order as the schema fields.
data = [
    ("Alice", 25, "NYC"),
    ("Bob", 30, "LA"),
    ("Charlie", 35, "Chicago"),
]

# Explicit schema: two string columns and one integer column, all nullable
# (the default for StructType.add, same as for StructField).
schema = (
    StructType()
    .add("name", StringType())
    .add("age", IntegerType())
    .add("city", StringType())
)

# Build the DataFrame from the in-memory rows and print it to stdout.
df = spark.createDataFrame(data, schema=schema)
df.show()


Question 2 (Easy-Medium):
Given a DataFrame with columns ["product", "category", "price"], write PySpark code to:

Filter products where price > 100
Group by category and count the number of products in each category
Sort the results by count in descending order

Sample data:

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, IntegerType
from pyspark.sql.functions import col, count as agg_count
# Reuse the active Spark session if one exists; otherwise start one named "practice".
spark = SparkSession.builder.appName("practice").getOrCreate()

# Explicit schema for the products file: product, category (strings) and price (int).
schema = StructType(
    [
        StructField("product", StringType()),
        StructField("category", StringType()),
        StructField("price", IntegerType()),
    ]
)

path = "dbfs:/raw/data/products.csv"

# Read the CSV with its header row, applying the schema above instead of inferring one.
df = spark.read.schema(schema).option("header", "True").csv(path)

# Single pipeline:
#   1. keep only products priced above 100,
#   2. count the (non-null) products in each category,
#   3. order categories from the highest count to the lowest.
df = (
    df.where(col("price") > 100)
    .groupBy("category")
    .agg(agg_count("product").alias("count_of_products"))
    .sort("count_of_products", ascending=False)
)
df.show()

Question 3 (Medium):
Given two DataFrames:

df_orders: columns ["order_id", "customer_id", "amount"]
df_customers: columns ["customer_id", "customer_name", "country"]

Write PySpark code to:

Perform an inner join on customer_id
Calculate the total amount spent per country
Show only countries where total spending > 1000
Display the results sorted by total amount in descending order

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import broadcast, sum as agg_sum, col

# Reuse the active Spark session if one exists; otherwise start one named "practice".
spark = SparkSession.builder.appName("practice").getOrCreate()

# Orders: three integer columns.
order_schema = StructType(
    [
        StructField(field_name, IntegerType())
        for field_name in ("order_id", "customer_id", "amount")
    ]
)

# Customers: integer id plus two string attributes.
customer_schema = StructType(
    [
        StructField("customer_id", IntegerType()),
        StructField("customer_name", StringType()),
        StructField("country", StringType()),
    ]
)

orders_path = "dbfs:/raw/data/orders.parquet"
customers_path = "dbfs:/raw/data/customers.parquet"

# Load both Parquet inputs with the explicit schemas declared above.
orders_df = spark.read.schema(order_schema).parquet(orders_path)
customer_df = spark.read.schema(customer_schema).parquet(customers_path)

# Inner join on customer_id; the customer side is marked with a broadcast hint.
joined_df = orders_df.join(
    broadcast(customer_df),
    on="customer_id",
    how="inner",
)

# Total order amount per country.
df = joined_df.groupBy("country").agg(agg_sum("amount").alias("total_amount"))

# Keep countries whose total exceeds 1000, largest total first.
df = df.filter("total_amount > 1000").orderBy("total_amount", ascending=False)
df.show()

Question 4:
Write PySpark code to find duplicate records based on multiple columns. Given a DataFrame with columns ["email", "phone", "name"], identify and display all rows whose (email, phone) combination appears more than once.

In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructField, StructType, StringType
from pyspark.sql.functions import col, concat, count

# Reuse the active Spark session if one exists; otherwise start one named "practice".
spark = SparkSession.builder.appName("practice").getOrCreate()

# All three columns are read as (nullable) strings.
schema = StructType([
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("name", StringType())
])

data_path = "dbfs:/raw/data/records.csv"
df = spark.read.format("csv").option("header", "True").schema(schema).load(data_path)

# Partition directly on the two columns rather than on concat(email, phone):
#   * without a separator, distinct pairs can collide
#     ("ab" + "c" and "a" + "bc" both give "abc");
#   * concat() yields NULL when either input is NULL, which would put every
#     row with a missing email or phone into one partition and flag them all.
# NOTE: rows whose email AND phone match on NULLs are still grouped together;
# filter NULLs out beforehand if they should not count as duplicates.
window_spec = Window.partitionBy("email", "phone")

# Attach the size of each (email, phone) group to every row in that group.
df = df.withColumn("rec_count", count("*").over(window_spec))

# Keep every row belonging to a group of two or more, then drop the helper column.
df = df.filter(col("rec_count") > 1).drop("rec_count")

# show() only prints and returns None, so do not assign its result back to df.
df.show()
