In [10]:
pip install pyspark




In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Obtain the Spark session used by the e-commerce analysis.
spark = (
    SparkSession.builder
    .appName("E-commerce Transactions")
    .getOrCreate()
)

# Column names, in the same order as the fields of each tuple in `com_data`.
com_columns = [
    "transaction_id",
    "customer_id",
    "product",
    "category",
    "price",
    "quantity",
    "discount_percentage",
    "transaction_date",
]

# Sample e-commerce transactions, one tuple per row.
com_data = [
    (1, 101, "Laptop", "Electronics", 1000, 1, 10, "2023-08-01"),
    (2, 102, "Smartphone", "Electronics", 700, 2, 5, "2023-08-01"),
    (3, 103, "Shirt", "Fashion", 40, 3, 0, "2023-08-02"),
    (4, 104, "Blender", "Home Appliance", 150, 1, 15, "2023-08-03"),
    (5, 101, "Headphones", "Electronics", 100, 2, 10, "2023-08-03"),
    (6, 105, "Shoes", "Fashion", 60, 1, 20, "2023-08-04"),
    (7, 106, "Refrigerator", "Home Appliance", 800, 1, 25, "2023-08-05"),
    (8, 107, "Book", "Books", 20, 4, 0, "2023-08-05"),
    (9, 108, "Toaster", "Home Appliance", 30, 1, 5, "2023-08-06"),
    (10, 102, "Tablet", "Electronics", 300, 2, 10, "2023-08-06"),
]

# Build the DataFrame that the e-commerce analysis cell reads as `df`.
df = spark.createDataFrame(com_data, schema=com_columns)


In [6]:
# E-commerce analyses over `df`.
# Aggregate functions are taken from the `F` namespace (pyspark.sql.functions) so that
# Python's builtin `sum` is never called on a column name by accident.

# Revenue of each row after discount; reused by analyses 1, 6 and 8.
df_with_revenue = df.withColumn(
    "total_revenue",
    col("price") * col("quantity") * (1 - col("discount_percentage") / 100),
)

# 1. Calculate the Total Revenue per Category
revenue_per_category = df_with_revenue.groupBy("category").agg(
    F.sum("total_revenue").alias("total_revenue")
)
revenue_per_category.show()

# 2. Filter Transactions with a Discount Greater Than 10%
discount_gt_10 = df.filter(col("discount_percentage") > 10)
discount_gt_10.show()

# 3. Find the Most Expensive Product Sold
most_expensive_product_df = df.orderBy(col("price").desc()).limit(1)
most_expensive_product_df.show()

# 4. Calculate the Average Quantity of Products Sold per Category
avg_quantity_per_category = df.groupBy("category").agg(
    F.avg("quantity").alias("avg_quantity")
)
avg_quantity_per_category.show()

# 5. Identify Customers Who Purchased More Than One Product
# Group by customer only: every row of the sample data has its own transaction_id, so
# keeping transaction_id in the grouping key makes each count 1 and the result empty.
# Distinct products are counted so a repeat purchase of the same product counts once.
customers_multiple_products = (
    df.groupBy("customer_id")
    .agg(F.countDistinct("product").alias("product_count"))
    .filter(col("product_count") > 1)
)
customers_multiple_products.show()

# 6. Find the Top 3 Highest Revenue Transactions
transaction_revenue = df_with_revenue.groupBy("transaction_id").agg(
    F.sum("total_revenue").alias("transaction_revenue")
)
top_3_transactions = transaction_revenue.orderBy(col("transaction_revenue").desc()).limit(3)
top_3_transactions.show()

# 7. Calculate the Total Number of Transactions per Day
transactions_per_day = df.groupBy("transaction_date").agg(
    F.count("transaction_id").alias("total_transactions")
)
transactions_per_day.show()

# 8. Find the Customer Who Spent the Most Money
customer_spending = df_with_revenue.groupBy("customer_id").agg(
    F.sum("total_revenue").alias("total_spent")
)
top_spender_df = customer_spending.orderBy(col("total_spent").desc()).limit(1)
top_spender_df.show()

# 9. Calculate the Average Discount Given per Product Category
avg_discount_per_category = df.groupBy("category").agg(
    F.avg("discount_percentage").alias("avg_discount")
)
avg_discount_per_category.show()

# 10. Create a New Column for Final Price After Discount
# Per-unit price after discount (quantity is not applied here).
df_with_final_price = df.withColumn(
    "final_price",
    col("price") * (1 - col("discount_percentage") / 100),
)
df_with_final_price.show()



+--------------+-------------+
|      category|total_revenue|
+--------------+-------------+
|       Fashion|        168.0|
|   Electronics|       2950.0|
|Home Appliance|        756.0|
|         Books|         80.0|
+--------------+-------------+

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|      2023-08-05|
+--------------+-----------+------------+--------------+-----+--------+-------------------+------

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session for the banking analysis.
spark = (
    SparkSession.builder
    .appName("Banking Transactions")
    .getOrCreate()
)

# Column names, in the same order as the fields of each tuple in `banking_data`.
banking_columns = [
    "transaction_id",
    "customer_id",
    "transaction_type",
    "amount",
    "transaction_date",
]

# Sample banking transactions, one tuple per row.
banking_data = [
    (1, 201, "Deposit", 5000, "2023-09-01"),
    (2, 202, "Withdrawal", 2000, "2023-09-01"),
    (3, 203, "Deposit", 3000, "2023-09-02"),
    (4, 201, "Withdrawal", 1500, "2023-09-02"),
    (5, 204, "Deposit", 10000, "2023-09-03"),
    (6, 205, "Withdrawal", 500, "2023-09-03"),
    (7, 202, "Deposit", 2500, "2023-09-04"),
    (8, 206, "Withdrawal", 700, "2023-09-04"),
    (9, 203, "Deposit", 4000, "2023-09-05"),
    (10, 204, "Withdrawal", 3000, "2023-09-05"),
]

# NOTE: this rebinds `df`, which the e-commerce cells earlier in the notebook also use;
# cells that expect the e-commerce frame must run before this one.
df = spark.createDataFrame(banking_data, schema=banking_columns)

In [8]:
# Banking analyses over `df`.
# Aggregate and conditional functions are taken from the `F` namespace
# (pyspark.sql.functions) so that Python's builtin `sum` is never used by accident.

# Row filters reused by several analyses below.
is_deposit = col("transaction_type") == "Deposit"
is_withdrawal = col("transaction_type") == "Withdrawal"

# 1. Calculate the Total Deposit and Withdrawal Amounts
total_by_type = df.groupBy("transaction_type").agg(
    F.sum("amount").alias("total_amount")
)
total_by_type.show()

# 2. Filter Transactions Greater Than $3,000
transactions_gt_3000 = df.filter(col("amount") > 3000)
transactions_gt_3000.show()

# 3. Find the Largest Deposit Made
largest_deposit_df = df.filter(is_deposit).orderBy(col("amount").desc()).limit(1)
largest_deposit_df.show()

# 4. Calculate the Average Transaction Amount for Each Transaction Type
avg_amount_by_type = df.groupBy("transaction_type").agg(
    F.avg("amount").alias("avg_amount")
)
avg_amount_by_type.show()

# 5. Find Customers Who Made Both Deposits and Withdrawals
deposit_customers = df.filter(is_deposit).select("customer_id").distinct()
withdrawal_customers = df.filter(is_withdrawal).select("customer_id").distinct()
both_types_customers = deposit_customers.intersect(withdrawal_customers)
both_types_customers.show()

# 6. Calculate the Total Amount of Transactions per Day
total_per_day = df.groupBy("transaction_date").agg(
    F.sum("amount").alias("total_amount")
)
total_per_day.show()

# 7. Find the Customer with the Highest Total Withdrawal
total_withdrawal_by_customer = (
    df.filter(is_withdrawal)
    .groupBy("customer_id")
    .agg(F.sum("amount").alias("total_withdrawn"))
)
highest_withdrawal_customer = total_withdrawal_by_customer.orderBy(
    col("total_withdrawn").desc()
).limit(1)
highest_withdrawal_customer.show()

# 8. Calculate the Number of Transactions for Each Customer
transaction_count_per_customer = df.groupBy("customer_id").agg(
    F.count("transaction_id").alias("transaction_count")
)
transaction_count_per_customer.show()

# 9. Find All Transactions That Occurred on the Same Day as a Withdrawal Greater Than $1,000
# The dates are de-duplicated first so the join cannot multiply rows of `df`.
withdrawals_gt_1000 = (
    df.filter(is_withdrawal & (col("amount") > 1000))
    .select("transaction_date")
    .distinct()
)
transactions_same_day = df.join(withdrawals_gt_1000, on="transaction_date")
transactions_same_day.show()

# 10. Create a New Column to Classify Transactions as "High" or "Low" Value
# Amounts strictly above 5000 are "High"; everything else is "Low".
df_with_classification = df.withColumn(
    "transaction_value",
    F.when(col("amount") > 5000, "High").otherwise("Low"),
)
df_with_classification.show()


+----------------+------------+
|transaction_type|total_amount|
+----------------+------------+
|         Deposit|       24500|
|      Withdrawal|        7700|
+----------------+------------+

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             9|        203|         Deposit|  4000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             5|        204|         Deposit| 10000|      2023-09-03|
+--------------+-----------+--------