In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=63400b1a2ad96bc8db0c34ce948b362a6c73729cd43ecc86a40b4ebc06adabe4
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("BankingTransactions")
    .getOrCreate()
)

In [4]:
# 1. Total deposit and withdrawal amounts
# NOTE: this import makes `sum` refer to Spark's aggregate function instead of
# Python's builtin; later cells in this notebook call `sum("amount")` as well.
from pyspark.sql.functions import col, sum

# Read the CSV, treating the first row as the header and inferring column types.
df_transactions = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/sample_data/banking_data.csv")
)

# One row per transaction type, carrying the summed amount.
total_amounts_by_type = (
    df_transactions
    .groupBy("transaction_type")
    .agg(sum("amount").alias("total_amount"))
)
total_amounts_by_type.show()

+----------------+------------+
|transaction_type|total_amount|
+----------------+------------+
|         Deposit|       24500|
|      Withdrawal|        7700|
+----------------+------------+



In [5]:
# 2. Transactions whose amount is strictly greater than $3,000
df_high_amount_transactions = df_transactions.where(col("amount") > 3000)
df_high_amount_transactions.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             9|        203|         Deposit|  4000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [6]:
# 3. Largest deposit: keep deposits only, sort by amount (largest first),
#    and take the top row.
largest_deposit = (
    df_transactions
    .where(col("transaction_type") == "Deposit")
    .sort(col("amount").desc())
    .first()
)
print(largest_deposit)

Row(transaction_id=5, customer_id=204, transaction_type='Deposit', amount=10000, transaction_date=datetime.date(2023, 9, 3))


In [7]:
# 4. Average transaction amount for each transaction type
from pyspark.sql.functions import avg

average_amount_by_type = (
    df_transactions
    .groupBy("transaction_type")
    .agg(avg("amount").alias("average_amount"))
)
average_amount_by_type.show()

+----------------+--------------+
|transaction_type|average_amount|
+----------------+--------------+
|         Deposit|        4900.0|
|      Withdrawal|        1540.0|
+----------------+--------------+



In [8]:
# 5. Customers who made both deposits and withdrawals

# Distinct customer ids that appear on at least one deposit.
deposit_customers = (
    df_transactions
    .where(col("transaction_type") == "Deposit")
    .select("customer_id")
    .distinct()
)

# Distinct customer ids that appear on at least one withdrawal.
withdrawal_customers = (
    df_transactions
    .where(col("transaction_type") == "Withdrawal")
    .select("customer_id")
    .distinct()
)

# Customers present in both sets.
customers_with_both = deposit_customers.intersect(withdrawal_customers)
customers_with_both.show()

+-----------+
|customer_id|
+-----------+
|        202|
|        204|
|        201|
+-----------+



In [9]:
# 6. Total transaction amount per day (deposits and withdrawals summed together)
total_amount_per_day = (
    df_transactions
    .groupBy("transaction_date")
    .agg(sum("amount").alias("total_amount"))
)
total_amount_per_day.show()

+----------------+------------+
|transaction_date|total_amount|
+----------------+------------+
|      2023-09-03|       10500|
|      2023-09-01|        7000|
|      2023-09-05|        7000|
|      2023-09-02|        4500|
|      2023-09-04|        3200|
+----------------+------------+



In [10]:
# 7. Customer with the highest total withdrawal

# Sum withdrawal amounts per customer.
total_withdrawal_per_customer = (
    df_transactions
    .where(col("transaction_type") == "Withdrawal")
    .groupBy("customer_id")
    .agg(sum("amount").alias("total_withdrawal"))
)

# Sort by total withdrawn, largest first, and keep the top row.
highest_withdrawer = (
    total_withdrawal_per_customer
    .sort(col("total_withdrawal").desc())
    .first()
)
print(highest_withdrawer)

Row(customer_id=204, total_withdrawal=3000)


In [11]:
# 8. Number of transactions for each customer
from pyspark.sql.functions import count

transactions_per_customer = (
    df_transactions
    .groupBy("customer_id")
    .agg(count("transaction_id").alias("transaction_count"))
)
transactions_per_customer.show()

+-----------+-----------------+
|customer_id|transaction_count|
+-----------+-----------------+
|        206|                1|
|        205|                1|
|        202|                2|
|        203|                2|
|        204|                2|
|        201|                2|
+-----------+-----------------+



In [12]:
# 9. All transactions that occurred on the same day as a withdrawal over $1,000

# Distinct dates on which at least one withdrawal exceeded 1000.
withdrawals_gt_1000 = (
    df_transactions
    .where((col("transaction_type") == "Withdrawal") & (col("amount") > 1000))
    .select("transaction_date")
    .distinct()
)

# Inner join on the date keeps only transactions that fall on those days.
all_transactions_on_same_days = df_transactions.join(
    withdrawals_gt_1000,
    on="transaction_date",
)
all_transactions_on_same_days.show()

+----------------+--------------+-----------+----------------+------+
|transaction_date|transaction_id|customer_id|transaction_type|amount|
+----------------+--------------+-----------+----------------+------+
|      2023-09-01|             1|        201|         Deposit|  5000|
|      2023-09-01|             2|        202|      Withdrawal|  2000|
|      2023-09-02|             3|        203|         Deposit|  3000|
|      2023-09-02|             4|        201|      Withdrawal|  1500|
|      2023-09-05|             9|        203|         Deposit|  4000|
|      2023-09-05|            10|        204|      Withdrawal|  3000|
+----------------+--------------+-----------+----------------+------+



In [13]:
# 10. Add a column that classifies each transaction as "High" or "Low" value
from pyspark.sql.functions import when

# Amounts strictly above 5000 are labelled "High"; everything else,
# including exactly 5000, is labelled "Low".
df_transactions = df_transactions.withColumn(
    "transaction_value",
    when(col("amount") > 5000, "High").otherwise("Low"),
)
df_transactions.show()

+--------------+-----------+----------------+------+----------------+-----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|transaction_value|
+--------------+-----------+----------------+------+----------------+-----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|              Low|
|             2|        202|      Withdrawal|  2000|      2023-09-01|              Low|
|             3|        203|         Deposit|  3000|      2023-09-02|              Low|
|             4|        201|      Withdrawal|  1500|      2023-09-02|              Low|
|             5|        204|         Deposit| 10000|      2023-09-03|             High|
|             6|        205|      Withdrawal|   500|      2023-09-03|              Low|
|             7|        202|         Deposit|  2500|      2023-09-04|              Low|
|             8|        206|      Withdrawal|   700|      2023-09-04|              Low|
|             9|        203|    