<a href="https://colab.research.google.com/github/TanishqLambhate/Data-Science-Training/blob/pyspark_coding_challenge/Banking_Transaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=0a1b6c4e47b66d943b1ed899ae2e30748a33b3e4fd56e4ca5274ce909764610e
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark=SparkSession.builder.appName('Coding Challenge').getOrCreate()
csv_path="/content/Banking_Transaction.csv"
df=spark.read.csv(csv_path,header=True,inferSchema=True)
df.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             2|        202|      Withdrawal|  2000|      2023-09-01|
|             3|        203|         Deposit|  3000|      2023-09-02|
|             4|        201|      Withdrawal|  1500|      2023-09-02|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             6|        205|      Withdrawal|   500|      2023-09-03|
|             7|        202|         Deposit|  2500|      2023-09-04|
|             8|        206|      Withdrawal|   700|      2023-09-04|
|             9|        203|         Deposit|  4000|      2023-09-05|
|            10|        204|      Withdrawal|  3000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [12]:
# 1. Calculate the Total Deposit and Withdrawal Amounts
# Group the data by transaction_type and calculate the total amounts for
# both deposits and withdrawals.
df.groupBy("transaction_type").agg(sum("amount").alias("total_amount")).show()

# 2. Filter Transactions Greater Than $3,000
# Filter the dataset to show only transactions where the amount is
# greater than $3,000.
df.filter(col("amount") > 3000).show()

# 3. Find the Largest Deposit Made
# Identify the transaction with the highest deposit amount.
largest_deposit=df.filter(col("transaction_type") == "Deposit").orderBy(col("amount").desc()).first()
print("The largest deposit made :")
print(largest_deposit)

# 4. Calculate the Average Transaction Amount for Each Transaction Type
# Group the data by transaction_type and calculate the average amount for
# deposits and withdrawals.
df.groupBy("transaction_type").agg(avg("amount").alias("average_amount")).show()

# 5. Find Customers Who Made Both Deposits and Withdrawals
# Identify customers who have made at least one deposit and one
# withdrawal.
df_deposits=df.filter(col("transaction_type") == "Deposit")
df_withdrawals=df.filter(col("transaction_type") == "Withdrawal")
customers_with_both=df_deposits.select("customer_id").intersect(df_withdrawals.select("customer_id"))
print("Customers who made both deposits and withdrawals :")
customers_with_both.show()

+----------------+------------+
|transaction_type|total_amount|
+----------------+------------+
|         Deposit|       24500|
|      Withdrawal|        7700|
+----------------+------------+

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             9|        203|         Deposit|  4000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+

The largest deposit made :
Row(transaction_id=5, customer_id=204, transaction_type='Deposit', amount=10000, transaction_date=datetime.date(2023, 9, 3))
+----------------+--------------+
|transaction_type|average_amount|
+----------------+--------------+
|         Deposit|        4900.0|
|      Withdrawal|        15

In [17]:
# 6. Calculate the Total Amount of Transactions per Day
# Group the data by transaction_date and calculate the total amount of
# all transactions for each day.
df.groupBy("transaction_date").agg(sum("amount").alias("total_amount")).show()

# 7. Find the Customer with the Highest Total Withdrawal
# Calculate the total amount withdrawn by each customer and identify the
# customer with the highest total withdrawal.
df_total=df.filter(col("transaction_type") == "Withdrawal").groupBy("customer_id").agg(sum("amount").alias("total_amount"))
highest_withdrawal=df_total.orderBy(col("total_amount").desc()).first()
print("The customer with the highest total withdrawal :")
print(highest_withdrawal)

# 8. Calculate the Number of Transactions for Each Customer
# Group the data by customer_id and calculate the total number of
# transactions made by each customer.
df.groupBy("customer_id").agg(count("*").alias("total_transactions")).show()

# 9. Find All Transactions That Occurred on the Same Day as a Withdrawal Greater
# Than $1,000
# Filter the data to show all transactions that occurred on the same day
# as a withdrawal of more than $1,000.
df_withdrawal=df.filter(col("transaction_type") == "Withdrawal")
df_withdrawal=df_withdrawal.filter(col("amount") > 1000)
df_withdrawal.show()

# 10. Create a New Column to Classify Transactions as "High" or "Low" Value
# Add a new column transaction_value that classifies a transaction as
# "High" if the amount is greater than $5,000, otherwise classify it as
# "Low."
df=df.withColumn("transaction_value", when(col("amount") > 5000, "High").otherwise("Low"))
df.show()


+----------------+------------+
|transaction_date|total_amount|
+----------------+------------+
|      2023-09-03|       10500|
|      2023-09-01|        7000|
|      2023-09-05|        7000|
|      2023-09-02|        4500|
|      2023-09-04|        3200|
+----------------+------------+

The customer with the highest total withdrawal :
Row(customer_id=204, total_amount=3000)
+-----------+------------------+
|customer_id|total_transactions|
+-----------+------------------+
|        206|                 1|
|        205|                 1|
|        202|                 2|
|        203|                 2|
|        204|                 2|
|        201|                 2|
+-----------+------------------+

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             2|        202|      Withdrawal|  2000|      2023-09-01|
|          