# CASE STUDY - ONLINE BANKING ANALYSIS

## Import libraries & Initiate session

In [0]:
# initialize the session
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName('Case study program').getOrCreate()

## Upload dataset 

In [0]:
data_credit =spark.read.csv("/FileStore/tables/creditCard.csv",inferSchema=True,header=True)
data_txn =spark.read.csv("/FileStore/tables/txn.csv",inferSchema=True,header=True)
data_loan =spark.read.csv("/FileStore/tables/bankloan.csv",inferSchema=True,header=True)

## Exploring data

### Loan Data

In [0]:
# Print Schema
data_loan.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)



In [0]:
# Display data
data_loan.limit(10).display()

Customer_ID,Age,Gender,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill
IB14001,30,MALE,BANK MANAGER,SINGLE,4,50000,22199,6,HOUSING,1000000,5,42898,6,9
IB14008,44,MALE,PROFESSOR,MARRIED,6,51000,19999,4,SHOPPING,50000,3,33999,1,5
IB14012,30,FEMALE,DENTIST,SINGLE,3,58450,27675,5,TRAVELLING,75000,6,20876,3,1
IB14018,29,MALE,TEACHER,MARRIED,5,45767,12787,3,GOLD LOAN,600000,7,11000,0,4
IB14022,34,MALE,POLICE,SINGLE,4,43521,11999,3,AUTOMOBILE,200000,2,43898,1,2
IB14024,55,FEMALE,NURSE,MARRIED,6,34999,19888,4,AUTOMOBILE,47787,1,50000,0,3
IB14025,39,FEMALE,TEACHER,MARRIED,6,46619,18675,4,HOUSING,1209867,8,29999,6,8
IB14027,51,MALE,SYSTEM MANAGER,MARRIED,3,49999,19111,5,RESTAURANTS,60676,8,13000,2,5
IB14029,24,FEMALE,TEACHER,SINGLE,3,45008,17454,4,AUTOMOBILE,399435,9,51987,4,7
IB14031,37,FEMALE,SOFTWARE ENGINEER,MARRIED,5,55999,23999,5,AUTOMOBILE,60999,2,0,5,3


In [0]:
# Number of rows
num_rows1 = data_loan.count()
# Number of columns
num_columns1 = len(data_loan.columns)
print(f"Number of rows: {num_rows1}")
print(f"Number of columns: {num_columns1}")

Number of rows: 500
Number of columns: 15


In [0]:
# Data description
display(data_loan.describe())

summary,Customer_ID,Age,Gender,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill
count,500,500.0,500,500,500,500.0,468.0,481.0,500.0,500,500.0,500.0,500.0,500.0,500.0
mean,,40.946,,,,4.55,68339.49145299145,27533.180873180874,5.33,,,4.974,46357.55223880597,4.12,4.31
stddev,,10.19288348542722,,,,1.54280929509842,86796.49367750238,10209.599414813816,2.0487789021707443,,,2.4919356907781443,22291.17849784667,2.777233524949641,2.6160250714296875
min,1B14093,21.0,FEMALE,ACCOUNT MANAGER,MARRIED,2.0,28366.0,9000.0,2.0,AGRICULTURE,100000.0,1.0,0.0,0.0,0.0
max,IBI4921,60.0,MALE,TECHNICIAN,SINGLE,7.0,930000.0,62541.0,9.0,TRAVELLING,999698.0,9.0,90000.0,9.0,10.0


In [0]:
from pyspark.sql.functions import col, sum, when, lit

# Calculate null counts
null_counts = data_loan.select(
    [sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in data_loan.columns]
)
null_counts_dict = null_counts.collect()[0].asDict()
transposed_null_counts = spark.createDataFrame(
    [(key, value) for key, value in null_counts_dict.items()],
    schema=["Column", "Null Count"]
)
# Show the transposed DataFrame
transposed_null_counts.show()

+------------------+----------+
|            Column|Null Count|
+------------------+----------+
|       Customer_ID|         0|
|               Age|         0|
|            Gender|         0|
|        Occupation|         0|
|    Marital Status|         0|
|       Family Size|         0|
|            Income|        32|
|       Expenditure|        19|
|     Use Frequency|         0|
|     Loan Category|         0|
|       Loan Amount|         0|
|           Overdue|         0|
|       Debt Record|         0|
|   Returned Cheque|         0|
| Dishonour of Bill|         0|
+------------------+----------+



In [0]:
# Drop rows where any column has a NULL value
clean_loan = data_loan.dropna(how="any")

num_rows01 = data_loan.count()
num_rows02 = clean_loan.count()
print('With NULL')
print(f"Number of rows: {num_rows01}")
print("After Cleaning")
print(f"Number of rows: {num_rows02}")


With NULL
Number of rows: 500
After Cleaning
Number of rows: 449


### Transaction data

In [0]:
# Print Schema
data_txn.printSchema()

root
 |-- Account No: string (nullable = true)
 |-- TRANSACTION DETAILS: string (nullable = true)
 |-- VALUE DATE: string (nullable = true)
 |--  WITHDRAWAL AMT : double (nullable = true)
 |--  DEPOSIT AMT : double (nullable = true)
 |-- BALANCE AMT: double (nullable = true)



In [0]:
# Display data
data_txn.limit(10).display()

Account No,TRANSACTION DETAILS,VALUE DATE,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
409000611074',TRF FROM Indiaforensic SERVICES,29-Jun-17,,1000000.0,1000000.0
409000611074',TRF FROM Indiaforensic SERVICES,5-Jul-17,,1000000.0,2000000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,18-Jul-17,,500000.0,2500000.0
409000611074',TRF FRM Indiaforensic SERVICES,1-Aug-17,,3000000.0,5500000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,,500000.0,6000000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,,500000.0,6500000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,,500000.0,7000000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,,500000.0,7500000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,,500000.0,8000000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,,500000.0,8500000.0


In [0]:
# Number of rows
num_rows2 = data_txn.count()
# Number of columns
num_columns2 = len(data_txn.columns)
print(f"Number of rows: {num_rows2}")
print(f"Number of columns: {num_columns2}")

Number of rows: 116201
Number of columns: 6


In [0]:
# Data description
display(data_txn.describe())

summary,Account No,TRANSACTION DETAILS,VALUE DATE,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
count,116201,113702,116201,53549.0,62652.0,116201.0
mean,,3.675022545399418E15,,4489189.943506325,3806585.828440277,-1404852040.9591005
stddev,,1.49342569853758688E17,,10848504.204717929,8683093.407864038,534820182.3458226
min,1196428',(SR1239979079) REJ INVALI,1-Apr-17,0.01,0.01,-2045201142.0
max,409000611074',ZEN LEFIN PVT LTD1702245A,9-Sep-16,459447546.4,544800000.0,8500000.0


In [0]:
# Calculate null counts
null_counts1 = data_txn.select(
    [sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in data_txn.columns]
)
null_counts_dict1 = null_counts1.collect()[0].asDict()
transposed_null_counts1 = spark.createDataFrame(
    [(key, value) for key, value in null_counts_dict1.items()],
    schema=["Column", "Null Count"]
)
# Show the transposed DataFrame
transposed_null_counts1.show()

+-------------------+----------+
|             Column|Null Count|
+-------------------+----------+
|         Account No|         0|
|TRANSACTION DETAILS|      2499|
|         VALUE DATE|         0|
|    WITHDRAWAL AMT |     62652|
|       DEPOSIT AMT |     53549|
|        BALANCE AMT|         0|
+-------------------+----------+



In [0]:
# Drop rows where any column has a NULL value
clean_txn = data_txn.dropna(how="any")

num_rows11 = data_txn.count()
num_rows12 = clean_txn.count()
print('With NULL')
print(f"Number of rows: {num_rows11}")
print("After Cleaning")
print(f"Number of rows: {num_rows12}")

With NULL
Number of rows: 116201
After Cleaning
Number of rows: 0


In [0]:
# Fill missing values in specified columns with 0
txn_filled_num = data_txn.fillna({" WITHDRAWAL AMT ": 0.0, " DEPOSIT AMT ": 0.0})
txn_all_filled = txn_filled_num.fillna({"TRANSACTION DETAILS": "NA"})
# Show the updated DataFrame
txn_all_filled.limit(10).display()

Account No,TRANSACTION DETAILS,VALUE DATE,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
409000611074',TRF FROM Indiaforensic SERVICES,29-Jun-17,0.0,1000000.0,1000000.0
409000611074',TRF FROM Indiaforensic SERVICES,5-Jul-17,0.0,1000000.0,2000000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,18-Jul-17,0.0,500000.0,2500000.0
409000611074',TRF FRM Indiaforensic SERVICES,1-Aug-17,0.0,3000000.0,5500000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,0.0,500000.0,6000000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,0.0,500000.0,6500000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,0.0,500000.0,7000000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,0.0,500000.0,7500000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,0.0,500000.0,8000000.0
409000611074',FDRL/INTERNAL FUND TRANSFE,16-Aug-17,0.0,500000.0,8500000.0


In [0]:
# Calculate null counts
null_counts4 = txn_all_filled.select(
    [sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in txn_all_filled.columns]
)
null_counts_dict4 = null_counts4.collect()[0].asDict()
transposed_null_counts4 = spark.createDataFrame(
    [(key, value) for key, value in null_counts_dict4.items()],
    schema=["Column", "Null Count"]
)
# Show the transposed DataFrame
transposed_null_counts4.show()

+-------------------+----------+
|             Column|Null Count|
+-------------------+----------+
|         Account No|         0|
|TRANSACTION DETAILS|         0|
|         VALUE DATE|         0|
|    WITHDRAWAL AMT |         0|
|       DEPOSIT AMT |         0|
|        BALANCE AMT|         0|
+-------------------+----------+



### Credit data

In [0]:
# Print Schema
data_credit.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [0]:
# Display data
data_credit.limit(10).display()

RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,79084.1,0
6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,0,149756.71,1
7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,10062.8,0
8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,0,119346.88,1
9,15792365,He,501,France,Male,44,4,142051.07,2,1,74940.5,0
10,15592389,H?,684,France,Male,27,2,134603.88,1,1,71725.73,0


In [0]:
# Number of rows
num_rows3 = data_credit.count()
# Number of columns
num_columns3 = len(data_credit.columns)
print(f"Number of rows: {num_rows3}")
print(f"Number of columns: {num_columns3}")

Number of rows: 10000
Number of columns: 13


In [0]:
# Data description
display(data_credit.describe())

summary,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000,10000.0,10000,10000,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.5694,,650.5288,,,38.9218,5.0128,76485.88928799961,1.5302,0.5151,100090.2398809998,0.2037
stddev,2886.8956799071675,71936.18612274907,,96.65329873613037,,,10.487806451704587,2.892174377049684,62397.40520238599,0.5816543579989917,0.4997969284589181,57510.49281769821,0.4027685839948606
min,1.0,15565701.0,Abazu,350.0,France,Female,18.0,0.0,0.0,1.0,0.0,11.58,0.0
max,10000.0,15815690.0,Zuyeva,850.0,Spain,Male,92.0,10.0,250898.09,4.0,1.0,199992.48,1.0


In [0]:
# Calculate null counts
null_counts2 = data_credit.select(
    [sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in data_credit.columns]
)
null_counts_dict2 = null_counts2.collect()[0].asDict()
transposed_null_counts2 = spark.createDataFrame(
    [(key, value) for key, value in null_counts_dict2.items()],
    schema=["Column", "Null Count"]
)
# Show the transposed DataFrame
transposed_null_counts2.show()

+---------------+----------+
|         Column|Null Count|
+---------------+----------+
|      RowNumber|         0|
|     CustomerId|         0|
|        Surname|         0|
|    CreditScore|         0|
|      Geography|         0|
|         Gender|         0|
|            Age|         0|
|         Tenure|         0|
|        Balance|         0|
|  NumOfProducts|         0|
| IsActiveMember|         0|
|EstimatedSalary|         0|
|         Exited|         0|
+---------------+----------+



# Question & Solution

## In loandata.csv file

### 1. number of loans in each category

In [0]:
data_loan.groupBy("Loan Category").count().show()


+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [0]:
clean_loan.groupBy("Loan Category").count().show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   61|
|        TRAVELLING|   48|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   72|
|  EDUCATIONAL LOAN|   17|
|        AUTOMOBILE|   53|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   25|
|           DINNING|   11|
|          SHOPPING|   30|
|       RESTAURANTS|   37|
|       ELECTRONICS|   13|
|          BUILDING|    6|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   13|
+------------------+-----+



### 2. number of people who have taken more than 1 lack loan

In [0]:
from pyspark.sql.functions import col, regexp_replace
# since here , present in loan amount column we are replacing the comma 
# then cast it as integer
# Remove commas and cast the Loan Amount column to integer
loan_with_null_cast = data_loan.withColumn(
    "Loan Amount", 
    regexp_replace(col("Loan Amount"), ",", "").cast("int")
)
loan_with_null_cast.printSchema()
loan_02 = loan_with_null_cast.filter(col("Loan Amount") > 100000)
num_rows111 = loan_02.count()
print(f"Number of people taken more then 1 lakh in raw data: {num_rows111}")

root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: integer (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)

Number of rows: 450


In [0]:
from pyspark.sql.functions import col, regexp_replace
# since here , present in loan amount column we are replacing the comma 
# then cast it as integer
# Remove commas and cast the Loan Amount column to integer
loan_clean_cast = clean_loan.withColumn(
    "Loan Amount", 
    regexp_replace(col("Loan Amount"), ",", "").cast("int")
)
loan_clean_cast.printSchema()
loan_04 = loan_clean_cast.filter(col("Loan Amount") > 100000)
#display(loan_02)
num_rows112 = loan_04.count()
print(f"Number of people taken more then 1 lakh in clean data: {num_rows112}")

root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: integer (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)

Number of rows: 409


### 3. number of people with income greater than 60000 rupees

In [0]:
loan_05 = data_loan.filter(col("Income") > 60000)
num_rows114 = loan_05.count()
print(f"Number of people with income greater than 60000 rupees on raw data: {num_rows114}")

Number of people with income greater than 60000 rupees: 198


In [0]:
loan_06 = clean_loan.filter(col("Income") > 60000)
num_rows115 = loan_06.count()
print(f"Number of people with income greater than 60000 rupees on clean data: {num_rows115}")

Number of people with income greater than 60000 rupees on clean data: 192


### 4. number of people with 2 or more returned cheques and income less than 50000

In [0]:
loan_07 = data_loan.filter((col(" Returned Cheque") >=2)&(col("Income") <50000))
num_rows007 = loan_07.count()
print(f"No of people with returned cheq>=2 & salary<50000data (raw): {num_rows007}")

No of people with returned cheq>=2 & salary<50000data (raw): 137


In [0]:
loan_08 = clean_loan.filter((col(" Returned Cheque") >=2)&(col("Income") <50000))
num_rows008 = loan_08.count()
print(f"No of people with returned cheq>=2 & salary<50000data (clean): {num_rows008}")

No of people with returned cheq>=2 & salary<50000data (clean): 132


### 5. number of people with 2 or more returned cheques and are single

In [0]:
loan_09 = data_loan.filter((col(" Returned Cheque") >= 2) & (col("Marital Status") == "Single"))
num_rows009 = loan_09.count()
print(f"No of people with returned cheq>=2 & single: {num_rows009}")

No of people with returned cheq>=2 & single: 0


### 6. number of people with expenditure over 50000 a month

In [0]:
loan_10 = data_loan.filter(col("Expenditure") >50000)
num_rows010 = loan_10.count()
print(f"No of people with expenditure>50000 (raw): {num_rows010}")

No of people with expenditure>50000 (raw): 6


In [0]:
loan_11 = clean_loan.filter(col("Expenditure") >50000)
num_rows011 = loan_11.count()
print(f"No of people with expenditure>50000(clean): {num_rows011}")

No of people with expenditure>50000(clean): 6


### 7. number of members who are elgible for credit card

In [0]:
eligible_customers1 = data_loan.filter(
    (col("Income") > 20000) &  
    (col(" Returned Cheque") == 0) &  # No returned cheques
    (col(" Dishonour of Bill") == 0)
)
# Count the number of eligible members
eligible_count1 = eligible_customers1.count()
print(f"No of people eligible for loan (raw): {eligible_count1}")

3


In [0]:
eligible_customers2 = clean_loan.filter(
    (col("Income") > 20000) &  
    (col(" Returned Cheque") == 0) &  
    (col(" Dishonour of Bill") == 0)
)
# Count the number of eligible members
eligible_count2 = eligible_customers2.count()
print(f"No of people eligible for loan (clean): {eligible_count2}")

No of people eligible for loan (clean): 2


## In credit.csv file

### 1. credit card users in Spain

In [0]:
credit_01 = data_credit.filter(col("Geography") =='Spain')
count01 = credit_01.count()
print(f"No of Credit card users in Spain: {count01}")

No of Credit card users in Spain: 2477


### 2. number of members who are elgible and active in the bank

In [0]:
#Works based on certain assumptions
eligible_active_customers = data_credit.filter(
    (col("CreditScore") >= 600) &        # Credit score threshold
    (col("Balance") > 0) &              # Non-zero balance
    (col("EstimatedSalary") >= 20000) & # Minimum salary threshold
    (col("Exited") == 0) &              # Customer has not exited
    (col("IsActiveMember") == 1)        # Customer is active
)

# Count the number of eligible and active customers
eligible_active_count = eligible_active_customers.count()

print(f"Number of eligible and active customers: {eligible_active_count}")

Number of eligible and active customers: 1803


##  In Transactions file

### 1. Maximum withdrawal amount in transactions Minimum withdrawal amount of an account

In [0]:
from pyspark.sql.functions import col, max, min

# Use PySpark's max and min functions correctly
withdrawal_stats = txn_all_filled.agg(
    max(col(" WITHDRAWAL AMT ")).alias("MaxWithdrawal"),
    min(col(" WITHDRAWAL AMT ")).alias("MinWithdrawal")
)
display(withdrawal_stats)

MaxWithdrawal,MinWithdrawal
459447546.4,0.0


### 2. maximum deposit amount of an account

In [0]:
from pyspark.sql.functions import col, max, min

# Use PySpark's max and min functions correctly
deposit_stats = txn_all_filled.agg(
    max(col(" DEPOSIT AMT ")).alias("MaxDeposit"),
)
display(deposit_stats)

MaxDeposit
544800000.0


### 3. minimum deposit amount of an account

In [0]:
from pyspark.sql.functions import col, max, min

# Use PySpark's max and min functions correctly
deposit_stats1 = txn_all_filled.agg(
    min(col(" DEPOSIT AMT ")).alias("MinDeposit"),
)
display(deposit_stats1)

MinDeposit
0.0


### 4. sum of balance in every bank account

In [0]:
from pyspark.sql.functions import col, sum

# Group by "Account No" and calculate the sum of "BALANCE AMT"
balance_sum = txn_all_filled.groupBy("Account No").agg(
    sum(col("BALANCE AMT")).alias("TotalBalance")
)
display(balance_sum)

Account No,TotalBalance
409000438611',-2494865770683.3955
1196711',-16047649810127.5
1196428',-81418498130721.0
409000493210',-3275849521320.957
409000611074',1615533622.0
409000425051',-3772118411.649988
409000405747',-24310804706.700016
409000493201',1042083182.9499984
409000438620',-7122918679513.586
409000362497',-52860004792808.0


### 5. Number of transaction on each date

In [0]:
from pyspark.sql.functions import col, count

# Group by "VALUE DATE" and calculate the COUNT
txn_c = txn_all_filled.groupBy("VALUE DATE").    count().alias("Transaction count")
display(txn_c)

VALUE DATE,count
23-Dec-16,143
7-Feb-19,98
21-Jul-15,80
9-Sep-15,91
17-Jan-15,16
18-Nov-17,53
21-Feb-18,77
20-Mar-18,71
19-Apr-18,71
21-Jun-16,97


### 6. List of customers with withdrawal amount more than 1 lakh

In [0]:
cust = txn_all_filled.filter(col(" WITHDRAWAL AMT ") >100000.0)
display(cust)

Account No,TRANSACTION DETAILS,VALUE DATE,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
409000611074',INDO GIBL Indiaforensic STL01071,16-Aug-17,133900.0,0.0,8366100.0
409000611074',INDO GIBL Indiaforensic STL04071,16-Aug-17,195800.0,0.0,8147300.0
409000611074',INDO GIBL Indiaforensic STL10071,16-Aug-17,143800.0,0.0,7781600.0
409000611074',INDO GIBL Indiaforensic STL11071,16-Aug-17,331650.0,0.0,7449950.0
409000611074',INDO GIBL Indiaforensic STL12071,16-Aug-17,129000.0,0.0,7320950.0
409000611074',INDO GIBL Indiaforensic STL13071,16-Aug-17,230013.0,0.0,7090937.0
409000611074',INDO GIBL Indiaforensic STL14071,16-Aug-17,367900.0,0.0,6723037.0
409000611074',INDO GIBL Indiaforensic STL15071,16-Aug-17,108000.0,0.0,6615037.0
409000611074',INDO GIBL Indiaforensic STL17071,16-Aug-17,141000.0,0.0,6409237.0
409000611074',INDO GIBL Indiaforensic STL22071,16-Aug-17,206000.0,0.0,5959817.0
