## Creating the Spark Session and Loading the Datasets

In [None]:
from pyspark.sql import SparkSession
from google.colab import drive
# Start (or reuse) the Spark session and make the Google Drive files visible.
spark = SparkSession.builder.appName('OnlineBankingAnalysis').getOrCreate()
drive.mount('/content/drive')

# Single place to change when the datasets move; all three CSVs live here.
DATA_DIR = '/content/drive/MyDrive/case_study_dataset'


def load_csv(file_name):
    """Read one CSV from DATA_DIR with a header row and inferred column types."""
    return spark.read.csv(f'{DATA_DIR}/{file_name}', header=True, inferSchema=True)


loan_df = load_csv('loan.csv')
credit_df = load_csv('credit_card.csv')
txn_df = load_csv('txn.csv')

# Preview the first rows of each dataset.
loan_df.show(5)
credit_df.show(5)
txn_df.show(5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|     50,000|      3|      33,999|   

In [None]:
# Print the inferred schema of each dataset, in the order they were loaded.
for frame in (loan_df, credit_df, txn_df):
    frame.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: dou

## Cleaning the Dataset Before Analyzing

### Clean loan data

In [None]:

from pyspark.sql.functions import regexp_replace, col

# "Returned Cheque" and "Debt Record" are loaded with a leading space in their
# column names; rename them so they can be referenced without it.
# NOTE: " Dishonour of Bill" is not renamed here and keeps its leading space.
for padded_name, clean_name in (
    (" Returned Cheque", "Returned Cheque"),
    (" Debt Record", "Debt Record"),
):
    loan_df = loan_df.withColumnRenamed(padded_name, clean_name)

# Remove the comma separators from the amount columns and store them as doubles.
for amount_column in ("Loan Amount", "Income", "Expenditure", "Debt Record"):
    loan_df = loan_df.withColumn(
        amount_column, regexp_replace(amount_column, ",", "").cast("double")
    )

loan_df = loan_df.withColumn("Returned Cheque", col("Returned Cheque").cast("int"))

loan_df.show(5)


+-----------+---+------+------------+--------------+-----------+-------+-----------+-------------+-------------+-----------+-------+-----------+---------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size| Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue|Debt Record|Returned Cheque| Dishonour of Bill|
+-----------+---+------+------------+--------------+-----------+-------+-----------+-------------+-------------+-----------+-------+-----------+---------------+------------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4|50000.0|    22199.0|            6|      HOUSING|  1000000.0|      5|    42898.0|              6|                 9|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6|51000.0|    19999.0|            4|     SHOPPING|    50000.0|      3|    33999.0|              1|                 5|
|    IB14012| 30|FEMALE|     DENTIST|        SINGLE|          3|58450.0|    27675.0|            5|  

### Clean transaction data

In [None]:
# Strip the padding spaces from the amount column names.
for padded_name, clean_name in (
    (" WITHDRAWAL AMT ", "WITHDRAWAL AMT"),
    (" DEPOSIT AMT ", "DEPOSIT AMT"),
    (" BALANCE AMT", "BALANCE AMT"),
):
    txn_df = txn_df.withColumnRenamed(padded_name, clean_name)

# How many transactions have no withdrawal amount recorded?
txn_df.where(col("WITHDRAWAL AMT").isNull()).count()

62652

In [None]:
# Treat a missing withdrawal or deposit amount as zero.
txn_df = txn_df.na.fill({"WITHDRAWAL AMT": 0.0, "DEPOSIT AMT": 0.0})
txn_df.show()

+-------------+--------------------+----------+--------------+-----------+-----------+
|   Account No| TRANSACTION DETAILS|VALUE DATE|WITHDRAWAL AMT|DEPOSIT AMT|BALANCE AMT|
+-------------+--------------------+----------+--------------+-----------+-----------+
|409000611074'|TRF FROM  Indiafo...| 29-Jun-17|           0.0|  1000000.0|  1000000.0|
|409000611074'|TRF FROM  Indiafo...|  5-Jul-17|           0.0|  1000000.0|  2000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 18-Jul-17|           0.0|   500000.0|  2500000.0|
|409000611074'|TRF FRM  Indiafor...|  1-Aug-17|           0.0|  3000000.0|  5500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|           0.0|   500000.0|  6000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|           0.0|   500000.0|  6500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|           0.0|   500000.0|  7000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|           0.0|   500000.0|  7500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug

### In Loan CSV


#### Number of loans in each category

In [None]:
# One row per loan category with the number of loans recorded under it.
loans_per_category = loan_df.groupBy("Loan Category").count()
loans_per_category.show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



#### Number of people who have taken a loan of more than 1 lakh

In [None]:
from pyspark.sql.functions import col

# Count the rows whose loan amount exceeds 1 lakh (100000).
is_large_loan = col("Loan Amount") > 100000
loan_df.where(is_large_loan).count()

450

#### Number of people with income greater than 60000 rupees

In [None]:
# Count the rows with an income above 60000.
has_high_income = col("Income") > 60000
loan_df.where(has_high_income).count()

198

#### Number of people with 2 or more returned cheques and income less than 50000

In [None]:
# Count the rows with at least 2 returned cheques and an income below 50000.
has_returned_cheques = col("Returned Cheque") >= 2
has_low_income = col("Income") < 50000
loan_df.where(has_returned_cheques & has_low_income).count()

137

#### Number of people who are single and have 2 or more returned cheques

In [None]:
# Count the rows with at least 2 returned cheques whose marital status is SINGLE.
has_returned_cheques = col("Returned Cheque") >= 2
is_single = col("Marital Status") == "SINGLE"
loan_df.where(has_returned_cheques & is_single).count()

111

#### Number of people with expenditure over 50000 a month

In [None]:
# Count the rows with an expenditure above 50000.
has_high_expenditure = col("Expenditure") > 50000
loan_df.where(has_high_expenditure).count()

6

#### Number of members who are eligible for a credit card

Assuming a debt record below 50000 as the eligibility criterion for a credit card

In [None]:
# Count the rows whose debt record is below the assumed 50000 eligibility limit.
is_eligible_for_card = col("Debt Record") < 50000
loan_df.where(is_eligible_for_card).count()

331

## Credit Data

In [None]:
# Show the column names and types of the credit card dataset.
credit_df.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



### Credit card users in Spain

In [None]:
# Count the credit card rows whose geography is Spain.
is_in_spain = col("Geography") == "Spain"
credit_df.where(is_in_spain).count()

2477

### Number of members who are eligible and active in the bank

Assuming a credit score above 650 as the eligibility criterion

In [None]:
# Count the active members whose credit score is above the assumed 650 threshold.
is_active = col("IsActiveMember") == 1
has_eligible_score = col("CreditScore") > 650
credit_df.where(is_active & has_eligible_score).count()

2655

## Transaction Data

In [None]:
# Show the column names and types of the transaction dataset after cleaning.
txn_df.printSchema()

root
 |-- Account No: string (nullable = true)
 |-- TRANSACTION DETAILS: string (nullable = true)
 |-- VALUE DATE: string (nullable = true)
 |-- WITHDRAWAL AMT: double (nullable = false)
 |-- DEPOSIT AMT: double (nullable = false)
 |-- BALANCE AMT: double (nullable = true)



#### Maximum withdrawal amount in transactions

In [None]:
# Import the functions module under an alias instead of `from ... import max`,
# which would replace Python's built-in max() for the rest of the notebook.
from pyspark.sql import functions as F

# Largest single withdrawal amount across all transactions.
txn_df.agg(F.max("WITHDRAWAL AMT")).show()

+-------------------+
|max(WITHDRAWAL AMT)|
+-------------------+
|      4.594475464E8|
+-------------------+



#### Minimum withdrawal amount of an account


In [None]:
# Import the functions module under an alias instead of `from ... import min`,
# which would replace Python's built-in min() for the rest of the notebook.
from pyspark.sql import functions as F

# Smallest withdrawal amount across all transactions (missing values were
# filled with 0.0 during cleaning).
txn_df.agg(F.min("WITHDRAWAL AMT")).show()

+-------------------+
|min(WITHDRAWAL AMT)|
+-------------------+
|                0.0|
+-------------------+



#### Maximum deposit amount of an account


In [None]:
# Import the functions module under an alias instead of `from ... import max`,
# which would replace Python's built-in max() for the rest of the notebook.
from pyspark.sql import functions as F

# Largest single deposit amount across all transactions.
txn_df.agg(F.max("DEPOSIT AMT")).show()

+----------------+
|max(DEPOSIT AMT)|
+----------------+
|         5.448E8|
+----------------+



#### Minimum Deposit amount of an account


In [None]:
# Import the functions module under an alias instead of `from ... import min`,
# which would replace Python's built-in min() for the rest of the notebook.
from pyspark.sql import functions as F

# Smallest deposit amount across all transactions (missing values were
# filled with 0.0 during cleaning).
txn_df.agg(F.min("DEPOSIT AMT")).show()

+----------------+
|min(DEPOSIT AMT)|
+----------------+
|             0.0|
+----------------+



#### Sum of balance in every bank account

In [None]:
# Import the functions module under an alias instead of `from ... import sum`,
# which would replace Python's built-in sum() for the rest of the notebook.
from pyspark.sql import functions as F

# Add up the BALANCE AMT values of every transaction row, per account number.
txn_df.groupBy("Account No").agg(F.sum("BALANCE AMT").alias("Total Balance")).show()

+-------------+--------------------+
|   Account No|       Total Balance|
+-------------+--------------------+
|409000438611'|-2.49486577068339...|
|     1196711'|-1.60476498101275E13|
|     1196428'| -8.1418498130721E13|
|409000493210'|-3.27584952132095...|
|409000611074'|       1.615533622E9|
|409000425051'|-3.77211841164998...|
|409000405747'|-2.43108047067000...|
|409000362497'| -5.2860004792808E13|
|409000493201'|1.0420831829499985E9|
|409000438620'|-7.12291867951358...|
+-------------+--------------------+



#### Number of transactions on each date

In [None]:
# One row per value date with the number of transactions on that date.
txns_per_date = txn_df.groupBy("VALUE DATE").count()
txns_per_date.show()

+----------+-----+
|VALUE DATE|count|
+----------+-----+
| 23-Dec-16|  143|
|  7-Feb-19|   98|
| 21-Jul-15|   80|
|  9-Sep-15|   91|
| 17-Jan-15|   16|
| 18-Nov-17|   53|
| 21-Feb-18|   77|
| 20-Mar-18|   71|
| 19-Apr-18|   71|
| 21-Jun-16|   97|
| 17-Oct-17|  101|
|  3-Jan-18|   70|
|  8-Jun-18|  223|
| 15-Dec-18|   62|
|  8-Aug-16|   97|
| 17-Dec-16|   74|
|  3-Sep-15|   83|
| 21-Jan-16|   76|
|  4-May-18|   92|
|  7-Sep-17|   94|
+----------+-----+
only showing top 20 rows



#### List of customers with withdrawal amount more than 1 lakh

In [None]:
# Keep the transactions with a withdrawal above 1 lakh (100000), then list the
# distinct (account, amount) pairs.
large_withdrawals = txn_df.where(col("WITHDRAWAL AMT") > 100000)
large_withdrawals.select("Account No", "WITHDRAWAL AMT").distinct().show()

+-------------+--------------+
|   Account No|WITHDRAWAL AMT|
+-------------+--------------+
|409000611074'|      274600.0|
|409000493201'|     1500000.0|
|409000493201'|     199604.27|
|409000438620'|      186604.0|
|409000438620'|   3.6675558E7|
|     1196711'|     7530283.0|
|     1196428'|      812361.0|
|     1196428'|     6348768.0|
|     1196428'|    3043151.63|
|409000362497'|      576954.0|
|409000362497'|     3423962.0|
|409000362497'| 3.144482503E7|
|     1196428'|    4441827.47|
|409000611074'|      145450.0|
|409000493201'|     119401.28|
|     1196711'|      628945.0|
|     1196428'|     289670.04|
|409000362497'| 3.483281361E7|
|409000362497'| 4.289763641E7|
|409000362497'| 2.678162613E7|
+-------------+--------------+
only showing top 20 rows

