In [None]:
# Loading of datasets
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Loading and Analysing Data").getOrCreate()
credit_cardDF = spark.read.csv("/content/credit card.csv", header = True, inferSchema = True)
loanDF = spark.read.csv("/content/loan.csv", header = True, inferSchema = True)
txnDF = spark.read.csv("/content/txn.csv", header = True, inferSchema = True)

In [None]:
# Number of loans in each category
loanDF.groupBy("Loan Category").count().show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [None]:
from pyspark.sql.functions import regexp_replace, col

loanDF = loanDF.withColumn("Loan_Amount",
            regexp_replace(col("Loan Amount"), ",", "").cast("int"))

In [None]:
# Number of people who have taken more than 1 lack loan
loanDF.where(loanDF["Loan_Amount"] > 100000).count()

450

In [None]:
# Number of people with income greater than 60000 rupees
loanDF.where(loanDF["Income"] > 50000).count()

284

In [None]:
# Number of people with 2 or more returned cheques and income less than 50000
loanDF.filter((loanDF[" Returned Cheque"] >= 2) & (loanDF["Income"] < 50000)).count()

137

In [None]:
# Number of people with 2 or more returned cheques and are single
loanDF.filter((loanDF[" Returned Cheque"] >= 2) & (loanDF["Marital Status"] == "SINGLE")).count()

111

In [None]:
# Number of people with expenditure over 50000 a month
loanDF.filter(loanDF["Expenditure"] > 50000).count()

6

In [38]:
# Credit card users in Spain
credit_cardDF.filter(credit_cardDF["Geography"] == "Spain").show()

+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|RowNumber|CustomerId|  Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|        2|  15647311|     Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|             1|      112542.58|     0|
|        5|  15737888| Mitchell|        850|    Spain|Female| 43|     2|125510.82|            1|             1|        79084.1|     0|
|        6|  15574012|      Chu|        645|    Spain|  Male| 44|     8|113755.78|            2|             0|      149756.71|     1|
|       12|  15737173|  Andrews|        497|    Spain|  Male| 24|     3|      0.0|            2|             0|       76390.01|     0|
|       15|  15600882|    Scott|        635|    Spain|F

In [42]:
# Number of members who are eligible and active in the bank
credit_cardDF.filter(credit_cardDF["IsActiveMember"] == 1).count()

5151

In [53]:
# Maximum withdrawal amount in transactions
from pyspark.sql.functions import max
txnDF.select(max(" WITHDRAWAL AMT ").alias("Max Withdrawal Amount").cast("long")).show()

+---------------------+
|Max Withdrawal Amount|
+---------------------+
|            459447546|
+---------------------+



In [54]:
# Minimum withdrawal amount in transactions
from pyspark.sql.functions import min
txnDF.select(min(" WITHDRAWAL AMT ").alias("Min Withdrawal Amount")).show()

+---------------------+
|Min Withdrawal Amount|
+---------------------+
|                 0.01|
+---------------------+



In [56]:
# Maximum Deposit Amount of an Account
txnDF.select(max(" DEPOSIT AMT ").alias("Max Deposit Amt").cast("long")).show()

+---------------+
|Max Deposit Amt|
+---------------+
|      544800000|
+---------------+



In [58]:
# Minimum Deposit Amount of an Account
txnDF.select(min(" DEPOSIT AMT ").alias("Min Deposit Amt")).show()

+---------------+
|Min Deposit Amt|
+---------------+
|           0.01|
+---------------+



In [71]:
# Sum of balance in every bank account
from pyspark.sql.functions import sum
txnDF.groupBy("Account No").agg(sum("BALANCE AMT").cast("long").alias("Total Balance")).show()

+-------------+---------------+
|   Account No|  Total Balance|
+-------------+---------------+
|409000438611'| -2494865770683|
|     1196711'|-16047649810127|
|     1196428'|-81418498130721|
|409000493210'| -3275849521320|
|409000611074'|     1615533622|
|409000425051'|    -3772118411|
|409000405747'|   -24310804706|
|409000362497'|-52860004792808|
|409000493201'|     1042083182|
|409000438620'| -7122918679513|
+-------------+---------------+



In [79]:
# Number of transaction on each date
from pyspark.sql.functions import count
txnDF.groupBy("VALUE DATE").count().withColumnRenamed("count", "Number Of Transactions").show()

+----------+----------------------+
|VALUE DATE|Number Of Transactions|
+----------+----------------------+
| 23-Dec-16|                   143|
|  7-Feb-19|                    98|
| 21-Jul-15|                    80|
|  9-Sep-15|                    91|
| 17-Jan-15|                    16|
| 18-Nov-17|                    53|
| 21-Feb-18|                    77|
| 20-Mar-18|                    71|
| 19-Apr-18|                    71|
| 21-Jun-16|                    97|
| 17-Oct-17|                   101|
|  3-Jan-18|                    70|
|  8-Jun-18|                   223|
| 15-Dec-18|                    62|
|  8-Aug-16|                    97|
| 17-Dec-16|                    74|
|  3-Sep-15|                    83|
| 21-Jan-16|                    76|
|  4-May-18|                    92|
|  7-Sep-17|                    94|
+----------+----------------------+
only showing top 20 rows



In [83]:
# List of customers with withdrawal amount more than 1 lakh
txnDF.filter(txnDF[" WITHDRAWAL AMT "] > 100000).select("Account No").show()

+-------------+
|   Account No|
+-------------+
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
+-------------+
only showing top 20 rows

