# Data Analysis using Pyspark

* NAB Customer Remediation Events
As part of a customer remediation event, once the impacted population has been determined, extensive data analysis is needed to understand the sizing, the financial impact, and the total refund estimate.
The analysis of the impacted population is done with PySpark transformations: filtering, aggregation, and sorting.


In [1]:
# Start (or reuse) a Spark session on YARN and load the impacted population
# for remediation event 482.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")  # presumably lets Spark pick a free UI port -- confirm
    .appName("remediation_482")
    .master("yarn")
    .getOrCreate()
)

# Placeholder: replace with the real location of the impacted-population data.
event482_path = "/file path for impacted population"

# The original call was `parquet(///event482_path)`, which is a syntax error;
# the path variable is passed directly instead.
population = spark.read.parquet(event482_path)

In [None]:
# Count impacts where the next repayment was deleted but not delayed
# (SQL-string form of the predicate).
deleted_not_delayed = population.filter(
    "IsRepayDeleted = 'YES' AND IsRepayDelayed = 'NO'"
)
deleted_not_delayed.count()

In [None]:
from pyspark.sql.functions import col

In [None]:
# Same count as the SQL-string filter, expressed with Column objects.
repay_deleted = col("IsRepayDeleted") == "YES"
repay_not_delayed = col("IsRepayDelayed") == "NO"
population.filter(repay_deleted & repay_not_delayed).count()

In [None]:
from pyspark.sql.functions import col, concat, lpad, lit, count, sum, expr

# Each count is printed explicitly: a notebook only displays the value of a
# cell's last expression, so bare `.count()` calls in the middle of a cell
# run a Spark job and then throw the result away.

# Number of impacts whose repayments were delayed beyond the threshold.
# NOTE(review): the original comment says "more than 2 months" while the
# filter is diff_payment_date > 70 -- presumably days; confirm the unit.
delayed_over_threshold = population.filter(col("diff_payment_date") > 70).count()
print("Impacts with diff_payment_date > 70:", delayed_over_threshold)

# Impacts whose facility action is one of the pause/hardship codes below
# (COVID repayment pause or other hardship arrangements).
hardship_count = population.filter(
    "facility_action IN ('PAU', 'HRD', 'FIN', 'MAN', 'COV')"
).count()
print("Impacts in COVID pause / hardship:", hardship_count)

# Delayed repayments with a PaymentDate starting '200801', after deriving a
# zero-padded YYYYMM column from Year and Month.
# The original statement never closed the withColumn(...) call, which is a
# syntax error; the parenthesis is closed here.
# NOTE(review): the derived COVID_PaymentDelayed column is not used by the
# filter (it tests PaymentDate), and the original comment speaks of *deleted*
# payments while the filter tests IsRepayDelayed -- confirm the intended
# predicate with the analysis owner.
covid_delayed_count = (
    population
    .withColumn(
        "COVID_PaymentDelayed",
        concat(col("Year"), lpad(col("Month"), 2, "0")),
    )
    .filter("IsRepayDelayed = 'YES' AND PaymentDate LIKE '200801%'")
    .count()
)
print("Delayed repayments with PaymentDate LIKE '200801%':", covid_delayed_count)

# Overall totals across the whole population: number of events, and how many
# of them had repayments deleted / delayed.
deleted_flag = expr("CASE WHEN IsRepayDeleted = 'YES' THEN 1 ELSE 0 END")
delayed_flag = expr("CASE WHEN IsRepayDelayed = 'YES' THEN 1 ELSE 0 END")

overall_summary = population.agg(
    count(lit(1)).alias("EventCount"),
    sum(deleted_flag).alias("Count_Repay_deleted"),
    sum(delayed_flag).alias("Count_Repay_delayed"),
)
overall_summary.show()

In [None]:
##Grouping by run date 

from pyspark.sql.functions import lit, concat, lpad
# Build a YYYY-MM-DD style key from the audit year / month / day columns,
# zero-padding month and day to two characters.
audit_date = concat(
    "Audit_Year",
    lit("-"),
    lpad("Audit_Month", 2, "0"),
    lit("-"),
    lpad("Audit_day", 2, "0"),
).alias("AuditDate")

# 1/0 indicators so that summing them yields per-group counts.
deleted_flag = expr("CASE WHEN IsRepayDeleted = 'YES' THEN 1 ELSE 0 END")
delayed_flag = expr("CASE WHEN IsRepayDelayed = 'YES' THEN 1 ELSE 0 END")

# Per-audit-date impact counts, newest date first.
summary_by_audit_date = (
    population
    .groupBy(audit_date)
    .agg(
        count(lit(1)).alias("Impacts_Count"),
        sum(deleted_flag).alias("Count_Repay_deleted"),
        sum(delayed_flag).alias("Count_Repay_delayed"),
    )
    .orderBy(col("AuditDate").desc())
)
summary_by_audit_date.show()