In [1]:
sc

In [2]:
spark

In [3]:
bank_customer_data = spark.read.json("file:///home/hadoop/Downloads/bank_edited.json", multiLine= True)

In [4]:
bank_customer_data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [5]:
bank_customer_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



#### 1.Display Max,Min and Mean Age of the targeted customer.

In [6]:
bank_customer_data.createOrReplaceTempView("banktable")

In [7]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [8]:
bank_customer_data.select(max("age")).show()

+--------+
|max(age)|
+--------+
|      95|
+--------+



In [9]:
bank_customer_data.select(min("age")).show()

+--------+
|min(age)|
+--------+
|      18|
+--------+



In [10]:
bank_customer_data.select(round(mean("age"))).show()

+------------------+
|round(avg(age), 0)|
+------------------+
|              41.0|
+------------------+



In [11]:
#SparkSql - alternative method

spark.sql("select max(age), min(age), mean(age) from banktable").show()

+--------+--------+-----------------+
|max(age)|min(age)|         avg(age)|
+--------+--------+-----------------+
|      95|      18|40.93621021432837|
+--------+--------+-----------------+



#### 2.Check the quality of Customers by analyzing targeted customers min,max mean, median balance

In [18]:
spark.sql("select max(balance) as Max, min(abs(balance)) as Min, mean(balance) as Mean, percentile_approx(balance,0.5) as median from banktable").show()

+------+---+------------------+------+
|   Max|Min|              Mean|median|
+------+---+------------------+------+
|102127|  0|1362.2720576850766|   448|
+------+---+------------------+------+



#### 3.Check if age matters in marketing subscription for Term Deposit Scheme

In [20]:
spark.sql("select age, count(*) as customer_count from banktable where y = 'yes' \
           group by age order by customer_count desc").show()

+---+--------------+
|age|customer_count|
+---+--------------+
| 32|           221|
| 30|           217|
| 33|           210|
| 35|           209|
| 31|           206|
| 34|           198|
| 36|           195|
| 29|           171|
| 37|           170|
| 28|           162|
| 38|           144|
| 39|           143|
| 27|           141|
| 26|           134|
| 41|           120|
| 46|           118|
| 40|           116|
| 25|           113|
| 47|           113|
| 42|           111|
+---+--------------+
only showing top 20 rows



#### 4.Calculate Marketing Success Rate

In [44]:
spark.sql("select round((select count(*) from banktable where y = 'yes')*100/count(*),2) as success_rate from banktable").show()

+------------+
|success_rate|
+------------+
|        11.7|
+------------+



In [48]:
spark.sql("select(select count(*) from banktable where y = 'no')*100/count(*) as failure_rate from banktable").show()

+-----------------+
|     failure_rate|
+-----------------+
|88.30151954170445|
+-----------------+



#### 6.Check if marital status matters in marketing subscription for term deposit schema

In [84]:
spark.sql("select marital, count(*) as Marital_Count from banktable where y='yes' group by marital order by Marital_Count").show()

+--------+-------------+
| marital|Marital_Count|
+--------+-------------+
|divorced|          622|
|  single|         1912|
| married|         2755|
+--------+-------------+



### 8. Check if age and marital status together mattered for subsciption

In [91]:
spark.sql("select age,marital, count(*) as success from banktable where y = 'yes' group by age,marital order by success desc").show()

+---+-------+-------+
|age|marital|success|
+---+-------+-------+
| 30| single|    151|
| 28| single|    138|
| 29| single|    133|
| 32| single|    124|
| 26| single|    121|
| 34|married|    118|
| 31| single|    111|
| 27| single|    110|
| 35|married|    101|
| 36|married|    100|
| 25| single|     99|
| 37|married|     98|
| 33|married|     97|
| 33| single|     97|
| 39|married|     87|
| 32|married|     87|
| 38|married|     86|
| 35| single|     84|
| 47|married|     83|
| 46|married|     80|
+---+-------+-------+
only showing top 20 rows



### Compute Success Rate of each age and marital status

In [87]:
spark.sql("select age,marital,count(case when y='yes' Then 1 else null end)*100/count(*) as Total_Success_Rate from banktable group by age,marital order by Total_Success_Rate desc").show()

+---+--------+------------------+
|age| marital|Total_Success_Rate|
+---+--------+------------------+
| 93| married|             100.0|
| 68|divorced|             100.0|
| 95|divorced|             100.0|
| 92| married|             100.0|
| 90|divorced|             100.0|
| 85|divorced|             100.0|
| 87|divorced|             100.0|
| 86|  single|             100.0|
| 67|divorced|              87.5|
| 62|divorced| 83.33333333333333|
| 85| married|              75.0|
| 76|divorced|              75.0|
| 71|divorced| 72.72727272727273|
| 87| married| 66.66666666666667|
| 84| married| 66.66666666666667|
| 73|divorced| 66.66666666666667|
| 77|divorced|              60.0|
| 18|  single|58.333333333333336|
| 63|divorced|57.142857142857146|
| 73| married| 52.77777777777778|
+---+--------+------------------+
only showing top 20 rows



In [97]:
spark.sql("""
select age,marital, count(*) as totalCount,
SUM(case when y = 'yes' then 1 else 0 end) as success_count,
(SUM(case when y = 'yes' then 1 else 0 end)*100/count(*)) as success_percentage
from banktable group by age,marital order by success_percentage desc""").show()

+---+--------+----------+-------------+------------------+
|age| marital|totalCount|success_count|success_percentage|
+---+--------+----------+-------------+------------------+
| 85|divorced|         1|            1|             100.0|
| 92| married|         2|            2|             100.0|
| 68|divorced|         6|            6|             100.0|
| 90|divorced|         2|            2|             100.0|
| 93| married|         2|            2|             100.0|
| 95|divorced|         1|            1|             100.0|
| 87|divorced|         1|            1|             100.0|
| 86|  single|         1|            1|             100.0|
| 67|divorced|         8|            7|              87.5|
| 62|divorced|         6|            5| 83.33333333333333|
| 76|divorced|         8|            6|              75.0|
| 85| married|         4|            3|              75.0|
| 71|divorced|        11|            8| 72.72727272727273|
| 84| married|         6|            4| 66.6666666666666

### 10.Do features engineering for bank inverstment scheme and find effect of age on the campaign

In [104]:
spark.sql("""
select age_category, count(*) as subscriber_count from
(select case when age < 25 then "Teenager"
            when age >=25 and age<=35 then "Adult"
            when age>35 and age<=55 then "Middle Aged"
            else 'old' 
        end as age_category
from banktable where y = 'yes') group by age_category order by subscriber_count desc
""").show()

+------------+----------------+
|age_category|subscriber_count|
+------------+----------------+
| Middle Aged|            2194|
|       Adult|            1982|
|         old|             906|
|    Teenager|             207|
+------------+----------------+



### Alternative method by creating UDF

In [105]:
from pyspark.sql.functions import udf

In [106]:
age_range = udf(lambda age: "Teenager" if age<25 else 
                             "Adult" if(age >=25 and age<=35) else
                             "Middle Aged" if (age>=35 and age<55) else
                             "old"
               )

In [107]:
bank_customer_df = bank_customer_data.withColumn('age_catergory',age_range(bank_customer_data.age))

In [108]:
bank_customer_df.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+-------------+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|age_catergory|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+-------------+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|          old|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|  Middle Aged|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|        Adult|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|

In [110]:
bank_customer_df.createOrReplaceTempView('newBankTable')

In [115]:
spark.sql("select age_catergory, count(*) as success_count from newBankTable group by age_catergory order by success_count desc").show()

+-------------+-------------+
|age_catergory|success_count|
+-------------+-------------+
|  Middle Aged|        22598|
|        Adult|        16098|
|          old|         5706|
|     Teenager|          809|
+-------------+-------------+



### Write a query to show distributed probability rate for each age category