In [0]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-63da7bcc-796c-4df1-80b4-4c24e966df28/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
bank_marketing = fetch_ucirepo(id=222) 
  
# data (as pandas dataframes) 
X = bank_marketing.data.features 
y = bank_marketing.data.targets 

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from  pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pandas as pd

In [0]:
spark = SparkSession.Builder().appName('app').getOrCreate()

In [0]:
data = pd.concat((X,y),axis=1)

df = spark.createDataFrame(data)

df.show(5)

+---+------------+-------+---------+-------+-------+-------+----+-------+-----------+-----+--------+--------+-----+--------+--------+---+
|age|         job|marital|education|default|balance|housing|loan|contact|day_of_week|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+-------+---------+-------+-------+-------+----+-------+-----------+-----+--------+--------+-----+--------+--------+---+
| 58|  management|married| tertiary|     no|   2143|    yes|  no|   null|          5|  may|     261|       1|   -1|       0|    null| no|
| 44|  technician| single|secondary|     no|     29|    yes|  no|   null|          5|  may|     151|       1|   -1|       0|    null| no|
| 33|entrepreneur|married|secondary|     no|      2|    yes| yes|   null|          5|  may|      76|       1|   -1|       0|    null| no|
| 47| blue-collar|married|     null|     no|   1506|    yes|  no|   null|          5|  may|      92|       1|   -1|       0|    null| no|
| 33|        null| single|     nul

In [0]:
for feature in df.columns:
    print(feature,df.where(df[feature].isNull()).count()/df.count() * 100)

age 0.0
job 0.6370131162770122
marital 0.0
education 4.1074074893278185
default 0.0
balance 0.0
housing 0.0
loan 0.0
contact 28.798301298356595
day_of_week 0.0
month 0.0
duration 0.0
campaign 0.0
pdays 0.0
previous 0.0
poutcome 81.74780473778506
y 0.0


In [0]:
feature_to_drop = ['contact','poutcome','month']
df = df.drop(*feature_to_drop)

In [0]:
df.show(5)

+---+------------+-------+---------+-------+-------+-------+----+-----------+-----+--------+--------+-----+--------+---+
|age|         job|marital|education|default|balance|housing|loan|day_of_week|month|duration|campaign|pdays|previous|  y|
+---+------------+-------+---------+-------+-------+-------+----+-----------+-----+--------+--------+-----+--------+---+
| 58|  management|married| tertiary|     no|   2143|    yes|  no|          5|  may|     261|       1|   -1|       0| no|
| 44|  technician| single|secondary|     no|     29|    yes|  no|          5|  may|     151|       1|   -1|       0| no|
| 33|entrepreneur|married|secondary|     no|      2|    yes| yes|          5|  may|      76|       1|   -1|       0| no|
| 47| blue-collar|married|     null|     no|   1506|    yes|  no|          5|  may|      92|       1|   -1|       0| no|
| 33|        null| single|     null|     no|      1|     no|  no|          5|  may|     198|       1|   -1|       0| no|
+---+------------+-------+------

In [0]:
job_mode_value = df.groupBy('job').count().orderBy('count',ascending=False).first()[0]
education_mode_value = df.groupBy('education').count().orderBy('count',ascending=False).first()[0]

In [0]:
df = df.fillna(job_mode_value, subset=['job'])
df = df.fillna(education_mode_value, subset=['education'])

In [0]:
for feature in df.columns:
    print(feature,df.where(df[feature].isNull()).count()/df.count() * 100)

age 0.0
job 0.0
marital 0.0
education 0.0
default 0.0
balance 0.0
housing 0.0
loan 0.0
day_of_week 0.0
month 0.0
duration 0.0
campaign 0.0
pdays 0.0
previous 0.0
y 0.0


In [0]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- job: string (nullable = false)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = false)
 |-- default: string (nullable = true)
 |-- balance: long (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- day_of_week: long (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- pdays: long (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



In [0]:
categorical_features = list()
for value in df.dtypes:
    if value[1] == 'string':
        categorical_features.append(value[0])
        print(value[0])
        df.groupBy(value[0]).count().orderBy('count',ascending=False).show()
        print()

job
+-------------+-----+
|          job|count|
+-------------+-----+
|  blue-collar|10020|
|   management| 9458|
|   technician| 7597|
|       admin.| 5171|
|     services| 4154|
|      retired| 2264|
|self-employed| 1579|
| entrepreneur| 1487|
|   unemployed| 1303|
|    housemaid| 1240|
|      student|  938|
+-------------+-----+


marital
+--------+-----+
| marital|count|
+--------+-----+
| married|27214|
|  single|12790|
|divorced| 5207|
+--------+-----+


education
+---------+-----+
|education|count|
+---------+-----+
|secondary|25059|
| tertiary|13301|
|  primary| 6851|
+---------+-----+


default
+-------+-----+
|default|count|
+-------+-----+
|     no|44396|
|    yes|  815|
+-------+-----+


housing
+-------+-----+
|housing|count|
+-------+-----+
|    yes|25130|
|     no|20081|
+-------+-----+


loan
+----+-----+
|loan|count|
+----+-----+
|  no|37967|
| yes| 7244|
+----+-----+


y
+---+-----+
|  y|count|
+---+-----+
| no|39922|
|yes| 5289|
+---+-----+




In [0]:
label_encoder = F.udf(lambda value: 1 if value == 'yes' else 0)
features = ['default','housing','loan','y']

for feature in features:
    df = df.withColumn(f'{feature}_encoded',label_encoder(F.col(feature)))

In [0]:
df.show(5)

+---+------------+-------+---------+-------+-------+-------+----+-----------+--------+--------+-----+--------+---+---------------+---------------+------------+---------+
|age|         job|marital|education|default|balance|housing|loan|day_of_week|duration|campaign|pdays|previous|  y|default_encoded|housing_encoded|loan_encoded|y_encoded|
+---+------------+-------+---------+-------+-------+-------+----+-----------+--------+--------+-----+--------+---+---------------+---------------+------------+---------+
| 58|  management|married| tertiary|     no|   2143|    yes|  no|          5|     261|       1|   -1|       0| no|              0|              1|           0|        0|
| 44|  technician| single|secondary|     no|     29|    yes|  no|          5|     151|       1|   -1|       0| no|              0|              1|           0|        0|
| 33|entrepreneur|married|secondary|     no|      2|    yes| yes|          5|      76|       1|   -1|       0| no|              0|              1|    

In [0]:
df = df.withColumn('y', F.col('y_encoded').cast(IntegerType()))\
  .withColumn('loan', F.col('loan_encoded').cast(IntegerType()))\
  .withColumn('housing', F.col('housing_encoded').cast(IntegerType()))\
  .withColumn('default', F.col('default_encoded').cast(IntegerType()))

In [0]:
df = df.drop(*['y_encoded','loan_encoded','housing_encoded','default_encoded'])

In [0]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- job: string (nullable = false)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = false)
 |-- default: integer (nullable = true)
 |-- balance: long (nullable = true)
 |-- housing: integer (nullable = true)
 |-- loan: integer (nullable = true)
 |-- day_of_week: long (nullable = true)
 |-- duration: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- pdays: long (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: integer (nullable = true)



In [0]:
df.show(5)

+---+------------+-------+---------+-------+-------+-------+----+-----------+--------+--------+-----+--------+---+
|age|         job|marital|education|default|balance|housing|loan|day_of_week|duration|campaign|pdays|previous|  y|
+---+------------+-------+---------+-------+-------+-------+----+-----------+--------+--------+-----+--------+---+
| 58|  management|married| tertiary|      0|   2143|      1|   0|          5|     261|       1|   -1|       0|  0|
| 44|  technician| single|secondary|      0|     29|      1|   0|          5|     151|       1|   -1|       0|  0|
| 33|entrepreneur|married|secondary|      0|      2|      1|   1|          5|      76|       1|   -1|       0|  0|
| 47| blue-collar|married|secondary|      0|   1506|      1|   0|          5|      92|       1|   -1|       0|  0|
| 33| blue-collar| single|secondary|      0|      1|      0|   0|          5|     198|       1|   -1|       0|  0|
+---+------------+-------+---------+-------+-------+-------+----+-----------+---

In [0]:
job_string_indexer = StringIndexer(inputCol='job',outputCol='job_labeled')
marital_string_indexer = StringIndexer(inputCol='marital',outputCol='marital_labeled')
education_string_indexer = StringIndexer(inputCol='education',outputCol='education_labeled')

job_ohe_indexer = OneHotEncoder(inputCol='job_labeled',outputCol='job_vec')
marital_ohe_indexer = OneHotEncoder(inputCol='marital_labeled',outputCol='marital_vec')
education_ohe_indexer = OneHotEncoder(inputCol='education_labeled',outputCol='education_vec')

In [0]:
train,test = df.randomSplit([0.7,0.3])

In [0]:
assembler = VectorAssembler(inputCols=['age','job_vec','marital_vec','education_vec','default','balance','housing',
 'loan','day_of_week','duration','campaign','pdays','previous','y'],outputCol='features')

min_max_scaler = MinMaxScaler(inputCol='features',outputCol='scaled_features')

lr_reg = LogisticRegression(featuresCol='scaled_features',labelCol='y')

lr_reg_pipe = Pipeline(stages=[
    job_string_indexer,
    job_ohe_indexer,
    marital_string_indexer,
    marital_ohe_indexer,
    education_string_indexer,
    education_ohe_indexer,
    assembler,
    min_max_scaler,
    lr_reg
])

lr_reg_pipe = lr_reg_pipe.fit(train)
test_pred = lr_reg_pipe.transform(test)

In [0]:
test_pred.select(['y','rawPrediction','probability','prediction']).show(5)

+---+--------------------+--------------------+----------+
|  y|       rawPrediction|         probability|prediction|
+---+--------------------+--------------------+----------+
|  0|[19.0327466509890...|[0.99999999457770...|       0.0|
|  0|[19.2635535579115...|[0.99999999569527...|       0.0|
|  0|[19.3606606037575...|[0.99999999609364...|       0.0|
|  0|[19.8539024189177...|[0.99999999761460...|       0.0|
|  0|[19.6660646295139...|[0.99999999712169...|       0.0|
+---+--------------------+--------------------+----------+
only showing top 5 rows



In [0]:
f1_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='y',metricName='f1')
roc_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='y',metricName='areaUnderROC')

In [0]:
eval_data = test_pred.select(['y','rawPrediction','probability','prediction'])
print(f'Roc-auc : {roc_eval.evaluate(eval_data)}')

Roc-auc : 0.9999988820984353
