In [1]:
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that runs locally with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Load Data From HDFS")
    .getOrCreate()
)

In [2]:
# Display the active SparkSession object as this cell's output.
spark

In [3]:
# Load the bank dataset (bank-additional.parq), stored in Parquet format, from HDFS.

bank_df = spark.read.parquet("hdfs://localhost:9820/in_proyek2/bank-additional.parq")
# Alternative: read the same file through a relative path instead of the HDFS URI.
#bank_df = spark.read.parquet("bank-additional.parq")
# Preview the first five rows, then print the column names and types.
bank_df.show(5)
bank_df.printSchema()

+---+---------+-------+-----------+-------+----+---------+-----+-----------+--------+--------+--------+-----------+---+
|age|      job|marital|  education|housing|loan|  contact|month|day_of_week|duration|campaign|previous|   poutcome|  y|
+---+---------+-------+-----------+-------+----+---------+-----+-----------+--------+--------+--------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|  no|telephone|  may|        mon|     261|       1|       0|nonexistent| no|
| 57| services|married|high.school|     no|  no|telephone|  may|        mon|     149|       1|       0|nonexistent| no|
| 37| services|married|high.school|    yes|  no|telephone|  may|        mon|     226|       1|       0|nonexistent| no|
| 40|   admin.|married|   basic.6y|     no|  no|telephone|  may|        mon|     151|       1|       0|nonexistent| no|
| 56| services|married|high.school|     no| yes|telephone|  may|        mon|     307|       1|       0|nonexistent| no|
+---+---------+-------+-----------+-----

### Importing the required libraries

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean, col, split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler, VectorIndexer
from pyspark.ml.feature import QuantileDiscretizer, OneHotEncoder

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [5]:
# Total number of rows in bank_df; each row is reported as one customer.
clients_count = bank_df.count()
print("Number of customers is {}".format(clients_count))

Number of customers is 41166


In [6]:
# Number of customers who subscribed to a term deposit vs. those who did not,
# i.e. the row count for each value of the target column "y".
groupBy_clients = bank_df.groupBy("y").count()
groupBy_clients.show()

+---+-----+
|  y|count|
+---+-----+
| no|36526|
|yes| 4640|
+---+-----+



In [7]:
# Summary statistics for the columns whose Spark dtype is exactly 'int'.
bank_df.describe(
    [name for name, dtype in bank_df.dtypes if dtype == 'int']
).show()

+-------+------------------+-----------------+-----------------+-------------------+
|summary|               age|         duration|         campaign|           previous|
+-------+------------------+-----------------+-----------------+-------------------+
|  count|             41166|            41166|            41166|              41166|
|   mean| 40.02356313462566|258.3929456347471| 2.54875382597289|0.17305543409609872|
| stddev|10.421897109457703|259.3029641183626|2.645304359228272|0.49501715074689073|
|    min|                17|                0|                1|                  0|
|    max|                98|             4918|               32|                  7|
+-------+------------------+-----------------+-----------------+-------------------+



### Preparing Data for Machine Learning

In [8]:
def get_dummy(df, categoricalCols, continuousCols, labelCol):
    """One-hot encode the categorical columns and assemble an ML-ready frame.

    Builds and fits a single Pipeline on ``df`` made of, in order: one
    StringIndexer per categorical column, one OneHotEncoder per indexed
    column, a VectorAssembler producing ``features`` (the encoded categorical
    vectors followed by the continuous columns), and a StringIndexer that
    maps ``labelCol`` to ``indexedLabel``.

    Args:
        df: Input Spark DataFrame containing every referenced column.
        categoricalCols: Names of the columns to index and one-hot encode.
            Any iterable of names is accepted.
        continuousCols: Names of the columns appended to the feature vector
            unchanged. Any iterable of names is accepted.
        labelCol: Name of the target column.

    Returns:
        A tuple ``(data, label_indexer_model)``. ``data`` holds the columns
        ``features``, ``indexedLabel`` and ``label`` (a copy of ``labelCol``).
        ``label_indexer_model`` is the fitted label StringIndexer stage taken
        from the pipeline; its ``labels`` attribute gives the label strings
        behind the ``indexedLabel`` values.
    """
    # Materialise the name collections so tuples/generators work too
    # (the assembler input below is built by list concatenation).
    categoricalCols = list(categoricalCols)
    continuousCols = list(continuousCols)

    indexers = [
        StringIndexer(inputCol=name, outputCol="{0}_indexed".format(name))
        for name in categoricalCols
    ]
    encoders = [
        OneHotEncoder(
            inputCol=idx.getOutputCol(),
            outputCol="{0}_encoded".format(idx.getOutputCol()),
        )
        for idx in indexers
    ]
    assembler = VectorAssembler(
        inputCols=[enc.getOutputCol() for enc in encoders] + continuousCols,
        outputCol="features",
    )
    label_indexer = StringIndexer(inputCol=labelCol, outputCol='indexedLabel')

    pipeline = Pipeline(stages=indexers + encoders + [assembler, label_indexer])
    model = pipeline.fit(df)

    # Keep the original label strings next to their numeric index.
    data = model.transform(df).withColumn('label', col(labelCol))

    # The last pipeline stage is the fitted label indexer that produced
    # 'indexedLabel'. Returning it avoids fitting a second StringIndexer
    # (an extra pass over the data) just to read the label strings.
    label_indexer_model = model.stages[-1]

    return data.select('features', 'indexedLabel', 'label'), label_indexer_model

In [9]:
# Encode the raw columns into one "features" vector plus the label columns.
# NOTE(review): this rebinds bank_df to the transformed frame, which no longer
# has the raw columns, so the cell cannot be re-run without reloading the data.
# NOTE(review): the loaded data also has a 'contact' column that is not listed
# below - confirm that leaving it out of the features is intentional.
categoricalColumns = [
    'job', 'marital', 'education', 'housing',
    'loan', 'month', 'day_of_week', 'poutcome',
]
numericCols = ['age', 'duration', 'campaign', 'previous']
bank_df, labelindexer = get_dummy(
    df=bank_df,
    categoricalCols=categoricalColumns,
    continuousCols=numericCols,
    labelCol='y',
)
bank_df.show(5)

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(42,[8,11,18,22,2...|         0.0|   no|
|(42,[3,11,15,22,2...|         0.0|   no|
|(42,[3,11,15,21,2...|         0.0|   no|
|(42,[0,11,19,22,2...|         0.0|   no|
|(42,[3,11,15,23,3...|         0.0|   no|
+--------------------+------------+-----+
only showing top 5 rows



In [10]:
# Fit the VectorIndexer on the complete bank_df frame (i.e. before it is split
# into training and test sets).
featureIndexer = (
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
    .fit(bank_df)
)

# Preview the extra "indexedFeatures" column the fitted indexer adds.
featureIndexer.transform(bank_df).show(5)

+--------------------+------------+-----+--------------------+
|            features|indexedLabel|label|     indexedFeatures|
+--------------------+------------+-----+--------------------+
|(42,[8,11,18,22,2...|         0.0|   no|(42,[8,11,18,22,2...|
|(42,[3,11,15,22,2...|         0.0|   no|(42,[3,11,15,22,2...|
|(42,[3,11,15,21,2...|         0.0|   no|(42,[3,11,15,21,2...|
|(42,[0,11,19,22,2...|         0.0|   no|(42,[0,11,19,22,2...|
|(42,[3,11,15,23,3...|         0.0|   no|(42,[3,11,15,23,3...|
+--------------------+------------+-----+--------------------+
only showing top 5 rows



In [11]:
# Show the first five rows without truncating the feature vectors.
bank_df.show(5, truncate=False)

+---------------------------------------------------------------------------------------+------------+-----+
|features                                                                               |indexedLabel|label|
+---------------------------------------------------------------------------------------+------------+-----+
|(42,[8,11,18,22,23,33,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,56.0,261.0,1.0])       |0.0         |no   |
|(42,[3,11,15,22,23,33,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,57.0,149.0,1.0])       |0.0         |no   |
|(42,[3,11,15,21,22,23,33,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,37.0,226.0,1.0])|0.0         |no   |
|(42,[0,11,19,22,23,33,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,40.0,151.0,1.0])       |0.0         |no   |
|(42,[3,11,15,23,33,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,56.0,307.0,1.0])              |0.0         |no   |
+---------------------------------------------------------------------------------------+------------+-----+
only showing top 5 

### Data Splitting

In [12]:
# Randomly split the rows with weights 0.7 (training) and 0.3 (test); the seed is fixed.
(trainingData, testData) = bank_df.randomSplit([0.7, 0.3], seed=202105)
# Report how many rows ended up in each part.
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 28793
Test Dataset Count: 12373


In [13]:
# Print five untruncated rows from each split so the full feature vectors are visible.
print("The first 5 samples of the Training Dataset:")
trainingData.show(5, truncate=False)
print("The first 5 samples of the Test Dataset:")
testData.show(5, truncate=False)

The first 5 samples of the Training Dataset:
+---------------------------------------------------------------------------------------+------------+-----+
|features                                                                               |indexedLabel|label|
+---------------------------------------------------------------------------------------+------------+-----+
|(42,[0,11,14,21,22,23,32,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,25.0,251.0,4.0])|0.0         |no   |
|(42,[0,11,14,21,22,23,32,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.0,203.0,1.0])|0.0         |no   |
|(42,[0,11,14,21,22,23,32,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.0,302.0,1.0])|0.0         |no   |
|(42,[0,11,14,21,22,23,32,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0,196.0,2.0])|0.0         |no   |
|(42,[0,11,14,21,22,23,32,36,38,39,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,31.0,102.0,4.0])|0.0         |no   |
+----------------------------------------------------------------------------------

## Fit Models

### Logistic Regression

In [14]:
# Alternative: use the line below to train on the "indexedFeatures" column
# instead of the "features" column.
#lr = LogisticRegression(labelCol="indexedLabel", featuresCol="indexedFeatures")
# Logistic regression that reads the "features" vector and the numeric "indexedLabel".
lr = LogisticRegression(labelCol="indexedLabel", featuresCol="features")

#### Pipeline Architecture

In [15]:
# Map the numeric "prediction" column back to the original label strings.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelindexer.labels,
)

# Chain the fitted feature indexer, the classifier and the label converter.
pipeline = Pipeline(stages=[featureIndexer, lr, labelConverter])

# Fit the whole pipeline on the training split.
lrModel = pipeline.fit(trainingData)

#### Predictions

In [16]:
# Apply the fitted pipeline to the test split with transform() and preview
# the first five rows of the result.
predictions = lrModel.transform(testData)
predictions.show(5)

+--------------------+------------+-----+--------------------+--------------------+--------------------+----------+--------------+
|            features|indexedLabel|label|     indexedFeatures|       rawPrediction|         probability|prediction|predictedLabel|
+--------------------+------------+-----+--------------------+--------------------+--------------------+----------+--------------+
|(42,[0,11,14,21,2...|         1.0|  yes|(42,[0,11,14,21,2...|[0.33983678077516...|[0.58415087448952...|       0.0|            no|
|(42,[0,11,14,21,2...|         0.0|   no|(42,[0,11,14,21,2...|[3.85697210202566...|[0.97930542743745...|       0.0|            no|
|(42,[0,11,14,21,2...|         0.0|   no|(42,[0,11,14,21,2...|[3.95354789840121...|[0.98117468302368...|       0.0|            no|
|(42,[0,11,14,21,2...|         0.0|   no|(42,[0,11,14,21,2...|[3.91146234412612...|[0.98038137615657...|       0.0|            no|
|(42,[0,11,14,21,2...|         0.0|   no|(42,[0,11,14,21,2...|[3.65748506266817...|

In [17]:
# Narrow the preview to the inputs, the true label, the class probabilities
# and the predicted label string.
predictions.select("features", "label", "probability", "predictedLabel").show(5)

+--------------------+-----+--------------------+--------------+
|            features|label|         probability|predictedLabel|
+--------------------+-----+--------------------+--------------+
|(42,[0,11,14,21,2...|  yes|[0.58415087448952...|            no|
|(42,[0,11,14,21,2...|   no|[0.97930542743745...|            no|
|(42,[0,11,14,21,2...|   no|[0.98117468302368...|            no|
|(42,[0,11,14,21,2...|   no|[0.98038137615657...|            no|
|(42,[0,11,14,21,2...|   no|[0.97485145497825...|            no|
+--------------------+-----+--------------------+--------------+
only showing top 5 rows



#### Compute the model accuracy

In [18]:
# Keep only the true and the predicted label strings, then show how many test
# rows fall under each true label and under each predicted label.
cm = predictions.select("label", "predictedLabel")
cm.groupBy("label").agg({"label": "count"}).show()
cm.groupBy("predictedLabel").agg({"predictedLabel": "count"}).show()

+-----+------------+
|label|count(label)|
+-----+------------+
|   no|       10990|
|  yes|        1383|
+-----+------------+

+--------------+---------------------+
|predictedLabel|count(predictedLabel)|
+--------------+---------------------+
|            no|                11631|
|           yes|                  742|
+--------------+---------------------+



In [19]:
# Row count for every (true label, predicted label) combination,
# i.e. the cells of the confusion matrix.
predictions.groupBy('label', 'predictedLabel').count().show()

+-----+--------------+-----+
|label|predictedLabel|count|
+-----+--------------+-----+
|   no|            no|10723|
|   no|           yes|  267|
|  yes|           yes|  475|
|  yes|            no|  908|
+-----+--------------+-----+



In [20]:
# Accuracy computed by hand: rows whose predicted label equals the true label,
# divided by the total number of rows in cm.
print("The Accuracy for test set is {}".format(cm.filter(cm.label == cm.predictedLabel).count()/cm.count()))

The Accuracy for test set is 0.9050351571971228


In [21]:
# Compute the accuracy metric with Spark's evaluator, comparing the numeric
# "indexedLabel" and "prediction" columns.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
print("The Accuracy for test set is {}".format(evaluator.evaluate(predictions)))

The Accuracy for test set is 0.9050351571971228
