# Data Segregation

## Load Parquet Hadoop File Format from HDFS

In [26]:
# Read the bank-marketing dataset, stored in Parquet format, from HDFS.
# A local copy ("bank-additional.parq") can be used instead by changing the
# path below.
# NOTE(review): `spark` is not created in the visible cells - presumably a
# SparkSession from an earlier cell; confirm before running on a fresh kernel.
parquet_uri = "hdfs://localhost:9820/in_proyek2/bank-additional.parq"
df = spark.read.parquet(parquet_uri)

# Quick sanity check: first five rows, then the column types.
df.show(n=5)
df.printSchema()

+---+---------+-------+-----------+-------+----+---------+-----+-----------+--------+--------+--------+-----------+---+
|age|      job|marital|  education|housing|loan|  contact|month|day_of_week|duration|campaign|previous|   poutcome|  y|
+---+---------+-------+-----------+-------+----+---------+-----+-----------+--------+--------+--------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|  no|telephone|  may|        mon|     261|       1|       0|nonexistent| no|
| 57| services|married|high.school|     no|  no|telephone|  may|        mon|     149|       1|       0|nonexistent| no|
| 37| services|married|high.school|    yes|  no|telephone|  may|        mon|     226|       1|       0|nonexistent| no|
| 40|   admin.|married|   basic.6y|     no|  no|telephone|  may|        mon|     151|       1|       0|nonexistent| no|
| 56| services|married|high.school|     no| yes|telephone|  may|        mon|     307|       1|       0|nonexistent| no|
+---+---------+-------+-----------+-----

In [27]:
# Project the frame onto the listed columns; compared with the columns shown
# in the load output, this leaves out `contact`.
df = df.select(
    'age', 'job', 'marital', 'education', 'housing', 'loan',
    'month', 'day_of_week', 'duration', 'campaign', 'previous',
    'poutcome', 'y',
)

# Column names of the projected frame, kept for later reference.
cols = df.columns

df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [28]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean, col, split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler, VectorIndexer
from pyspark.ml.feature import QuantileDiscretizer, OneHotEncoder, OneHotEncoder

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [29]:
# Print the schema of the projected frame again (same call as in the
# column-selection cell); `df` is not modified between the two calls.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



## Data Splitting

In [30]:
# Random split with weights 0.7 / 0.3; the seed is fixed at 10.
trainingData, testData = df.randomSplit([0.7, 0.3], seed=10)

# Report how many rows ended up in each split.
print("Training Dataset Count: ", trainingData.count(), sep="")
print("Test Dataset Count: ", testData.count(), sep="")

Training Dataset Count: 28721
Test Dataset Count: 12445


In [31]:
# Show five rows of each split with column truncation disabled.
print("The first 5 samples of the Training Dataset:")
trainingData.show(n=5, truncate=False)

print("The first 5 samples of the Test Dataset:")
testData.show(n=5, truncate=False)

The first 5 samples of the Training Dataset:
+---+-------+-------+-----------+-------+----+-----+-----------+--------+--------+--------+-----------+---+
|age|job    |marital|education  |housing|loan|month|day_of_week|duration|campaign|previous|poutcome   |y  |
+---+-------+-------+-----------+-------+----+-----+-----------+--------+--------+--------+-----------+---+
|17 |student|single |basic.9y   |yes    |no  |aug  |fri        |92      |3       |2       |success    |no |
|17 |student|single |basic.9y   |yes    |yes |aug  |fri        |498     |2       |1       |failure    |yes|
|17 |student|single |unknown    |yes    |no  |aug  |wed        |432     |3       |2       |success    |no |
|18 |student|single |basic.4y   |no     |no  |apr  |thu        |108     |1       |0       |nonexistent|no |
|18 |student|single |high.school|no     |no  |may  |fri        |271     |1       |1       |failure    |yes|
+---+-------+-------+-----------+-------+----+-----+-----------+--------+--------+--------+

# Model Training: Logistic Regression

In [None]:
def get_dummy(df, categoricalCols, continuousCols, labelCol):
    """Encode a raw DataFrame into the feature/label columns used for modelling.

    Every column in ``categoricalCols`` is string-indexed and then one-hot
    encoded. The encoded vectors are assembled, followed by the columns in
    ``continuousCols``, into a single ``features`` vector, and ``labelCol`` is
    string-indexed into ``indexedLabel``. All stages are fitted on ``df``
    itself and then applied to it.

    Args:
        df: Spark DataFrame containing every referenced column.
        categoricalCols: Names of the columns to index and one-hot encode.
        continuousCols: Names of the columns appended to the feature vector
            as they are. Any iterable of names is accepted.
        labelCol: Name of the target column.

    Returns:
        A ``(data, label_indexer)`` tuple. ``data`` holds the columns
        ``features``, ``indexedLabel`` and ``label`` (a copy of ``labelCol``).
        ``label_indexer`` is the fitted label ``StringIndexerModel``; its
        ``labels`` attribute maps the indices in ``indexedLabel`` back to the
        original label values.
    """
    indexers = [
        StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
        for c in categoricalCols
    ]

    encoders = [
        OneHotEncoder(inputCol=indexer.getOutputCol(),
                      outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in indexers
    ]

    # list(...) lets callers pass a tuple or any other iterable of names;
    # list + tuple would raise a TypeError.
    assembler = VectorAssembler(
        inputCols=[encoder.getOutputCol() for encoder in encoders]
        + list(continuousCols),
        outputCol="features",
    )

    label_indexer = StringIndexer(inputCol=labelCol, outputCol='indexedLabel')

    pipeline = Pipeline(stages=indexers + encoders + [assembler, label_indexer])

    model = pipeline.fit(df)
    data = model.transform(df).withColumn('label', col(labelCol))

    # The label indexer is the last pipeline stage, so its fitted counterpart
    # is the last stage of the fitted model. Returning it avoids fitting a
    # second StringIndexer on the same values and guarantees the returned
    # labels are exactly the ones behind `indexedLabel`.
    fitted_label_indexer = model.stages[-1]

    return data.select('features', 'indexedLabel', 'label'), fitted_label_indexer

In [9]:
# Logistic-regression estimator that reads the "features" vector column and
# the "indexedLabel" target column.
# To train on a column named "indexedFeatures" instead, use:
#   lr = LogisticRegression(labelCol="indexedLabel", featuresCol="indexedFeatures")
lr = LogisticRegression(featuresCol="features", labelCol="indexedLabel")

In [None]:
# Map the numeric "prediction" column back to the original label strings.
# NOTE(review): `labelindexer` is not defined in the visible cells - presumably
# the second value returned by get_dummy; confirm it exists before this cell.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelindexer.labels,
)

# Chain feature indexing, the classifier and the label conversion.
# NOTE(review): `featureIndexer` is not defined in the visible cells either.
pipeline = Pipeline(stages=[featureIndexer, lr, labelConverter])

# NOTE(review): `lr` reads "features" / "indexedLabel", which the raw
# `trainingData` shown earlier does not contain - confirm the split was
# encoded (e.g. with get_dummy) before fitting.
lrModel = pipeline.fit(trainingData)

In [None]:
# Apply the fitted pipeline to the test split and preview five scored rows.
predictions = lrModel.transform(testData)
predictions.show(n=5)

In [None]:
# Preview the model inputs next to the predicted probability and label.
# NOTE(review): "features" and "label" are columns produced by get_dummy, not
# present in the raw split shown earlier - confirm testData was encoded first.
predictions.select(
    "features", "label", "probability", "predictedLabel"
).show(n=5)