In [1]:
import pyspark
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that already exists.
# Master "local[2]" runs Spark locally with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Load Data")
    .getOrCreate()
)

In [2]:
# Show the SparkSession object as this cell's output.
spark

### Convert the CSV File to Parquet Format

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Explicit schema for every column of the CSV, given as (column name, Spark type class).
# All columns are declared nullable.
column_types = [
    ("age", IntegerType),
    ("job", StringType),
    ("marital", StringType),
    ("education", StringType),
    ("housing", StringType),
    ("loan", StringType),
    ("contact", StringType),
    ("month", StringType),
    ("day_of_week", StringType),
    ("duration", IntegerType),
    ("campaign", IntegerType),
    ("previous", IntegerType),
    ("poutcome", StringType),
    ("emp.var.rate", FloatType),
    ("cons.price.idx", FloatType),
    ("cons.conf.idx", FloatType),
    ("euribor3m", FloatType),
    ("nr.employed", FloatType),
    ("y", StringType),
]
schema = StructType(
    [StructField(column_name, column_type(), True) for column_name, column_type in column_types]
)

# Load the dataset with the schema above; the first line of the file is the header.
# The file is read from a relative path here. The HDFS location used as an alternative is
# hdfs://localhost:9820/proyek/bank-additional-clean.csv
bank_df = spark.read.csv("bank-additional-clean.csv", header=True, schema=schema)


In [4]:
# Preview the first 5 rows of the loaded DataFrame, then print its schema.
bank_df.show(5)
bank_df.printSchema()

+---+---------+-------+-----------+-------+----+---------+-----+-----------+--------+--------+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|      job|marital|  education|housing|loan|  contact|month|day_of_week|duration|campaign|previous|   poutcome|emp.var.rate|cons.price.idx|cons.conf.idx|euribor3m|nr.employed|  y|
+---+---------+-------+-----------+-------+----+---------+-----+-----------+--------+--------+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|  no|telephone|  may|        mon|     261|       1|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 57| services|married|high.school|     no|  no|telephone|  may|        mon|     149|       1|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 37| services|married|high.school|    yes|  no|telephone|  may|        mon|    

In [5]:
# Drop the five columns declared as FloatType in the schema; `bank` keeps the rest.
float_columns = [
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
]
bank = bank_df.drop(*float_columns)

In [6]:
# Print the schema of `bank` to confirm which columns remain after the drop.
bank.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [7]:
# Save the reduced DataFrame `bank` in Parquet format.
# The active call writes to a relative path. The HDFS variant of this step wrote
# `bank_df` to hdfs://localhost:9820/proyek/bank-additional.parq
#
# mode("overwrite") replaces any output left by a previous run. With the default
# save mode the write raises an error when the target path already exists, which
# makes this cell fail whenever the notebook is re-run.
bank.write.mode("overwrite").parquet('bank-additional.parq')

### Load the Parquet Dataset

In [9]:
# Read the Parquet dataset back in, preview 5 rows, and print its schema.
# The dataset is read from a relative path here. The HDFS location used as an alternative is
# hdfs://localhost:9820/proyek/bank-additional.parq
parquet_path = "bank-additional.parq"
df = spark.read.parquet(parquet_path)
df.show(5)
df.printSchema()

+---+---------+-------+-----------+-------+----+---------+-----+-----------+--------+--------+--------+-----------+---+
|age|      job|marital|  education|housing|loan|  contact|month|day_of_week|duration|campaign|previous|   poutcome|  y|
+---+---------+-------+-----------+-------+----+---------+-----+-----------+--------+--------+--------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|  no|telephone|  may|        mon|     261|       1|       0|nonexistent| no|
| 57| services|married|high.school|     no|  no|telephone|  may|        mon|     149|       1|       0|nonexistent| no|
| 37| services|married|high.school|    yes|  no|telephone|  may|        mon|     226|       1|       0|nonexistent| no|
| 40|   admin.|married|   basic.6y|     no|  no|telephone|  may|        mon|     151|       1|       0|nonexistent| no|
| 56| services|married|high.school|     no| yes|telephone|  may|        mon|     307|       1|       0|nonexistent| no|
+---+---------+-------+-----------+-----