In [1]:
sc

In [2]:
spark

In [6]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession,HiveContext

In [7]:
sc.stop()

In [8]:
config = SparkConf().setAppName('SparkHiveSession').setMaster('local[4]')
sc = SparkContext.getOrCreate(conf=config)

In [9]:
sc

In [11]:
#spark Integration with Hive with Spark Session
spark = (SparkSession.builder.appName("pyspark-hive-integration")
         .config('spark.sql.warehouse.dir', '/user/hive/warehouse/')
         .enableHiveSupport().getOrCreate())

In [12]:
spark

In [14]:
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



In [16]:
spark.sql("drop database if exists bankking_db").show()

++
||
++
++



In [17]:
spark.sql("create database if not exists banking_db").show()

++
||
++
++



In [18]:
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|  banking_db|
|     default|
+------------+



In [19]:
spark.sql("use banking_db").show()

++
||
++
++



In [20]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [26]:
spark.sql("""
CREATE TABLE if not exists banking (age int, balance double,campaign double,contact string,day int,default string,
duration int,education varchar(50), housing varchar(10), job string, loan varchar(10), marital string, month varchar(30),
pdays double,poutcome string,previous int,y varchar(10))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
tblProperties("skip.header.line.count" = 1)""")

DataFrame[]

In [27]:
spark.sql("show tables").show()

+----------+---------+-----------+
|  database|tableName|isTemporary|
+----------+---------+-----------+
|banking_db|  banking|      false|
+----------+---------+-----------+



In [28]:
spark.sql("describe formatted banking").show()

+--------------------+----------+-------+
|            col_name| data_type|comment|
+--------------------+----------+-------+
|                 age|       int|   null|
|             balance|    double|   null|
|            campaign|    double|   null|
|             contact|    string|   null|
|                 day|       int|   null|
|             default|    string|   null|
|            duration|       int|   null|
|           education|    string|   null|
|             housing|    string|   null|
|                 job|    string|   null|
|                loan|    string|   null|
|             marital|    string|   null|
|               month|    string|   null|
|               pdays|    double|   null|
|            poutcome|    string|   null|
|            previous|       int|   null|
|                   y|    string|   null|
|                    |          |       |
|# Detailed Table ...|          |       |
|            Database|banking_db|       |
+--------------------+----------+-

### DDL method to load data into hive table

In [None]:
#spark.sql("""
#load data local inpath 'local file path/filename.csv' overwrite into table banking
#""")

### Create a spark DataFrame

In [29]:
from pyspark.sql.types import *

In [30]:
banking_customer_data = spark.read.json("file:///home/hadoop/Downloads/bank_edited.json", multiLine=True) 

In [32]:
banking_customer_data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [33]:
banking_customer_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



### Insert Records from Spark Temp Table to Hive Table

In [34]:
banking_customer_data.createOrReplaceTempView('banktable')

In [35]:
spark.sql("""
insert into table banking
select * from banktable 
""")

DataFrame[]

In [36]:
spark.sql("select * from banking").show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58| 2143.0|     1.0|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may| -1.0| unknown|       0| no|
| 44|   29.0|     1.0|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may| -1.0| unknown|       0| no|
| 33|    2.0|     1.0|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may| -1.0| unknown|       0| no|
| 47| 1506.0|     1.0|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may| -1.0| unknown|       0| no|
| 33|    1.0|     1.0|unknown|  5|     no|     198|  unknown|     no|     unknown| 