In [1]:
%%time

import pathlib
from os.path import expanduser, join, abspath

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
from pyspark.sql.types import *
from pyspark.sql.window import Window


# Directory used as the Spark SQL warehouse (where table data is written).
warehouse_location = abspath('/home/jovyan/work/hive-db/spark-warehouse')

# (key, value) pair that points Derby's system home at the hive-db directory,
# so the embedded metastore database is kept there as well.
# Reference for these settings:
# https://www.ibm.com/support/knowledgecenter/en/SS3H8V_1.1.0/com.ibm.izoda.v1r1.azka100/topics/azkic_t_updconfigfiles.htm
conf_metastore_db = (
    "spark.driver.extraJavaOptions",
    "-Dderby.system.home=/home/jovyan/work/hive-db",
)

# Build (or reuse) a Hive-enabled session with the settings above.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config(*conf_metastore_db)
    .enableHiveSupport()
    .appName("local-test")
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

CPU times: user 466 ms, sys: 211 ms, total: 678 ms
Wall time: 6.61 s


In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import dask.dataframe as dd

- テストデータ用意

In [3]:
# Load the iris sample dataset through seaborn.
iris = sns.load_dataset('iris')
# Convert it to a Spark DataFrame; `df_iris` is the test data used by later cells.
df_iris = spark.createDataFrame(iris)
# Print the first rows as a quick sanity check.
df_iris.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [4]:
# Connection test: create a scratch database (if it is missing) and then
# list every database the session can see.
spark.sql("""
create database if not exists tmp
""")

databases_df = spark.sql("""
show databases
""")
databases_df.show()

+------------+
|databaseName|
+------------+
|     default|
|         tmp|
+------------+



In [5]:
%%time
# Save the DataFrame as a persistent table through the DataFrame API.
# The default save mode raises an error when the table already exists, and the
# metastore outlives the kernel, so a plain saveAsTable() fails on every re-run.
# mode("overwrite") replaces the table instead, which keeps this cell
# re-runnable (Restart Kernel -> Run All).
df_iris.write.mode("overwrite").saveAsTable("tmp.iris")

# Read the table back with SQL to confirm that it was written.
spark.sql("""
select * from tmp.iris
""").show()

+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   species|
+------------+-----------+------------+-----------+----------+
|         6.8|        2.8|         4.8|        1.4|versicolor|
|         6.7|        3.0|         5.0|        1.7|versicolor|
|         6.0|        2.9|         4.5|        1.5|versicolor|
|         5.7|        2.6|         3.5|        1.0|versicolor|
|         5.5|        2.4|         3.8|        1.1|versicolor|
|         5.5|        2.4|         3.7|        1.0|versicolor|
|         5.8|        2.7|         3.9|        1.2|versicolor|
|         6.0|        2.7|         5.1|        1.6|versicolor|
|         5.4|        3.0|         4.5|        1.5|versicolor|
|         6.0|        3.4|         4.5|        1.6|versicolor|
|         6.7|        3.1|         4.7|        1.5|versicolor|
|         6.3|        2.3|         4.4|        1.3|versicolor|
|         5.6|        3.0|         4.1|        1.3|vers

- 指定した場所（`/home/jovyan/work/hive-db`）に、Hiveのウェアハウス（`spark-warehouse`）とDerbyの`metastore_db`の両方が作成されていることを確認
    - https://www.ibm.com/support/knowledgecenter/en/SS3H8V_1.1.0/com.ibm.izoda.v1r1.azka100/topics/azkic_t_updconfigfiles.htm

- データベース生成

In [6]:
# Create the `sns` database without an explicit location.
create_sns_db = """
create database if not exists sns
"""
spark.sql(create_sns_db)

DataFrame[]

- `/path/to/spark-warehouse/sns.db/`が生成される

In [7]:
# Create the `ext` database at an explicit location.
# The path is derived from `warehouse_location` (defined with the session
# settings) rather than repeated as a literal, so the two cannot drift apart.
ext_db_location = join(warehouse_location, 'ext.db')
spark.sql("""
create database if not exists ext
location '{}'
""".format(ext_db_location))

DataFrame[]

In [8]:
# Show the metadata of the `ext` database (name, description, location)
# as a pandas DataFrame so the notebook renders it as a table.
ext_description = spark.sql("""
describe database ext
""")
ext_description.toPandas()



Unnamed: 0,database_description_item,database_description_value
0,Database Name,ext
1,Description,
2,Location,file:/home/jovyan/work/hive-db/spark-warehouse...


In [9]:
# List all databases again to confirm the newly created ones are registered.
databases_df = spark.sql("""
show databases
""")
databases_df.show()

+------------+
|databaseName|
+------------+
|     default|
|         ext|
|         sns|
|         tmp|
+------------+



In [17]:
# To delete the `ext` database, run: spark.sql("drop database ext")

In [13]:
# Create sns.iris_3 from tmp.iris with CREATE TABLE ... AS SELECT,
# stored as ORC with zlib compression. Skipped if the table already exists.
spark.sql("""
create table if not exists sns.iris_3
using orc
options ("compression"="zlib")
as select * from tmp.iris
""")

# Read the new table back and print the first rows.
iris_3_rows = spark.sql("""
select * from sns.iris_3
""")
iris_3_rows.show()

+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   species|
+------------+-----------+------------+-----------+----------+
|         6.8|        2.8|         4.8|        1.4|versicolor|
|         6.7|        3.0|         5.0|        1.7|versicolor|
|         6.0|        2.9|         4.5|        1.5|versicolor|
|         5.7|        2.6|         3.5|        1.0|versicolor|
|         5.5|        2.4|         3.8|        1.1|versicolor|
|         5.5|        2.4|         3.7|        1.0|versicolor|
|         5.8|        2.7|         3.9|        1.2|versicolor|
|         6.0|        2.7|         5.1|        1.6|versicolor|
|         5.4|        3.0|         4.5|        1.5|versicolor|
|         6.0|        3.4|         4.5|        1.6|versicolor|
|         6.7|        3.1|         4.7|        1.5|versicolor|
|         6.3|        2.3|         4.4|        1.3|versicolor|
|         5.6|        3.0|         4.1|        1.3|vers

In [15]:
# Print the column names and data types of sns.iris_3.
iris_3_schema = spark.sql("""
describe table sns.iris_3
""")
iris_3_schema.show()

+------------+---------+-------+
|    col_name|data_type|comment|
+------------+---------+-------+
|sepal_length|   double|   null|
| sepal_width|   double|   null|
|petal_length|   double|   null|
| petal_width|   double|   null|
|     species|   string|   null|
+------------+---------+-------+

