In [1]:
from pyspark.sql import SparkSession
import os

# Iceberg warehouse on the local filesystem, resolved under the current user's home directory.
home = os.path.expanduser("~")
warehouse = "file://" + os.path.join(home, "dev/RETrend/tmp/raw/iceberg/phase2_default")

# Session settings, applied in insertion order:
#   - pull the Iceberg runtime artifact (Spark 3.5 / Scala 2.12 build, version 1.5.2)
#   - enable the Iceberg SQL extensions
#   - register a catalog named `hive`, implemented by Iceberg's HiveCatalog and pointed
#     at the metastore URI and warehouse location below
#   - switch off the vectorized Parquet / Iceberg readers
#     NOTE(review): the reason for disabling vectorization is not visible here — confirm.
spark_conf = {
    "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.catalog.hive": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.hive.catalog-impl": "org.apache.iceberg.hive.HiveCatalog",
    "spark.sql.catalog.hive.uri": "thrift://localhost:9083",
    "spark.sql.catalog.hive.warehouse": warehouse,
    "spark.sql.parquet.enableVectorizedReader": "false",
    "spark.sql.iceberg.vectorization.enabled": "false",
}

builder = SparkSession.builder.appName("IcebergHiveCatalogPhase2")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = builder.getOrCreate()

25/06/29 16:31:24 WARN Utils: Your hostname, daves-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.30.1.27 instead (on interface en0)
25/06/29 16:31:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/dave/.ivy2/cache
The jars for the packages stored in: /Users/dave/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-da90cb60-9d01-4cc4-9027-91708f8d8661;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 in central
:: resolution report :: resolve 40ms :: artifacts dl 1ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	

:: loading settings :: url = jar:file:/usr/local/spark-3.5.6_2.12/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


25/06/29 16:31:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
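# Optional reset: uncomment to drop the phase2 namespace (and, via CASCADE, its tables) before re-running.
# spark.sql("DROP DATABASE IF EXISTS hive.phase2 CASCADE")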

In [3]:
# Create the phase2 namespace in the `hive` catalog; IF NOT EXISTS makes this a no-op
# when the namespace is already registered.
# The location is built from `home` rather than a hard-coded /Users/<name> path, so the
# notebook is not tied to one machine. It points at the same directory as the catalog
# warehouse configured when the session was created.
namespace_location = "file:" + os.path.join(home, "dev/RETrend/tmp/raw/iceberg/phase2_default")

spark.sql(f"""
CREATE DATABASE IF NOT EXISTS hive.phase2
LOCATION '{namespace_location}'
""")

DataFrame[]

In [4]:
# Show the namespace metadata as registered in the catalog, without truncating long values.
namespace_details = spark.sql("DESCRIBE DATABASE EXTENDED hive.phase2")
namespace_details.show(truncate=False)

+--------------+---------------------------------------------------------------------------------+
|info_name     |info_value                                                                       |
+--------------+---------------------------------------------------------------------------------+
|Catalog Name  |hive                                                                             |
|Namespace Name|phase2                                                                           |
|Location      |file:/Users/dave/dev/RETrend/tmp/raw/iceberg/phase2_default                      |
|Owner         |dave                                                                             |
|Properties    |((hive.metastore.database.owner,dave), (hive.metastore.database.owner-type,USER))|
+--------------+---------------------------------------------------------------------------------+



In [5]:
from pyspark.sql.types import *

# Directory containing the raw Parquet files
parquet_dir = os.path.join(home, "dev/RETrend/tmp/raw/parquet")

# trade_history schema, declared as (column name, Spark type) pairs.
# Every column is nullable.
trade_columns = [
    ("tradeType", StringType()),
    ("tradeYear", LongType()),
    ("tradeMonth", LongType()),
    ("tradeDate", LongType()),
    ("dealPrice", LongType()),
    ("floor", LongType()),
    ("representativeArea", DoubleType()),
    ("exclusiveArea", DoubleType()),
    ("formattedPrice", StringType()),
    ("formattedTradeYearMonth", StringType()),
    ("areaNo", LongType()),
    ("complexNo", LongType()),
    ("date", TimestampType()),
]
trade_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in trade_columns]
)

# Read every file matching trade_history_*.parquet using the explicit schema
# instead of letting Spark infer one.
trade_reader = spark.read.schema(trade_schema)
trade_df = trade_reader.parquet(f"{parquet_dir}/trade_history_*.parquet")

# Create the table: drop any existing definition first, then write the
# DataFrame out as an Iceberg table.
spark.sql("DROP TABLE IF EXISTS hive.phase2.trade_history")
trade_writer = trade_df.writeTo("hive.phase2.trade_history").using("iceberg")
trade_writer.createOrReplace()


                                                                                

In [6]:
# complex_list schema, declared as (column name, Spark type) pairs.
# Every column is nullable.
complex_columns = [
    ("complexNo", LongType()),
    ("complexName", StringType()),
    ("cortarNo", LongType()),
    ("realEstateTypeCode", StringType()),
    ("realEstateTypeName", StringType()),
    ("detailAddress", StringType()),
    ("latitude", DoubleType()),
    ("longitude", DoubleType()),
    ("totalHouseholdCount", LongType()),
    ("totalBuildingCount", LongType()),
    ("highFloor", LongType()),
    ("lowFloor", LongType()),
    ("useApproveYmd", StringType()),
    ("dealCount", LongType()),
    ("leaseCount", LongType()),
    ("rentCount", LongType()),
    ("shortTermRentCount", LongType()),
    ("isInterest", BooleanType()),
    ("cortarAddress", StringType()),
    ("tourExist", BooleanType()),
    ("eupmeandongCortarNo", LongType()),
    ("eupmeandongCortarName", StringType()),
]
complex_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in complex_columns]
)

# Read the single complex_list.parquet file using the explicit schema.
complex_reader = spark.read.schema(complex_schema)
complex_df = complex_reader.parquet(f"{parquet_dir}/complex_list.parquet")

# Drop any existing definition, then write the DataFrame out as the
# Iceberg table complex_info.
spark.sql("DROP TABLE IF EXISTS hive.phase2.complex_info")
complex_writer = complex_df.writeTo("hive.phase2.complex_info").using("iceberg")
complex_writer.createOrReplace()


In [7]:
# List the tables registered under the phase2 namespace.
phase2_tables = spark.sql("SHOW TABLES IN hive.phase2")
phase2_tables.show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|   phase2|trade_history|      false|
|   phase2| complex_info|      false|
+---------+-------------+-----------+



In [8]:
# The 20 most recent trades (ordered by `date`, newest first), each enriched with the
# name and coordinates of its complex. The LEFT JOIN keeps trades whose complexNo has
# no matching row in complex_info; their complex columns come back as NULL.
# NOTE(review): many trades share the same `date`, so which 20 rows are returned is
# not guaranteed to be stable between runs — consider adding a tie-breaker column.
latest_trades_sql = """
  SELECT
    t.date,
    t.tradeType,
    t.dealPrice,
    t.floor,
    t.representativeArea,
    t.exclusiveArea,
    t.formattedPrice,
    t.formattedTradeYearMonth,
    c.complexName,
    c.latitude,
    c.longitude
  FROM hive.phase2.trade_history t
  LEFT JOIN hive.phase2.complex_info c
    ON t.complexNo = c.complexNo
  ORDER BY t.date DESC
  LIMIT 20
"""

joined_df = spark.sql(latest_trades_sql)
joined_df.show(truncate=False)

+-------------------+---------+---------+-----+------------------+-------------+--------------+-----------------------+------------------------------+---------+----------+
|date               |tradeType|dealPrice|floor|representativeArea|exclusiveArea|formattedPrice|formattedTradeYearMonth|complexName                   |latitude |longitude |
+-------------------+---------+---------+-----+------------------+-------------+--------------+-----------------------+------------------------------+---------+----------+
|2025-06-27 09:00:00|A1       |56500    |16   |0.0               |0.0          |5억 6,500     |2025-06-27             |신원마을호반베르디움9단지     |37.661768|126.885058|
|2025-06-27 09:00:00|A1       |27600    |4    |0.0               |0.0          |2억 7,600     |2025-06-27             |푸른3단지동익미라벨           |37.710058|126.902901|
|2025-06-27 09:00:00|A1       |71800    |14   |0.0               |0.0          |7억 1,800     |2025-06-27             |DMC리슈빌더포레스트           |37.593011|126.888972|
