In [1]:
import duckdb, glob, os

# ─── Environment setup ─────────────────────────────────────────
# Every input and output of this cell lives under ~/dev/RETrend/tmp/raw.
home = os.path.expanduser("~")
raw = os.path.join(home, "dev/RETrend/tmp/raw")

# Converted Parquet files go into a "parquet" subdirectory of the raw
# directory; it is created on demand and an existing directory is not an error.
parquet_dir = os.path.join(raw, "parquet")
os.makedirs(parquet_dir, exist_ok=True)

# DuckDB connection opened with default arguments; the COPY statements in
# this cell are executed on it.
con = duckdb.connect()

# ─── (A) trade_history_{complexNo}.csv → Parquet ───────────────
# Convert every per-complex trade-history CSV into one typed Parquet file.
# The complex number is not read from the CSV columns: it is parsed from the
# file name and written into every row as the `complexNo` column.
# sorted() makes the processing (and log) order deterministic; glob itself
# returns matches in arbitrary order.
for csv_fp in sorted(glob.glob(os.path.join(raw, "trade_history/", "trade_history_*.csv"))):
    fn        = os.path.basename(csv_fp)
    # "trade_history_<complexNo>.csv" → third "_"-separated token, minus extension.
    complexNo = int(fn.split("_")[2].split(".")[0])
    pq_fp     = os.path.join(parquet_dir, f"trade_history_{complexNo}.parquet")

    # Both paths are spliced into SQL string literals below, so double any
    # single quote; otherwise a path containing "'" would end the literal early.
    csv_sql = csv_fp.replace("'", "''")
    pq_sql  = pq_fp.replace("'", "''")

    # Explicit casts pin the Parquet column types instead of relying on what
    # read_csv_auto infers for each individual file.
    con.execute(f"""
    COPY (
      SELECT
        tradeType::VARCHAR                         AS tradeType,
        CAST(tradeYear       AS BIGINT)           AS tradeYear,
        CAST(tradeMonth      AS BIGINT)           AS tradeMonth,
        CAST(tradeDate       AS BIGINT)           AS tradeDate,
        CAST(dealPrice       AS BIGINT)           AS dealPrice,
        CAST(floor           AS BIGINT)           AS floor,
        CAST(representativeArea AS DOUBLE)         AS representativeArea,
        CAST(exclusiveArea   AS DOUBLE)            AS exclusiveArea,
        formattedPrice::VARCHAR                     AS formattedPrice,
        formattedTradeYearMonth::VARCHAR            AS formattedTradeYearMonth,
        CAST(areaNo         AS BIGINT)            AS areaNo,
        --deleteYn::VARCHAR                           AS deleteYn,
        CAST({complexNo}    AS BIGINT)             AS complexNo,
        -- 날짜 컬럼: "YYYY-MM-DD 00:00:00" TIMESTAMP
        CAST(
          lpad(tradeYear::VARCHAR,4,'0') || '-' ||
          lpad(tradeMonth::VARCHAR,2,'0') || '-' ||
          lpad(tradeDate::VARCHAR,2,'0')
          AS TIMESTAMP
        ) AS date
      FROM read_csv_auto('{csv_sql}')
    )
    TO '{pq_sql}' (FORMAT PARQUET, COMPRESSION 'snappy');
    """)
    print("✅ CSV→Parquet:", fn)

# ─── (B) complex_list.csv → Parquet ────────────────────────────
# Convert the single complex master CSV into a typed Parquet file, then close
# the DuckDB connection, which is not used after this step in this cell.
complex_csv = os.path.join(raw, "complex_list.csv")
complex_pq  = os.path.join(parquet_dir, "complex_list.parquet")

# Both paths are spliced into SQL string literals below, so double any single
# quote; otherwise a path containing "'" would end the literal early.
complex_csv_sql = complex_csv.replace("'", "''")
complex_pq_sql  = complex_pq.replace("'", "''")

try:
    # Explicit casts pin the Parquet column types instead of relying on what
    # read_csv_auto infers.
    con.execute(f"""
    COPY (
      SELECT
        CAST(complexNo              AS BIGINT)  AS complexNo,
        complexName::VARCHAR                       AS complexName,
        CAST(cortarNo               AS BIGINT)    AS cortarNo,
        realEstateTypeCode::VARCHAR                AS realEstateTypeCode,
        realEstateTypeName::VARCHAR                AS realEstateTypeName,
        detailAddress::VARCHAR                     AS detailAddress,
        CAST(latitude              AS DOUBLE)     AS latitude,
        CAST(longitude             AS DOUBLE)     AS longitude,
        CAST(totalHouseholdCount   AS BIGINT)    AS totalHouseholdCount,
        CAST(totalBuildingCount    AS BIGINT)    AS totalBuildingCount,
        CAST(highFloor             AS BIGINT)    AS highFloor,
        CAST(lowFloor              AS BIGINT)    AS lowFloor,
        useApproveYmd::VARCHAR                     AS useApproveYmd,
        CAST(dealCount             AS BIGINT)    AS dealCount,
        CAST(leaseCount            AS BIGINT)    AS leaseCount,
        CAST(rentCount             AS BIGINT)    AS rentCount,
        CAST(shortTermRentCount    AS BIGINT)    AS shortTermRentCount,
        CAST(isInterest            AS BOOLEAN)    AS isInterest,
        cortarAddress::VARCHAR                      AS cortarAddress,
        CAST(tourExist             AS BOOLEAN)    AS tourExist,
        CAST(eupmeandongCortarNo   AS BIGINT)     AS eupmeandongCortarNo,
        eupmeandongCortarName::VARCHAR              AS eupmeandongCortarName
      FROM read_csv_auto('{complex_csv_sql}')
    )
    TO '{complex_pq_sql}' (FORMAT PARQUET, COMPRESSION 'snappy');
    """)
    print("✅ CSV→Parquet: complex_list.csv →", complex_pq)
finally:
    # Release the connection even when the COPY above raises.
    con.close()


✅ CSV→Parquet: trade_history_1524.csv
✅ CSV→Parquet: trade_history_26503.csv
✅ CSV→Parquet: trade_history_1530.csv
✅ CSV→Parquet: trade_history_1493.csv
✅ CSV→Parquet: trade_history_1487.csv
✅ CSV→Parquet: trade_history_27796.csv
✅ CSV→Parquet: trade_history_110497.csv
✅ CSV→Parquet: trade_history_14639.csv
✅ CSV→Parquet: trade_history_110680.csv
✅ CSV→Parquet: trade_history_1486.csv
✅ CSV→Parquet: trade_history_343.csv
✅ CSV→Parquet: trade_history_1492.csv
✅ CSV→Parquet: trade_history_9970.csv
✅ CSV→Parquet: trade_history_2602.csv
✅ CSV→Parquet: trade_history_8461.csv
✅ CSV→Parquet: trade_history_8307.csv
✅ CSV→Parquet: trade_history_26502.csv
✅ CSV→Parquet: trade_history_1531.csv
✅ CSV→Parquet: trade_history_100256.csv
✅ CSV→Parquet: trade_history_27391.csv
✅ CSV→Parquet: trade_history_1533.csv
✅ CSV→Parquet: trade_history_10071.csv
✅ CSV→Parquet: trade_history_13578.csv
✅ CSV→Parquet: trade_history_2600.csv
✅ CSV→Parquet: trade_history_1484.csv
✅ CSV→Parquet: trade_history_1490.csv


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# ─── Paths ───────────────────────────────────────────────────────
# Recomputed here so this cell does not depend on path variables from the
# DuckDB cell (it still relies on `os` having been imported).
home        = os.path.expanduser("~")
raw         = os.path.join(home, "dev/RETrend/tmp/raw")
parquet_dir = os.path.join(raw, "parquet")
# Iceberg warehouse directory, derived from the home directory instead of a
# hard-coded absolute /Users/<name>/... path so the notebook runs for any user.
iceberg_dir = os.path.join(raw, "iceberg")
iceberg_ws  = f"file://{iceberg_dir}"

# ─── Spark session with an Iceberg-backed `spark_catalog` ────────
spark = (
    SparkSession.builder
    .appName("IcebergTest")
    # Iceberg runtime for Spark 3.5 / Scala 2.12, version 1.5.2.
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Replace the built-in session catalog with an Iceberg catalog of type
    # "hadoop" whose warehouse is the local directory computed above.
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hadoop")
    .config("spark.sql.catalog.spark_catalog.warehouse", iceberg_dir)
    .config("spark.sql.catalog.spark_catalog.write.metadata.version_hint.enabled", "true")
    # Vectorized Parquet/Iceberg readers are switched off.
    # NOTE(review): the reason is not recorded in this notebook — confirm it is still needed.
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .config("spark.sql.iceberg.vectorization.enabled", "false")
    .getOrCreate()
)

# Set the level of Iceberg's HadoopTableOperations logger to ERROR, using the
# log4j classes reached through the session's JVM gateway.
log4j = spark._jvm.org.apache.log4j
hadoop_ops_logger = log4j.LogManager.getLogger(
    "org.apache.iceberg.hadoop.HadoopTableOperations"
)
hadoop_ops_logger.setLevel(log4j.Level.ERROR)


# ─── (A) trade_history → Iceberg ─────────────────────────────────
# Column name → Spark type for the trade-history Parquet files, in file order.
# `deleteYn` is deliberately left out (it is also commented out in the
# DuckDB export that produced these files).
trade_columns = [
    ("tradeType",               StringType()),
    ("tradeYear",               LongType()),
    ("tradeMonth",              LongType()),
    ("tradeDate",               LongType()),
    ("dealPrice",               LongType()),
    ("floor",                   LongType()),
    ("representativeArea",      DoubleType()),
    ("exclusiveArea",           DoubleType()),
    ("formattedPrice",          StringType()),
    ("formattedTradeYearMonth", StringType()),
    ("areaNo",                  LongType()),
    ("complexNo",               LongType()),
    ("date",                    TimestampType()),
]
# Every field is declared nullable.
trade_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in trade_columns]
)

# Read all per-complex Parquet files at once with the explicit schema.
trade_df = spark.read.schema(trade_schema).parquet(
    f"{parquet_dir}/trade_history_*.parquet"
)

# Drop any previous table, then (re)create it as an Iceberg table from the
# DataFrame.
spark.sql("DROP TABLE IF EXISTS spark_catalog.default.trade_history")
trade_writer = trade_df.writeTo("spark_catalog.default.trade_history").using("iceberg")
trade_writer.createOrReplace()

# ─── (B) complex_list → Iceberg ─────────────────────────────────
# Column name → Spark type for complex_list.parquet, in file order.
complex_columns = [
    ("complexNo",             LongType()),
    ("complexName",           StringType()),
    ("cortarNo",              LongType()),
    ("realEstateTypeCode",    StringType()),
    ("realEstateTypeName",    StringType()),
    ("detailAddress",         StringType()),
    ("latitude",              DoubleType()),
    ("longitude",             DoubleType()),
    ("totalHouseholdCount",   LongType()),
    ("totalBuildingCount",    LongType()),
    ("highFloor",             LongType()),
    ("lowFloor",              LongType()),
    ("useApproveYmd",         StringType()),
    ("dealCount",             LongType()),
    ("leaseCount",            LongType()),
    ("rentCount",             LongType()),
    ("shortTermRentCount",    LongType()),
    ("isInterest",            BooleanType()),
    ("cortarAddress",         StringType()),
    ("tourExist",             BooleanType()),
    ("eupmeandongCortarNo",   LongType()),
    ("eupmeandongCortarName", StringType()),
]
# Every field is declared nullable.
complex_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in complex_columns]
)

# Read the complex master Parquet file with the explicit schema.
complex_df = spark.read.schema(complex_schema).parquet(
    f"{parquet_dir}/complex_list.parquet"
)

# Note the table is named complex_info, not complex_list. Drop any previous
# table, then (re)create it as an Iceberg table from the DataFrame.
spark.sql("DROP TABLE IF EXISTS spark_catalog.default.complex_info")
complex_writer = complex_df.writeTo("spark_catalog.default.complex_info").using("iceberg")
complex_writer.createOrReplace()

# The session is intentionally left running: the following cells query it.
# spark.stop()
print("✅ Parquet → Iceberg 완료")


25/06/28 17:59:17 WARN Utils: Your hostname, daves-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.0.0.2 instead (on interface en0)
25/06/28 17:59:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/dave/.ivy2/cache
The jars for the packages stored in: /Users/dave/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a11ff6eb-507a-4430-b076-d2611ba14b0a;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 in central
:: resolution report :: resolve 40ms :: artifacts dl 1ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	--

:: loading settings :: url = jar:file:/usr/local/spark-3.5.6_2.12/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


25/06/28 17:59:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

✅ Parquet → Iceberg 완료


In [3]:
# Python API: show which catalog and database the session is currently using,
# then list the tables there without truncating the cell contents.
session_context_df = spark.sql("SELECT current_catalog() AS cat, current_database() AS db")
session_context_df.show()

tables_df = spark.sql("SHOW TABLES")
tables_df.show(truncate=False)

+-------------+-------+
|          cat|     db|
+-------------+-------+
|spark_catalog|default|
+-------------+-------+

+---------+----------------+-----------+
|namespace|tableName       |isTemporary|
+---------+----------------+-----------+
|default  |complex_info    |false      |
|default  |trade_history   |false      |
|default  |complex_location|false      |
|default  |trade_iceberg   |false      |
+---------+----------------+-----------+



In [4]:
# Row count of the complex_info table. count(*) without GROUP BY always
# returns exactly one row, so the former LIMIT 5 had no effect and is dropped.
spark.sql("SELECT count(*) from complex_info").show()

+--------+
|count(1)|
+--------+
|   47884|
+--------+



In [5]:
# Row count of the trade_history table. count(*) without GROUP BY always
# returns exactly one row, so the former LIMIT 5 had no effect and is dropped.
spark.sql("SELECT count(*) from trade_history").show()

+--------+
|count(1)|
+--------+
|  101238|
+--------+



In [6]:
# The 20 most recent trades, enriched with the complex's name and coordinates.
# LEFT JOIN keeps trades whose complexNo has no match in complex_info; their
# complex columns come back NULL.
# NOTE(review): the recorded output shows dates as "... 09:00:00" although the
# export step documents the column as midnight ("YYYY-MM-DD 00:00:00") — this
# looks like a time-zone shift between writing and reading; TODO confirm.
latest_trades_sql = """
  SELECT
    t.date,
    t.tradeType,
    t.dealPrice,
    t.floor,
    t.representativeArea,
    t.exclusiveArea,
    t.formattedPrice,
    t.formattedTradeYearMonth,
    c.complexName,
    c.latitude,
    c.longitude
  FROM trade_history t
  LEFT JOIN complex_info c
    ON t.complexNo = c.complexNo
  ORDER BY t.date DESC
  LIMIT 20
"""
joined_df = spark.sql(latest_trades_sql)
joined_df.show(truncate=False)

+-------------------+---------+---------+-----+------------------+-------------+--------------+-----------------------+------------------------------+---------+----------+
|date               |tradeType|dealPrice|floor|representativeArea|exclusiveArea|formattedPrice|formattedTradeYearMonth|complexName                   |latitude |longitude |
+-------------------+---------+---------+-----+------------------+-------------+--------------+-----------------------+------------------------------+---------+----------+
|2025-06-27 09:00:00|A1       |56500    |16   |0.0               |0.0          |5억 6,500     |2025-06-27             |신원마을호반베르디움9단지     |37.661768|126.885058|
|2025-06-27 09:00:00|A1       |27600    |4    |0.0               |0.0          |2억 7,600     |2025-06-27             |푸른3단지동익미라벨           |37.710058|126.902901|
|2025-06-27 09:00:00|A1       |71800    |14   |0.0               |0.0          |7억 1,800     |2025-06-27             |DMC리슈빌더포레스트           |37.593011|126.888972|
