In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

# 경로 설정
home = os.path.expanduser('~')
csv_path      = os.path.join(home, 'nfs_mount', 'trade_history_13337.csv')
parquet_root  = os.path.join(home, 'nfs_mount', 'prquet')

# Parquet 저장 디렉토리 생성
os.makedirs(parquet_root, exist_ok=True)

# CSV 로드 (모든 컬럼 그대로 읽기)
df = pd.read_csv(csv_path)

# 날짜 컬럼 생성
df['date'] = pd.to_datetime(
    df['tradeYear'].astype(str) + '-' +
    df['tradeMonth'].astype(str).str.zfill(2) + '-' +
    df['tradeDate'].astype(str).str.zfill(2)
)

# Parquet 파일로 쓰기 (모든 컬럼, date 포함)
table = pa.Table.from_pandas(df)
out_path = os.path.join(parquet_root, 'trade_history.parquet')
pq.write_table(table, out_path, compression='snappy')

print(f"Parquet 변환 완료: {out_path}")

✅ Parquet 변환 완료: /Users/dave/nfs_mount/prquet/trade_history.parquet


In [3]:
import duckdb
import os

home        = os.path.expanduser('~')
local_db    = os.path.join(home, 'realestate.duckdb')
parquet_path = os.path.join(home, 'nfs_mount', 'parquet', 'trade_history.parquet')

# 로컬 DB 파일에 연결 (read-write 가능)
con = duckdb.connect(local_db)

# staging_raw 테이블 생성
con.execute(f"""
CREATE OR REPLACE TABLE staging_raw AS
SELECT *
FROM read_parquet('{parquet_path}');
""")
print("staging_raw created")

staging_raw created


In [6]:
import duckdb
import os

home         = os.path.expanduser('~')
local_db     = os.path.join(home, 'realestate.duckdb')
parquet_path = os.path.join(home, 'nfs_mount', 'parquet', 'trade_history.parquet')
iceberg_loc  = os.path.join(home, 'nfs_mount', 'iceberg', 'trade_iceberg')

# 1) DuckDB 연결
con = duckdb.connect(local_db)

# 2) Iceberg 확장 설치·로드 (최초 1회만)
con.execute("INSTALL iceberg;")
con.execute("LOAD iceberg;")

# 3) staging_raw 생성
con.execute(f"""
CREATE OR REPLACE TABLE staging_raw AS
SELECT *
FROM read_parquet('{parquet_path}');
""")

# 4) Iceberg 테이블에 쓰기/덮어쓰기 (overwrite 모드)
con.execute(f"""
CALL write_iceberg_table(
  'file',                    -- catalog type: file-based
  '{iceberg_loc}',           -- table location
  'trade_iceberg',           -- table name
  'parquet',                 -- file format
  'date',                    -- partition column
  'staging_raw',             -- source table
  'overwrite'                -- write mode: overwrite existing
);
""")

# 5) 샘플 조회
df = con.execute(f"""
SELECT *
FROM read_iceberg('{iceberg_loc}')
ORDER BY date DESC
LIMIT 5;
""").df()

print(df)


CatalogException: Catalog Error: Table Function with name write_iceberg_table does not exist!
Did you mean "iceberg_metadata"?