In [None]:
# !pip install duckdb
import duckdb
import os

# Generate the TPC-H benchmark tables with DuckDB's `tpch` extension and
# export every generated table as a Parquet file under `output_dir`.

# 1. Connect to an in-memory database
conn = duckdb.connect(':memory:')

try:
    # 2. Install and load the tpch extension
    print("Installing and loading TPC-H extension...")
    conn.execute("INSTALL tpch;")
    conn.execute("LOAD tpch;")

    # 3. Define the scale factor
    # Scale factors 0.1, 1, 10 and 20 were used to produce four tpch_data
    # folders for testing.
    scale_factor = 0.1
    print(f"Generating TPC-H data with scale factor {scale_factor}...")

    # 4. Generate the data
    conn.execute(f"CALL dbgen(sf={scale_factor});")
    print("Data generation complete.")

    # 5. Collect the names of the generated tables
    tables_result = conn.execute("SHOW TABLES;").fetchall()
    table_names = [row[0] for row in tables_result]
    print(f"Generated tables: {table_names}")

    # 6. Define the export directory
    script_dir = os.getcwd()  # current working directory
    # Alternatively, set an absolute path directly:
    # script_dir = "D:/hkust/summer term/independent projects_flink/flink_project/flink_test"

    project_root = script_dir
    output_dir = os.path.join(project_root, "tpch_data4")

    os.makedirs(output_dir, exist_ok=True)
    print(f"Exporting data to directory: {output_dir}")

    # 7. Export every table as a Parquet file
    for table_name in table_names:
        parquet_file_path = os.path.join(output_dir, f"{table_name}.parquet")
        print(f"Exporting {table_name} to {parquet_file_path}...")
        # The path and table name are spliced into the SQL text, so escape
        # embedded quotes: a working directory such as ".../O'Brien/..."
        # would otherwise terminate the string literal early.
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        quoted_path = "'" + parquet_file_path.replace("'", "''") + "'"
        conn.execute(f"COPY {quoted_table} TO {quoted_path} (FORMAT PARQUET);")
        print(f"Exported {table_name}.")
finally:
    # 8. Close the connection, even when generation or export failed
    conn.close()

# Report the directory that was actually written to (the original message
# named './tpch_data', which did not match `output_dir`).
print(f"All done! Parquet files are in the '{output_dir}' directory.")