In [None]:
# Create the conda environment: conda create -n thesis python=3.12
# Activate the environment: conda activate thesis
# Install the dependencies: pip install -r requirements.txt
# Download the data from https://blobs.duckdb.org/data/tpch-sf100.db: create a `data` folder and save the file there (the notebook opens it as ../data/tpch-sf100.db)

In [5]:
import duckdb
import os

# Step 1: connect to the existing on-disk database file.
# Opening the file directly means data is read from disk instead of the
# whole dataset being loaded into RAM.
db_path = '../data/tpch-sf100.db'

# If the database file is missing, report it and fall back to an empty
# in-memory database; otherwise open the file in read-only mode.
if not os.path.exists(db_path):
    print(f"Database file not found at {db_path}")
    print("Please verify the correct path to your database file.")
    con = duckdb.connect(database=':memory:')
else:
    con = duckdb.connect(database=db_path, read_only=True)
    print(f"Connected to database at {db_path}")

Connected to database at ../data/tpch-sf100.db


In [6]:
# Print the names of all tables available through the current connection.
show_tables_rel = con.sql("SHOW TABLES")
show_tables_rel.show()

┌──────────┐
│   name   │
│ varchar  │
├──────────┤
│ customer │
│ lineitem │
│ nation   │
│ orders   │
│ part     │
│ partsupp │
│ region   │
│ supplier │
└──────────┘



In [7]:
# Fetch the first 5 orders (lowest order keys) with the matching customer name.
# "First" is only well defined with an explicit ORDER BY: a bare LIMIT may
# return any 5 rows, and which ones can differ between runs.
# The 5 orders are picked in a CTE before the join so that only those rows are
# joined against `customer`, instead of joining the full tables and then trimming.
# NOTE(review): assumes `orders` has the standard TPC-H `o_orderkey` column and
# that every order references an existing customer — confirm against the schema.
query = """
    WITH first_orders AS (
        SELECT
            o_orderkey,
            o_custkey,
            o_orderdate,
            o_totalprice,
            o_comment
        FROM orders
        ORDER BY o_orderkey
        LIMIT 5
    )
    SELECT
        c.c_name AS CustomerName,
        o.o_orderdate AS OrderDate,
        o.o_totalprice AS TotalPrice,
        o.o_comment AS OrderComment
    FROM first_orders o
    JOIN customer c ON o.o_custkey = c.c_custkey
    ORDER BY o.o_orderkey;
"""

con.sql(query).show()

┌────────────────────┬────────────┬───────────────┬───────────────────────────────────────────────────────────────────────────┐
│    CustomerName    │ OrderDate  │  TotalPrice   │                               OrderComment                                │
│      varchar       │    date    │ decimal(15,2) │                                  varchar                                  │
├────────────────────┼────────────┼───────────────┼───────────────────────────────────────────────────────────────────────────┤
│ Customer#003689999 │ 1996-01-02 │     224560.83 │ ly express platelets. deposits acc                                        │
│ Customer#007800163 │ 1996-12-01 │      75388.65 │ ve the furiously fluffy dependencies. carefully regular                   │
│ Customer#012331391 │ 1993-10-14 │     255287.36 │  after the asymptotes. instructions cajole after the foxes. carefully unu │
│ Customer#013677602 │ 1995-10-11 │      43119.84 │ st the furiously bold pinto beans. furiously pending

In [8]:
# Run a small sample query against `customer` and materialize the result
# as a pandas DataFrame; the bare last expression renders it in the notebook.
customer_sample_sql = "SELECT * FROM customer LIMIT 10"
df = con.sql(customer_sample_sql).df()
df

Unnamed: 0,c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
0,1,Customer#000000001,j5JsirBM9PsCy0O1m,15,25-989-741-2988,711.56,BUILDING,y final requests wake slyly quickly special ac...
1,2,Customer#000000002,487LW1dovn6Q4dMVymKwwLE9OKf3QG,13,23-768-687-3665,121.65,AUTOMOBILE,y carefully regular foxes. slyly regular reque...
2,3,Customer#000000003,fkRGN8nY4pkE,1,11-719-748-3364,7498.12,AUTOMOBILE,fully. carefully silent instructions sleep alo...
3,4,Customer#000000004,4u58h fqkyE,4,14-128-190-5944,2866.83,MACHINERY,sublate. fluffily even instructions are about th
4,5,Customer#000000005,hwBtxkoBF qSW4KrIk5U 2B1AU7H,3,13-750-942-6364,794.47,HOUSEHOLD,equests haggle furiously against the pending p...
5,6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular e...
6,7,Customer#000000007,8OkMVLQ1dK6Mbu6WG9 w4pLGQ n7MQ,18,28-190-982-9759,9561.95,AUTOMOBILE,"ounts. ironic, regular accounts sleep. final r..."
7,8,Customer#000000008,"j,pZ,Qp,qtFEo0r0c 92qobZtlhSuOqbE4JGV",17,27-147-574-9335,6819.74,BUILDING,riously final excuses sublate quickly among th...
8,9,Customer#000000009,vgIql8H6zoyuLMFNdAMLyE7 H9,8,18-338-906-3675,8324.07,FURNITURE,ss pinto beans believe slyly quiet deposits-- ...
9,10,Customer#000000010,"Vf mQ6Ug9Ucf5OKGYq fsaX AtfsO7,rwY",5,15-741-346-9870,2753.54,HOUSEHOLD,g quickly after the evenly bold


In [9]:
# Fetch 5 customers whose nation key is 8.
# LIMIT caps the result at 5 rows, so only 5 rows are converted into the
# pandas DataFrame. There is no ORDER BY, so which 5 matching rows come back
# is not guaranteed to be the same on every run.
# (Stray "\n" escape artifacts that were pasted at the end of every line of
# the SQL text have been removed; they only injected extra blank lines.)
query_nation_8 = """
    SELECT
        c_custkey,
        c_name,
        c_address,
        c_nationkey,
        c_phone,
        c_acctbal,
        c_mktsegment,
        c_comment
    FROM customer
    WHERE c_nationkey = 8
    LIMIT 5;
"""
con.sql(query_nation_8).df()

Unnamed: 0,c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
0,9,Customer#000000009,vgIql8H6zoyuLMFNdAMLyE7 H9,8,18-338-906-3675,8324.07,FURNITURE,ss pinto beans believe slyly quiet deposits-- ...
1,21,Customer#000000021,42E5BARtqjomD,8,18-902-614-8344,1428.25,MACHINERY,e blithely carefully quick sheaves. furiously ...
2,28,Customer#000000028,"J5tK,OQa07KQSuY5b4qRFYVYRF",8,18-774-241-1462,1007.18,FURNITURE,r accounts are quickly. boldly bold foxes haggle
3,37,Customer#000000037,Um5WbwJ0X8QCOy75y,8,18-385-235-7162,-917.75,FURNITURE,above the slyly regular accounts sleep along t...
4,91,Customer#000000091,9Sce2m BjvDdjQkqMx8UnrUsJkk1IBAvZPTsA,8,18-239-400-3677,4643.14,AUTOMOBILE,yly ironic foxes lose slyly pending asymptotes...


In [10]:
# Profile the data warehouse: for every table, report how many rows and
# how many columns it has.
tables = con.sql("SHOW TABLES").df()
for table in tables['name']:
    # Quote the identifier (doubling any embedded double quotes) so that table
    # names which are reserved words or contain special characters still
    # produce valid SQL when interpolated.
    quoted_table = '"' + table.replace('"', '""') + '"'
    # COUNT(*) returns a single row with a single value; read that scalar
    # directly instead of building a one-cell DataFrame and indexing it.
    row_count = con.sql(f"SELECT COUNT(*) AS row_count FROM {quoted_table}").fetchone()[0]
    # DESCRIBE is used here as one row per column, so its length is the column count.
    columns = con.sql(f"DESCRIBE {quoted_table}").df()
    column_count = len(columns)
    print(f"Table '{table}' has {row_count} rows and {column_count} columns.")
    


Table 'customer' has 15000000 rows and 8 columns.
Table 'lineitem' has 600037902 rows and 16 columns.
Table 'nation' has 25 rows and 4 columns.
Table 'orders' has 150000000 rows and 9 columns.
Table 'part' has 20000000 rows and 9 columns.
Table 'partsupp' has 80000000 rows and 5 columns.
Table 'region' has 5 rows and 3 columns.
Table 'supplier' has 1000000 rows and 7 columns.
