In [3]:
!pip3 install pandas
!pip3 install duckdb

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ------ --------------------------------- 1.8/11.6 MB 10.0 MB/s eta 0:00:01
   ------------- -------------------------- 3.9/11.6 MB 9.8 MB/s eta 0:00:01
   -------------------- ------------------- 6.0/11.6 MB 10.0 MB/s eta 0:00:01
   --------------------------- ------------ 7.9/11.6 MB 9.9 MB/s eta 0:00:01
   ---------------------------------- ----- 10.0/11.6 MB 9.8 MB/s eta 0:00:01
   ---------------------------------------  11.5/11.6 MB 9.9 MB/s eta 0:00:0

In [5]:
# to install: pip install duckdb
import pandas as pd
import duckdb

# Tiny in-memory frame used to check that DuckDB can query pandas data.
mydf = pd.DataFrame({'a': [1, 2, 3]})

# The SQL text names `mydf`; the query runs against the DataFrame bound to
# that Python variable, so the variable name and the SQL must stay in sync.
sum_relation = duckdb.query("SELECT sum(a) FROM mydf")
print(sum_relation.to_df())

   sum(a)
0     6.0


In [13]:
import os
# Raw string: the Windows path contains backslash sequences (\B, \C, \d, \c)
# that a normal string literal would treat as (invalid) escape sequences.
testdata_path = r'D:\Benutzer\Cuong.VoTa\datasets\claims_data'
# Full path of the insurants CSV inside the test-data directory.
insurants_path = os.path.join(testdata_path, 'sle.insurants.csv')

In [18]:
# Connection opened without a database file argument; used to explore the CSV.
con = duckdb.connect()

# The file is read as tab-separated, with column names and types supplied
# explicitly rather than taken from the file.
insurants_columns = ['pid', 'year of birth', 'gender']
insurants_dtypes = {'pid': int, 'year of birth': 'Int64', 'gender': 'Int64'}
insurants_duckdb = con.read_csv(
    insurants_path,
    delimiter='\t',
    names=insurants_columns,
    dtype=insurants_dtypes,
)

# The FROM clause names the Python variable `insurants_duckdb` defined above;
# renaming that variable requires changing the SQL text as well.
query = "SELECT * FROM insurants_duckdb WHERE gender = 1"

# Run the filter and materialize the matching rows as a pandas DataFrame.
result = con.execute(query).fetch_df()

print(result)

          pid  year of birth  gender
0       10940           1963       1
1       52745           1972       1
2       82797           1982       1
3      125486           1969       1
4      241529           1948       1
...       ...            ...     ...
4985  4137946           1976       1
4986  4143244           1981       1
4987  4151318           1978       1
4988  4167894           1990       1
4989  4177740           1949       1

[4990 rows x 3 columns]


In [19]:
# Show all tables from the new duckdb database.
# Open the persisted database read-only: this cell only inspects it.
con = duckdb.connect(database='claims_data.duckdb', read_only=True)
try:
    # List tables
    print(con.execute("SHOW TABLES").fetchall())

    # Preview the first rows of one table
    print(con.execute("SELECT * FROM insurants LIMIT 5").fetchdf())
finally:
    # Close even if a query above raises, so the database file handle is not
    # left open in the kernel when the cell fails.
    con.close()

[('drugs',), ('inpatient_cases',), ('inpatient_diagnosis',), ('inpatient_fees',), ('inpatient_procedures',), ('insurance_data',), ('insurants',), ('outpatient_cases',), ('outpatient_diagnosis',), ('outpatient_fees',), ('outpatient_procedures',)]
      pid  insurants_year_of_birth  insurants_gender
0   10940                     1963                 1
1   52745                     1972                 1
2   82797                     1982                 1
3  125486                     1969                 1
4  205240                     1994                 2


In [None]:
# Print every table of the DuckDB database together with its columns and types.
# Opened read-only: this cell only inspects the schema.
con = duckdb.connect(database='claims_data.duckdb', read_only=True)
try:
    # Each row returned by SHOW TABLES is a 1-tuple holding the table name.
    tables = con.execute("SHOW TABLES").fetchall()

    print("Tables and their columns:\n")

    # Loop through tables to retrieve columns and types
    for table in tables:
        table_name = table[0]
        print(f"Table: {table_name}")

        # Quote the identifier (doubling embedded quotes) so names containing
        # spaces, reserved words or quotes do not break the statement.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        columns_info = con.execute(f"DESCRIBE {quoted_name}").fetchall()

        for column in columns_info:
            # Only the first two fields of each DESCRIBE row are used here:
            # the column name and its type.
            column_name, column_type, *_ = column
            print(f"  - {column_name}: {column_type}")

        print()
finally:
    # Close even if a query above raises, so the database file handle is not
    # left open in the kernel when the cell fails.
    con.close()

Tables and their columns:

Table: drugs
  - pid: BIGINT
  - drugs_date_of_prescription: TIMESTAMP_NS
  - drugs_date_of_dispense: TIMESTAMP_NS
  - drugs_pharma_central_number: VARCHAR
  - drugs_specialty_of_prescriber: VARCHAR
  - drugs_physican_code: VARCHAR
  - drugs_practice_code: VARCHAR
  - drugs_quantity: DOUBLE
  - drugs_amount_due: DOUBLE
  - drugs_atc: VARCHAR
  - drugs_ddd: DOUBLE

Table: inpatient_cases
  - pid: BIGINT
  - inpatient_caseID: BIGINT
  - inpatient_cases_date_of_admission: TIMESTAMP_NS
  - inpatient_cases_date_of_discharge: TIMESTAMP_NS
  - inpatient_cases_cause_of_admission: VARCHAR
  - inpatient_cases_cause_of_discharge: VARCHAR
  - inpatient_cases_outpatient_treatment: BIGINT
  - inpatient_cases_department_admission: VARCHAR
  - inpatient_cases_department_discharge: VARCHAR

Table: inpatient_diagnosis
  - pid: BIGINT
  - inpatient_caseID: BIGINT
  - inpatient_diagnosis_diagnosis: VARCHAR
  - inpatient_diagnosis_type_of_diagnosis: VARCHAR
  - inpatient_diagnosi

# Sort tables by pid for further use

In [2]:
import duckdb

def sort_tables_by_pid(database_path):
    """
    Rewrite every table that has a 'pid' column so its rows are written in 'pid' order.

    Each qualifying table is replaced in place via
    ``CREATE OR REPLACE TABLE <t> AS SELECT * FROM <t> ORDER BY pid``.
    Tables without a 'pid' column are reported and left untouched.
    The connection is always closed, even when a statement fails.

    Notes:
        SQL tables do not carry a guaranteed row order; queries that depend on
        a specific order should still use ORDER BY.
        NOTE(review): the rewrite copies only the selected data, so constraints
        or indexes defined on the original table are presumably not carried
        over - confirm before running on a schema that relies on them.

    Parameters:
        database_path (str): Path to the DuckDB database file.

    Returns:
        None
    """
    # Read-write connection: the tables are replaced in the database file.
    con = duckdb.connect(database=database_path, read_only=False)
    try:
        # Each row returned by SHOW TABLES is a 1-tuple holding the table name.
        tables = con.execute("SHOW TABLES").fetchall()

        for table in tables:
            table_name = table[0]

            # Quote the identifier (doubling embedded quotes) so names with
            # spaces, reserved words or quotes do not break the statements.
            quoted_name = '"' + table_name.replace('"', '""') + '"'

            # The first field of each DESCRIBE row is the column name.
            columns = con.execute(f"DESCRIBE {quoted_name}").fetchall()
            column_names = [col[0] for col in columns]

            if 'pid' not in column_names:
                print(f"Skipping table '{table_name}' as it does not have a 'pid' column.")
                continue

            # Replace the table with a copy of itself written in 'pid' order.
            con.execute(f"""
                CREATE OR REPLACE TABLE {quoted_name} AS
                SELECT * FROM {quoted_name} ORDER BY pid
            """)
            print(f"Table '{table_name}' sorted by 'pid' and saved persistently.")
    finally:
        # Release the database file even if one of the statements above fails.
        con.close()
    print("All tables have been sorted by 'pid' and updated in the database.")

In [3]:
# Re-write every table of the claims database in 'pid' order (modifies the file).
sort_tables_by_pid(database_path='claims_data.duckdb')

Table 'drugs' sorted by 'pid' and saved persistently.
Table 'inpatient_cases' sorted by 'pid' and saved persistently.
Table 'inpatient_diagnosis' sorted by 'pid' and saved persistently.
Table 'inpatient_fees' sorted by 'pid' and saved persistently.
Table 'inpatient_procedures' sorted by 'pid' and saved persistently.
Table 'insurance_data' sorted by 'pid' and saved persistently.
Table 'insurants' sorted by 'pid' and saved persistently.
Table 'outpatient_cases' sorted by 'pid' and saved persistently.
Table 'outpatient_diagnosis' sorted by 'pid' and saved persistently.
Table 'outpatient_fees' sorted by 'pid' and saved persistently.
Table 'outpatient_procedures' sorted by 'pid' and saved persistently.
All tables have been sorted by 'pid' and updated in the database.
