In [2]:
!pip3 install duckdb



In [80]:
import duckdb
from pathlib import Path

In [97]:

# Location of the preprocessed claims database.
# Portable alternative (path relative to the working directory):
# db = Path("data/duckdb/claims_data_preprocessed.duckdb")
db = Path("D:/Projekte/202305_Synthetic_data/cuong/eval-synth-data-privacy/data/duckdb/claims_data_preprocessed.duckdb")

# Fail fast with a clear message: previously a missing file skipped the
# connect call and the code below crashed with a NameError on `conn`.
if not db.is_file():
    raise FileNotFoundError(f"DuckDB database not found: {db}")

print('Connecting...')
conn = duckdb.connect(database=db)

# Retrieve the list of tables in the current schema
tables = conn.execute("SHOW TABLES").fetchall()
print("\nList of tables:")
for table in tables:
    # Each row is a tuple; the table name is the first element
    table_name = table[0]
    # Quote the identifier so names containing special characters or
    # reserved words cannot break (or alter) the COUNT query.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    row_count = conn.execute(f"SELECT COUNT(*) FROM {quoted_name}").fetchone()[0]
    print(f"{table_name} ({row_count} rows)")

Connecting...

List of tables:
drugs (992594 rows)
inpatient_cases (54967 rows)
inpatient_diagnosis (388321 rows)
inpatient_fees (337770 rows)
inpatient_procedures (117374 rows)
insurance_data (67714 rows)
insurants (6743 rows)
outpatient_cases (774466 rows)
outpatient_diagnosis (3609070 rows)
outpatient_fees (6353548 rows)
outpatient_procedures (9803 rows)


In [98]:
# Build the table `join_2014`: every insurant joined (on pid) to the
# insurance periods whose end date falls in 2014.
query = """
CREATE OR REPLACE TABLE join_2014 AS
SELECT *
FROM
    insurants 
JOIN
    insurance_data USING (pid)
WHERE
    EXTRACT(YEAR FROM insurance_data_to) = 2014
ORDER BY
    pid, insurance_data_from
"""

# Row count used to verify the table was created
count_query = "SELECT COUNT(*) FROM join_2014"

# Optional: a small sample of the data for visual inspection
sample_query = "SELECT * FROM join_2014 LIMIT 5"

# Open a dedicated connection for this cell (same pattern as the later join
# cells) so the cell can be re-run on its own; previously it closed a
# connection that had been opened by a different cell.
conn = duckdb.connect(database=db)
try:
    # Execute the query to create the table
    conn.execute(query)

    row_count = conn.execute(count_query).fetchone()[0]
    print(f"Table 'join_2014' created successfully with {row_count} rows")

    sample_df = conn.execute(sample_query).fetchdf()
    display(sample_df)
finally:
    # Always release the database handle, even if a query above fails
    conn.close()

Table 'join_2014' created successfully with 9034 rows


Unnamed: 0,pid,insurants_year_of_birth,insurants_gender,insurance_data_from,insurance_data_to,insurance_data_death,insurance_data_regional_code
0,182.0,1961.0,1,2014-01-01,2014-12-31,0,2
1,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99
2,1206.0,1981.0,1,2014-01-01,2014-12-31,0,15
3,1991.0,1967.0,1,2014-01-01,2014-12-31,0,14
4,2838.0,1994.0,1,2014-01-01,2014-07-03,0,5


# Big join of all inpatient tables

In [109]:
# Build `join_2014_inpatient`: insurants with an insurance period overlapping
# 2014, left-joined to their inpatient cases, diagnoses, procedures and fees.
#
# The insurance join treats a NULL `insurance_data_to` as an open-ended period.
# The child joins must do the same: `x BETWEEN from AND NULL` evaluates to NULL,
# which would silently drop every case/procedure/fee of an open-ended period.
# The upper bound is therefore written as `(to IS NULL OR x <= to)` throughout.
#
# NOTE(review): diagnoses, procedures and fees are joined independently per
# case, so each case yields the cross product of its child rows — confirm this
# fan-out is intended before aggregating over the table.
# NOTE(review): the recorded output of this cell reports rows_with_fees = 0;
# verify that filtering on `inpatient_fees_to` is the intended fee date filter.
create_table_query = """
CREATE OR REPLACE TABLE join_2014_inpatient AS
SELECT 
    ins.pid,
    ins.insurants_year_of_birth,
    ins.insurants_gender,
    insd.insurance_data_from,
    insd.insurance_data_to,
    insd.insurance_data_death,
    insd.insurance_data_regional_code,
    ic.inpatient_caseID,
    ic.inpatient_cases_date_of_admission,
    ic.inpatient_cases_date_of_discharge,
    ic.inpatient_cases_cause_of_admission,
    ic.inpatient_cases_cause_of_discharge,
    ic.inpatient_cases_outpatient_treatment,
    ic.inpatient_cases_department_admission,
    ic.inpatient_cases_department_discharge,
    id.inpatient_diagnosis_diagnosis,
    id.inpatient_diagnosis_type_of_diagnosis,
    id.inpatient_diagnosis_is_main_diagnosis,
    id.inpatient_diagnosis_localisation,
    ip.inpatient_procedures_procedure_code,
    ip.inpatient_procedures_localisation,
    ip.inpatient_procedures_date_of_procedure,
    ifees.inpatient_fees_billing_code,
    ifees.inpatient_fees_amount_due,
    ifees.inpatient_fees_quantity,
    ifees.inpatient_fees_from,
    ifees.inpatient_fees_to
FROM 
    insurants ins
JOIN 
    insurance_data insd ON ins.pid = insd.pid
    AND EXTRACT(YEAR FROM insd.insurance_data_from) <= 2014 
    AND (insd.insurance_data_to IS NULL OR EXTRACT(YEAR FROM insd.insurance_data_to) >= 2014)
LEFT JOIN 
    inpatient_cases ic ON ins.pid = ic.pid
    AND (
        ic.inpatient_cases_date_of_discharge IS NULL
        OR (
            ic.inpatient_cases_date_of_discharge >= insd.insurance_data_from
            AND (insd.insurance_data_to IS NULL
                 OR ic.inpatient_cases_date_of_discharge <= insd.insurance_data_to)
        )
    )
LEFT JOIN 
    inpatient_diagnosis id ON ic.pid = id.pid AND ic.inpatient_caseID = id.inpatient_caseID
LEFT JOIN 
    inpatient_procedures ip ON ic.pid = ip.pid AND ic.inpatient_caseID = ip.inpatient_caseID
    AND ip.inpatient_procedures_date_of_procedure >= insd.insurance_data_from
    AND (insd.insurance_data_to IS NULL
         OR ip.inpatient_procedures_date_of_procedure <= insd.insurance_data_to)
LEFT JOIN 
    inpatient_fees ifees ON ic.pid = ifees.pid AND ic.inpatient_caseID = ifees.inpatient_caseID
    AND ifees.inpatient_fees_to >= insd.insurance_data_from
    AND (insd.insurance_data_to IS NULL
         OR ifees.inpatient_fees_to <= insd.insurance_data_to)
ORDER BY 
    ins.pid, 
    ic.inpatient_caseID, 
    id.inpatient_diagnosis_is_main_diagnosis DESC,
    ip.inpatient_procedures_date_of_procedure,
    ifees.inpatient_fees_from
"""

# Coverage statistics: total rows, distinct patients/cases, and how many
# rows and patients actually carry case, diagnosis, procedure and fee data.
statistics_query = """
SELECT
    COUNT(*) as total_rows,
    COUNT(DISTINCT pid) as unique_patients,
    COUNT(DISTINCT inpatient_caseID) as unique_cases,
    SUM(CASE WHEN inpatient_caseID IS NOT NULL THEN 1 ELSE 0 END) as rows_with_cases,
    SUM(CASE WHEN inpatient_diagnosis_diagnosis IS NOT NULL THEN 1 ELSE 0 END) as rows_with_diagnoses,
    SUM(CASE WHEN inpatient_procedures_procedure_code IS NOT NULL THEN 1 ELSE 0 END) as rows_with_procedures,
    SUM(CASE WHEN inpatient_fees_billing_code IS NOT NULL THEN 1 ELSE 0 END) as rows_with_fees,
    COUNT(DISTINCT CASE WHEN inpatient_caseID IS NOT NULL THEN pid END) as patients_with_cases,
    COUNT(DISTINCT CASE WHEN inpatient_diagnosis_diagnosis IS NOT NULL THEN pid END) as patients_with_diagnoses,
    COUNT(DISTINCT CASE WHEN inpatient_procedures_procedure_code IS NOT NULL THEN pid END) as patients_with_procedures,
    COUNT(DISTINCT CASE WHEN inpatient_fees_billing_code IS NOT NULL THEN pid END) as patients_with_fees
FROM
    join_2014_inpatient
"""

# Small sample of the resulting table for visual inspection
sample_query = """
SELECT * FROM join_2014_inpatient
LIMIT 10
"""

conn = duckdb.connect(database=db)
try:
    # Execute the query to create the table
    conn.execute(create_table_query)

    stats = conn.execute(statistics_query).fetchdf()
    display(stats)

    sample_df = conn.execute(sample_query).fetchdf()
    display(sample_df)
finally:
    # Always release the database handle, even if a query above fails
    conn.close()

Unnamed: 0,total_rows,unique_patients,unique_cases,rows_with_cases,rows_with_diagnoses,rows_with_procedures,rows_with_fees,patients_with_cases,patients_with_diagnoses,patients_with_procedures,patients_with_fees
0,211755,6448,6629,205467.0,205467.0,196947.0,0.0,2523,2523,1682,0


Unnamed: 0,pid,insurants_year_of_birth,insurants_gender,insurance_data_from,insurance_data_to,insurance_data_death,insurance_data_regional_code,inpatient_caseID,inpatient_cases_date_of_admission,inpatient_cases_date_of_discharge,...,inpatient_diagnosis_is_main_diagnosis,inpatient_diagnosis_localisation,inpatient_procedures_procedure_code,inpatient_procedures_localisation,inpatient_procedures_date_of_procedure,inpatient_fees_billing_code,inpatient_fees_amount_due,inpatient_fees_quantity,inpatient_fees_from,inpatient_fees_to
0,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,,NaT,NaT,...,,,,,NaT,,,,NaT,NaT
1,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,1.0,0.0,3-820,9.0,2014-09-09,,,,NaT,NaT
2,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,1.0,0.0,1-2070,9.0,2014-09-09,,,,NaT,NaT
3,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,0.0,0.0,3-820,9.0,2014-09-09,,,,NaT,NaT
4,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,0.0,0.0,3-820,9.0,2014-09-09,,,,NaT,NaT
5,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,0.0,0.0,3-820,9.0,2014-09-09,,,,NaT,NaT
6,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,0.0,0.0,3-820,9.0,2014-09-09,,,,NaT,NaT
7,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,0.0,0.0,1-2070,9.0,2014-09-09,,,,NaT,NaT
8,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,0.0,0.0,1-2070,9.0,2014-09-09,,,,NaT,NaT
9,1109.0,1963.0,1,2014-01-01,2014-10-24,1,99,4217621.0,2014-09-08,2014-09-10,...,0.0,0.0,1-2070,9.0,2014-09-09,,,,NaT,NaT


# Big join of all outpatient tables

In [110]:
# Build `join_2014_outpatient`: insurants with an insurance period overlapping
# 2014, left-joined to their 2014 outpatient cases and to the diagnoses,
# procedures and fees of those cases.
#
# The insurance join treats a NULL `insurance_data_to` as an open-ended period.
# The procedure/fee date filters must do the same: `x BETWEEN from AND NULL`
# evaluates to NULL and would silently drop every procedure/fee of an
# open-ended period, so the upper bound is written as `(to IS NULL OR x <= to)`.
#
# NOTE(review): diagnoses, procedures and fees are joined independently per
# case, so each case yields the cross product of its child rows — confirm this
# fan-out is intended before aggregating over the table.
create_table_query = """
CREATE OR REPLACE TABLE join_2014_outpatient AS
SELECT 
    ins.pid,
    ins.insurants_year_of_birth,
    ins.insurants_gender,
    insd.insurance_data_from,
    insd.insurance_data_to,
    insd.insurance_data_death,
    insd.insurance_data_regional_code,
    oc.outpatient_caseID,
    oc.outpatient_cases_practice_code,
    oc.outpatient_cases_from,
    oc.outpatient_cases_to,
    oc.outpatient_cases_amount_due,
    oc.outpatient_cases_year,
    oc.outpatient_cases_quarter,
    od.outpatient_diagnosis_diagnosis,
    od.outpatient_diagnosis_qualification,
    od.outpatient_diagnosis_localisation,
    op.outpatient_procedures_procedure_code,
    op.outpatient_procedures_localisation,
    op.outpatient_procedures_date_of_procedure,
    op.outpatient_procedures_specialty_code,
    op.outpatient_procedures_physician_code,
    ofees.outpatient_fees_physician_code,
    ofees.outpatient_fees_specialty_code,
    ofees.outpatient_fees_billing_code,
    ofees.outpatient_fees_quantity,
    ofees.outpatient_fees_date
FROM 
    insurants ins
JOIN 
    insurance_data insd ON ins.pid = insd.pid
    AND EXTRACT(YEAR FROM insd.insurance_data_from) <= 2014 
    AND (insd.insurance_data_to IS NULL OR EXTRACT(YEAR FROM insd.insurance_data_to) >= 2014)
LEFT JOIN 
    outpatient_cases oc ON ins.pid = oc.pid
    AND oc.outpatient_cases_year = 2014
LEFT JOIN 
    outpatient_diagnosis od ON oc.pid = od.pid AND oc.outpatient_caseID = od.outpatient_caseID
LEFT JOIN 
    outpatient_procedures op ON oc.pid = op.pid AND oc.outpatient_caseID = op.outpatient_caseID
    AND op.outpatient_procedures_date_of_procedure >= insd.insurance_data_from
    AND (insd.insurance_data_to IS NULL
         OR op.outpatient_procedures_date_of_procedure <= insd.insurance_data_to)
LEFT JOIN 
    outpatient_fees ofees ON oc.pid = ofees.pid AND oc.outpatient_caseID = ofees.outpatient_caseID
    AND ofees.outpatient_fees_date >= insd.insurance_data_from
    AND (insd.insurance_data_to IS NULL
         OR ofees.outpatient_fees_date <= insd.insurance_data_to)
ORDER BY 
    ins.pid, 
    oc.outpatient_caseID, 
    oc.outpatient_cases_from,
    op.outpatient_procedures_date_of_procedure,
    ofees.outpatient_fees_date
"""

# Coverage statistics: total rows, distinct patients/cases, and how many
# rows and patients actually carry case, diagnosis, procedure and fee data.
statistics_query = """
SELECT
    COUNT(*) as total_rows,
    COUNT(DISTINCT pid) as unique_patients,
    COUNT(DISTINCT outpatient_caseID) as unique_cases,
    SUM(CASE WHEN outpatient_caseID IS NOT NULL THEN 1 ELSE 0 END) as rows_with_cases,
    SUM(CASE WHEN outpatient_diagnosis_diagnosis IS NOT NULL THEN 1 ELSE 0 END) as rows_with_diagnoses,
    SUM(CASE WHEN outpatient_procedures_procedure_code IS NOT NULL THEN 1 ELSE 0 END) as rows_with_procedures,
    SUM(CASE WHEN outpatient_fees_billing_code IS NOT NULL THEN 1 ELSE 0 END) as rows_with_fees,
    COUNT(DISTINCT CASE WHEN outpatient_caseID IS NOT NULL THEN pid END) as patients_with_cases,
    COUNT(DISTINCT CASE WHEN outpatient_diagnosis_diagnosis IS NOT NULL THEN pid END) as patients_with_diagnoses,
    COUNT(DISTINCT CASE WHEN outpatient_procedures_procedure_code IS NOT NULL THEN pid END) as patients_with_procedures,
    COUNT(DISTINCT CASE WHEN outpatient_fees_billing_code IS NOT NULL THEN pid END) as patients_with_fees
FROM
    join_2014_outpatient
"""

# Small sample of the resulting table for visual inspection
sample_query = """
SELECT * FROM join_2014_outpatient
LIMIT 10
"""

conn = duckdb.connect(database=db)
try:
    # Execute the query to create the table
    conn.execute(create_table_query)

    stats = conn.execute(statistics_query).fetchdf()
    display(stats)

    sample_df = conn.execute(sample_query).fetchdf()
    display(sample_df)
finally:
    # Always release the database handle, even if a query above fails
    conn.close()

Unnamed: 0,total_rows,unique_patients,unique_cases,rows_with_cases,rows_with_diagnoses,rows_with_procedures,rows_with_fees,patients_with_cases,patients_with_diagnoses,patients_with_procedures,patients_with_fees
0,1188422,6448,95544,1188247.0,1187933.0,16576.0,1068228.0,6326,6324,663,6326


Unnamed: 0,pid,insurants_year_of_birth,insurants_gender,insurance_data_from,insurance_data_to,insurance_data_death,insurance_data_regional_code,outpatient_caseID,outpatient_cases_practice_code,outpatient_cases_from,...,outpatient_procedures_procedure_code,outpatient_procedures_localisation,outpatient_procedures_date_of_procedure,outpatient_procedures_specialty_code,outpatient_procedures_physician_code,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
0,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,28019780.0,518721492,2014-12-17,...,,,NaT,,,683166854,54,96013,1.0,2014-12-17
1,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,49367502.0,845111840,2014-02-11,...,,,NaT,,,868524702,2,32120,1.0,2014-02-11
2,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,52771724.0,898277745,2014-12-17,...,,,NaT,,,182270362,62,96013,1.0,2014-12-17
3,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,78542989.0,403313187,2014-07-03,...,,,NaT,,,868524702,2,3003,1.0,2014-07-03
4,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,78542989.0,403313187,2014-07-03,...,,,NaT,,,868524702,2,3003,1.0,2014-07-03
5,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,78542989.0,403313187,2014-07-03,...,,,NaT,,,868524702,2,3003,1.0,2014-07-03
6,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,102397572.0,77644261,2014-04-24,...,,,NaT,,,267199921,21,10211,1.0,2014-04-24
7,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,117732737.0,906419405,2014-12-11,...,,,NaT,,,160633651,51,21214,1.0,2014-12-11
8,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,117732737.0,906419405,2014-12-11,...,,,NaT,,,160633651,51,21214,1.0,2014-12-11
9,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,117732737.0,906419405,2014-12-11,...,,,NaT,,,160633651,51,21214,1.0,2014-12-11


# Join drugs

In [111]:
# Build `join_2014_drugs`: insurants with an insurance period overlapping 2014,
# left-joined to the drugs dispensed to them within that insurance period.
#
# The insurance join treats a NULL `insurance_data_to` as an open-ended period.
# The dispense-date filter must do the same: `x BETWEEN from AND NULL`
# evaluates to NULL and would silently drop every drug row of an open-ended
# period, so the upper bound is written as `(to IS NULL OR x <= to)`.
create_table_query = """
CREATE OR REPLACE TABLE join_2014_drugs AS
SELECT 
    ins.pid,
    ins.insurants_year_of_birth,
    ins.insurants_gender,
    insd.insurance_data_from,
    insd.insurance_data_to,
    insd.insurance_data_death,
    insd.insurance_data_regional_code,
    d.drugs_date_of_prescription,
    d.drugs_date_of_dispense,
    d.drugs_pharma_central_number,
    d.drugs_specialty_of_prescriber,
    d.drugs_physician_code,
    d.drugs_practice_code,
    d.drugs_quantity,
    d.drugs_amount_due,
    d.drugs_atc,
    d.drugs_ddd
FROM 
    insurants ins
JOIN 
    insurance_data insd ON ins.pid = insd.pid
    AND EXTRACT(YEAR FROM insd.insurance_data_from) <= 2014 
    AND (insd.insurance_data_to IS NULL OR EXTRACT(YEAR FROM insd.insurance_data_to) >= 2014)
LEFT JOIN 
    drugs d ON ins.pid = d.pid
    AND d.drugs_date_of_dispense >= insd.insurance_data_from
    AND (insd.insurance_data_to IS NULL
         OR d.drugs_date_of_dispense <= insd.insurance_data_to)
ORDER BY 
    ins.pid, 
    d.drugs_date_of_dispense
"""

# Coverage and summary statistics about the drugs join
statistics_query = """
SELECT
    COUNT(*) as total_rows,
    COUNT(DISTINCT pid) as unique_patients,
    SUM(CASE WHEN drugs_pharma_central_number IS NOT NULL THEN 1 ELSE 0 END) as rows_with_drugs,
    COUNT(DISTINCT CASE WHEN drugs_pharma_central_number IS NOT NULL THEN pid END) as patients_with_drugs,
    COUNT(DISTINCT drugs_pharma_central_number) as unique_drug_codes,
    COUNT(DISTINCT drugs_atc) as unique_atc_codes,
    AVG(drugs_quantity) as avg_drug_quantity,
    AVG(drugs_amount_due) as avg_drug_cost,
    MIN(drugs_date_of_dispense) as earliest_dispense_date,
    MAX(drugs_date_of_dispense) as latest_dispense_date,
    COUNT(DISTINCT drugs_physician_code) as unique_prescribers,
    COUNT(DISTINCT drugs_practice_code) as unique_practices
FROM
    join_2014_drugs
"""

# Small sample of the resulting table for visual inspection
sample_query = """
SELECT * FROM join_2014_drugs
LIMIT 10
"""

# A single connection serves the whole cell; the former second
# duckdb.connect() call in the middle of the cell was redundant.
conn = duckdb.connect(database=db)
try:
    conn.execute(create_table_query)

    stats = conn.execute(statistics_query).fetchdf()
    display(stats)

    sample_df = conn.execute(sample_query).fetchdf()
    display(sample_df)
finally:
    # Always release the database handle, even if a query above fails
    conn.close()

Unnamed: 0,total_rows,unique_patients,rows_with_drugs,patients_with_drugs,unique_drug_codes,unique_atc_codes,avg_drug_quantity,avg_drug_cost,earliest_dispense_date,latest_dispense_date,unique_prescribers,unique_practices
0,111757,6448,109921.0,5922,11333,1166,1.121108,83.375746,2014-01-01,2014-12-31,13926,11630


Unnamed: 0,pid,insurants_year_of_birth,insurants_gender,insurance_data_from,insurance_data_to,insurance_data_death,insurance_data_regional_code,drugs_date_of_prescription,drugs_date_of_dispense,drugs_pharma_central_number,drugs_specialty_of_prescriber,drugs_physician_code,drugs_practice_code,drugs_quantity,drugs_amount_due,drugs_atc,drugs_ddd
0,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-01-06,2014-01-06,2012935,2,868524702,403313187,1.0,18.87,N05BA09,25.0
1,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-01-06,2014-01-06,1218132,2,868524702,403313187,1.0,197.34,N03AX14,33.333
2,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-01-28,2014-01-28,1897570,2,868524702,403313187,1.0,166.61,N03AX14,33.333
3,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-02-03,2014-02-03,4939292,21,267199921,77644261,1.0,13.85,D07AC14,15.0
4,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-02-03,2014-02-03,6313409,2,868524702,403313187,1.0,14.6,M01AE01,25.0
5,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-02-03,2014-02-03,4578374,21,267199921,77644261,1.0,21.78,D07AC13,46.773
6,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-02-03,2014-02-03,2634223,2,868524702,403313187,1.0,13.98,C08CA01,100.0
7,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-02-20,2014-02-20,2012935,2,868524702,403313187,1.0,18.87,N05BA09,25.0
8,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-02-20,2014-02-20,1218132,2,868524702,403313187,1.0,197.34,N03AX14,33.333
9,182.0,1961.0,1,2014-01-01,2014-12-31,0,2,2014-02-20,2014-02-20,4220336,2,868524702,403313187,1.0,28.73,N06AB04,100.0


In [106]:
# Final cleanup: close the DuckDB connection currently bound to `conn`.
# NOTE(review): the join cells above already call conn.close() themselves,
# so this is likely redundant — confirm before removing.
conn.close()