In [1]:
# Merging everything with Dask

In [1]:
import dask.dataframe as dd
import pandas as pd

# Function to rename columns with prefix and formatting
def rename_columns(df, prefix, exceptions=None):
    """
    Prefix and normalise the column names of a DataFrame.

    Every column that is not listed in ``exceptions`` is stripped of
    surrounding whitespace, has its spaces replaced by underscores, is
    lower-cased, and is then prefixed with ``"<prefix>_"``.

    Parameters:
        df (DataFrame): The input DataFrame; only its ``rename`` method is used.
        prefix (str): Prefix to put in front of each renamed column.
        exceptions (list, optional): Column names to leave exactly as they are.
            They are compared against the raw (un-normalised) column name.

    Returns:
        DataFrame: The result of ``df.rename`` with the new column names.
    """
    keep_as_is = [] if exceptions is None else exceptions

    def _new_name(column):
        # Protected columns (e.g. join keys) pass through untouched.
        if column in keep_as_is:
            return column
        normalised = column.strip().replace(' ', '_').lower()
        return f"{prefix}_{normalised}"

    return df.rename(columns=_new_name)

# Directory that holds the input CSV files
base_path = 'testdata'

# Map each table name to its CSV file; every file follows the pattern
# <base_path>/test.<table name>.csv. The tuple order fixes the order in
# which the tables are later read.
file_paths = {
    table: f'{base_path}/test.{table}.csv'
    for table in (
        'insurance_data',
        'insurants',
        'inpatient_cases',
        'inpatient_diagnosis',
        'inpatient_procedures',
        'inpatient_fees',
        'outpatient_cases',
        'outpatient_diagnosis',
        'outpatient_fees',
        'outpatient_procedures',
        'drugs',
    )
}

# Explicit column dtypes per table, keyed by the same table names as
# `file_paths` and passed to `dd.read_csv(dtype=...)`.
#
# Conventions used below:
#   * 'Int64' is pandas' nullable integer dtype, so these columns may
#     contain missing values.
#   * plain `int` does not allow missing values; `pid` is assumed to be
#     always present -- TODO confirm against the data.
#   * code-like columns are read as strings; the sample output shows values
#     such as '0101' and '06' whose leading zeros would otherwise be lost.
#   * 'physican code' is spelled exactly as it appears in the file headers.
# Date columns are not listed here; they are handled via `parse_dates`.
dtypes = {
    'insurance_data': {
        'pid': int,
        'death': 'Int64',
        'regional_code': 'Int64'
    },
    'insurants': {
        'pid': int,
        'year_of_birth': int,
        'gender': int,
    },
    'inpatient_cases': {
        'pid': int,
        'caseID': 'Int64',
        'cause of admission': 'str',
        'cause of discharge': 'str',
        'outpatient treatment': 'Int64',
        'department admission': str,
        'department discharge': str
    },
    'inpatient_diagnosis': {
        'pid': int,
        'caseID': 'Int64',
        'diagnosis': str,
        'type of diagnosis': str,
        'is main diagnosis': 'Int64',
        'localisation': 'Int64'
    },
    'inpatient_fees': {
        'pid': int,
        'caseID': 'Int64',
        'billing code': str,
        'amount due': float,
        'quantity': 'Int64'
    },
    'inpatient_procedures': {
        'pid': int,
        'caseID': 'Int64',
        'procedure code': str,
        'localisation': 'Int64',
    },
    'outpatient_cases': {
        'pid': int,
        'caseID': 'Int64',
        'practice code': str,
        'amount due': float,
        'year': 'Int64',
        'quarter': 'Int64'
    },
    'outpatient_diagnosis': {
        'pid': int,
        'caseID': 'Int64',
        'diagnosis': str,
        'qualification': str,
        'localisation': 'Int64'
    },
    'outpatient_fees': {
        'pid': int,
        'caseID': 'Int64',
        'physican code': str,
        'specialty code': str,
        'billing code': str,
        'quantity': 'Int64',
    },
    'outpatient_procedures': {
        'pid': int,
        'caseID': 'Int64',
        'procedure code': str,
        'localisation': 'Int64',
    },
    'drugs': {
        'pid': int,
        'pharma central number': str,
        'specialty of prescriber': str,
        'physican code': str,
        # Fixed: this entry previously ended with ';' instead of ',',
        # which is a SyntaxError inside a dict literal.
        'practice code': str,
        'outpatient_diagnosis': str,
        'outpatient_diagnosis_qualification': str,
        'outpatient_diagnosis_localisation': 'Int64',
    }
}

# Columns to parse as datetimes, per table (passed to
# `dd.read_csv(parse_dates=...)`). Tables that are not listed here are read
# without any date parsing.
parse_dates = {
    'insurance_data': ['from', 'to'],
    'inpatient_cases': ['date of admission', 'date of discharge'],
    'inpatient_fees': ['from', 'to'],
    'inpatient_procedures': ['date of procedure'],
    'outpatient_cases': ['from', 'to'],
    'outpatient_fees': ['date'],
    # Added for consistency with 'inpatient_procedures': the outpatient
    # procedures table also carries a 'date of procedure' column, which was
    # previously left unparsed.
    'outpatient_procedures': ['date of procedure'],
    'drugs': ['date of prescription', 'date of dispense']
}

# Load every table lazily and normalise its column names
dataframes = {}
for table_name, file_path in file_paths.items():
    # Per-table reader settings; tables without an entry in `dtypes` /
    # `parse_dates` fall back to None.
    read_options = {
        'sep': '\t',
        'dtype': dtypes.get(table_name),
        'parse_dates': parse_dates.get(table_name),
    }

    # Read the file into a Dask DataFrame, then prefix all columns with the
    # table name -- except the join keys, which must keep a common name
    # across tables.
    df = rename_columns(
        dd.read_csv(file_path, **read_options),
        prefix=table_name,
        exceptions=['pid', 'caseID'],
    )

    # Keep the processed frame under its table name
    dataframes[table_name] = df

In [2]:
# Show the table-name -> Dask DataFrame mapping built by the loading loop
dataframes

{'insurance_data': Dask DataFrame Structure:
                  pid insurance_data_from insurance_data_to insurance_data_death insurance_data_regional_code
 npartitions=1                                                                                               
                int64      datetime64[ns]    datetime64[ns]                Int64                        Int64
                  ...                 ...               ...                  ...                          ...
 Dask Name: operation, 2 expressions
 Expr=RenameFrame(frame=ReadCSV(0f25b5d), columns=<function rename_columns.<locals>.<lambda> at 0x1513a66bcae0>),
 'insurants': Dask DataFrame Structure:
                  pid insurants_year_of_birth insurants_gender
 npartitions=1                                                
                int64                   int64            int64
                  ...                     ...              ...
 Dask Name: operation, 2 expressions
 Expr=RenameFrame(frame=ReadCSV(c18

In [3]:
# Inspect the lazy structure (column names and dtypes) of one table
dataframes['outpatient_cases']

Unnamed: 0_level_0,pid,caseID,outpatient_cases_practice_code,outpatient_cases_from,outpatient_cases_to,outpatient_cases_amount_due,outpatient_cases_year,outpatient_cases_quarter
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int64,Int64,string,datetime64[ns],datetime64[ns],float64,Int64,Int64
,...,...,...,...,...,...,...,...


In [4]:
# Check that the declared dtypes and parsed date column were applied
print(dataframes['outpatient_fees'].dtypes)

pid                                         int64
caseID                                      Int64
outpatient_fees_physican_code     string[pyarrow]
outpatient_fees_specialty_code    string[pyarrow]
outpatient_fees_billing_code      string[pyarrow]
outpatient_fees_quantity                    Int64
outpatient_fees_date               datetime64[ns]
dtype: object


In [5]:
# Materialise one table to verify that reading the CSV works end to end.
# Note: `df_result` is reused (overwritten) by later cells.
df_result = dataframes['inpatient_cases'].compute()

In [6]:
# Display the computed inpatient_cases table
df_result

Unnamed: 0,pid,caseID,inpatient_cases_date_of_admission,inpatient_cases_date_of_discharge,inpatient_cases_cause_of_admission,inpatient_cases_cause_of_discharge,inpatient_cases_outpatient_treatment,inpatient_cases_department_admission,inpatient_cases_department_discharge
0,6,5556089,2019-06-14,2019-06-14,,,1,,
1,6,1568766,2019-08-30,2019-08-30,,,1,,
2,6,2714453,2019-03-30,2019-03-30,,,1,,
3,6,364774,2018-10-27,2018-10-27,,,1,,
4,6,11633323,2018-07-06,2018-07-07,0101,06,0,0100,0100
...,...,...,...,...,...,...,...,...,...
721,986,9750153,2019-11-21,2019-11-21,,,1,,
722,986,9117719,2019-10-25,2019-10-26,0107,01,0,0100,0100
723,987,6300871,2019-01-10,2019-01-13,0101,01,0,3400,3400
724,995,370981,2018-12-29,2018-12-29,,,1,,


In [7]:
# Build the base table: insurance periods, enriched with the insurant's
# master data and then with that person's outpatient cases.
# Both joins are left joins on `pid` only, so every insurance_data row of a
# person is paired with every outpatient case of that person.
df_merged = (
    dataframes['insurance_data']
    .merge(dataframes['insurants'], on='pid', how='left')
    .merge(dataframes['outpatient_cases'], on='pid', how='left')
)

In [8]:
# Materialise the merge so the result can be inspected.
# Up to this point `df_merged` is only a lazy task graph; `.compute()`
# triggers the actual CSV reads and joins.
df_result = df_merged.compute()

In [9]:
# Display the computed result of the insurance_data / insurants / outpatient_cases merge
df_result

Unnamed: 0,pid,insurance_data_from,insurance_data_to,insurance_data_death,insurance_data_regional_code,insurants_year_of_birth,insurants_gender,caseID,outpatient_cases_practice_code,outpatient_cases_from,outpatient_cases_to,outpatient_cases_amount_due,outpatient_cases_year,outpatient_cases_quarter
0,724,2019-01-01,2019-12-31,0,9,1985,1,113432713,687404132,2018-07-01,2018-09-13,113.259265,2018,3
1,724,2019-01-01,2019-12-31,0,9,1985,1,54227018,838242784,2018-07-01,2018-07-01,46.886284,2018,3
2,724,2019-01-01,2019-12-31,0,9,1985,1,73873110,838242784,2019-04-05,2019-04-12,58.886200,2019,2
3,183,2018-01-01,2018-12-31,0,5,1987,1,,,NaT,NaT,,,
4,831,2020-01-01,2020-12-31,0,16,1977,2,,,NaT,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54776,986,2019-01-01,2019-12-31,0,5,1989,1,133596529,722488739,2019-08-01,2019-08-10,136.975700,2019,3
54777,986,2019-01-01,2019-12-31,0,5,1989,1,85527165,922391990,2020-04-03,2020-04-27,49.116323,2020,2
54778,986,2019-01-01,2019-12-31,0,5,1989,1,1004537983,922391990,2020-07-01,2020-09-30,95.000000,2020,3
54779,986,2019-01-01,2019-12-31,0,5,1989,1,109579845,836390643,2020-04-01,2020-04-01,20.775885,2020,2


In [10]:
# Attach the outpatient detail tables (diagnosis, procedures, fees) to the
# base table. Each is left-joined on both pid and caseID, in that order.
df_merged = (
    df_merged
    .merge(dataframes['outpatient_diagnosis'], on=['pid', 'caseID'], how='left')
    .merge(dataframes['outpatient_procedures'], on=['pid', 'caseID'], how='left')
    .merge(dataframes['outpatient_fees'], on=['pid', 'caseID'], how='left')
)

In [11]:
# Keep a handle on the outpatient-only merge before `df_merged` is rebound
# by the inpatient/drugs merges further down.
df_merged_outpatient = df_merged

In [12]:
# Materialise the full outpatient merge and display it.
# The three detail joins multiply rows per case, so this result is much
# larger than the base merge.
df_result = df_merged.compute()
df_result

Unnamed: 0,pid,insurance_data_from,insurance_data_to,insurance_data_death,insurance_data_regional_code,insurants_year_of_birth,insurants_gender,caseID,outpatient_cases_practice_code,outpatient_cases_from,...,outpatient_diagnosis_qualification,outpatient_diagnosis_localisation,outpatient_procedures_procedure_code,outpatient_procedures_localisation,outpatient_procedures_date_of_procedure,outpatient_fees_physican_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
0,724,2019-01-01,2019-12-31,0,9,1985,1,113432713,687404132,2018-07-01,...,G,,,,,979483231,31,13691R,1,2018-09-13
1,724,2019-01-01,2019-12-31,0,9,1985,1,113432713,687404132,2018-07-01,...,G,,,,,979483231,31,33042,1,2018-09-13
2,724,2019-01-01,2019-12-31,0,9,1985,1,113432713,687404132,2018-07-01,...,G,,,,,979483231,31,32413,1,2018-09-13
3,724,2019-01-01,2019-12-31,0,9,1985,1,113432713,687404132,2018-07-01,...,G,,,,,979483231,31,32030,1,2018-09-13
4,724,2019-01-01,2019-12-31,0,9,1985,1,113432713,687404132,2018-07-01,...,G,,,,,979483231,31,32443,1,2018-09-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586140,986,2019-01-01,2019-12-31,0,5,1989,1,1004537983,922391990,2020-07-01,...,G,,,,,,,H0000,1,2020-09-03
1586141,986,2019-01-01,2019-12-31,0,5,1989,1,1004537983,922391990,2020-07-01,...,G,,,,,,,H0000,1,2020-08-21
1586142,986,2019-01-01,2019-12-31,0,5,1989,1,1004537983,922391990,2020-07-01,...,G,,,,,,,H0000,1,2020-08-09
1586143,986,2019-01-01,2019-12-31,0,5,1989,1,109579845,836390643,2020-04-01,...,G,,,,,621898700,00,01212,1,2020-04-01


In [13]:
# Materialise the inpatient_cases table on its own for inspection before
# it is merged into `df_merged`.
df_inpatient_cases = dataframes['inpatient_cases'].compute()
df_inpatient_cases

Unnamed: 0,pid,caseID,inpatient_cases_date_of_admission,inpatient_cases_date_of_discharge,inpatient_cases_cause_of_admission,inpatient_cases_cause_of_discharge,inpatient_cases_outpatient_treatment,inpatient_cases_department_admission,inpatient_cases_department_discharge
0,6,5556089,2019-06-14,2019-06-14,,,1,,
1,6,1568766,2019-08-30,2019-08-30,,,1,,
2,6,2714453,2019-03-30,2019-03-30,,,1,,
3,6,364774,2018-10-27,2018-10-27,,,1,,
4,6,11633323,2018-07-06,2018-07-07,0101,06,0,0100,0100
...,...,...,...,...,...,...,...,...,...
721,986,9750153,2019-11-21,2019-11-21,,,1,,
722,986,9117719,2019-10-25,2019-10-26,0107,01,0,0100,0100
723,987,6300871,2019-01-10,2019-01-13,0101,01,0,3400,3400
724,995,370981,2018-12-29,2018-12-29,,,1,,


In [14]:
# Attach the inpatient tables one after another, each left-joined on both
# pid and caseID: cases first, then diagnosis, procedures and fees.
# NOTE(review): the `caseID` column in `df_merged` comes from the
# outpatient_cases merge; whether inpatient case IDs share that ID space is
# not visible here -- confirm that joining on it is intended.
for inpatient_table in (
    'inpatient_cases',
    'inpatient_diagnosis',
    'inpatient_procedures',
    'inpatient_fees',
):
    df_merged = df_merged.merge(
        dataframes[inpatient_table], on=['pid', 'caseID'], how='left'
    )

# Drugs has no caseID column, so it is joined on pid only
df_merged = df_merged.merge(dataframes['drugs'], on='pid', how='left')

In [15]:
# Materialise the complete merge (outpatient + inpatient + drugs) and
# display it. This is the first point at which the drugs CSV is actually
# read, so dtype problems in that file surface here.
df_result = df_merged.compute()
df_result

ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+-------------------------+---------+----------+
| Column                  | Found   | Expected |
+-------------------------+---------+----------+
| physican code           | float64 | int64    |
| practice code           | float64 | int64    |
| specialty of prescriber | float64 | int64    |
+-------------------------+---------+----------+

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'physican code': 'float64',
       'practice code': 'float64',
       'specialty of prescriber': 'float64'}

to the call to `read_csv`/`read_table`.

Alternatively, provide `assume_missing=True` to interpret
all unspecified integer columns as floats.