In [2]:
import pandas as pd

In [3]:
import pwd

pwd

<module 'pwd' (built-in)>

In [4]:
df_insurants = pd.read_csv('wig2/testdata/test.insurants.csv', sep='\t')
df_insurance_data = pd.read_csv('wig2/testdata/test.insurance_data.csv', sep='\t')

In [5]:
# def filter_by_timeframe(df, insurance_from_col, insurance_to_col, case_from_col, case_to_col):
#     """
#     Filters rows from the dataframe where the outpatient/inpatient case timeframe is outside the insurance timeframe.

#     Parameters:
#         df (pd.DataFrame): The dataframe to filter.
#         insurance_from_col (str): Column name for the insurance 'from' date.
#         insurance_to_col (str): Column name for the insurance 'to' date.
#         case_from_col (str): Column name for the case 'from' date.
#         case_to_col (str): Column name for the case 'to' date.

#     Returns:
#         pd.DataFrame: The filtered dataframe.
#     """
#     # Ensure columns are datetime for comparison
#     df[insurance_from_col] = pd.to_datetime(df[insurance_from_col])
#     df[insurance_to_col] = pd.to_datetime(df[insurance_to_col])
#     df[case_from_col] = pd.to_datetime(df[case_from_col])
#     df[case_to_col] = pd.to_datetime(df[case_to_col])
    
#     # Apply filter condition
#     filtered_df = df[
#         (df[case_from_col] >= df[insurance_from_col]) & 
#         (df[case_to_col] <= df[insurance_to_col])
#     ]
#     return filtered_df

In [6]:
def filter_by_timeframe(df, insurance_from_col, insurance_to_col, case_from_col, case_to_col):
    """
    Filters rows from the dataframe where the outpatient/inpatient case timeframe is outside the insurance timeframe,
    and merges the filtered rows back into the original dataframe, keeping rows with NaT values intact.

    Parameters:
        df (pd.DataFrame): The dataframe to filter.
        insurance_from_col (str): Column name for the insurance 'from' date.
        insurance_to_col (str): Column name for the insurance 'to' date.
        case_from_col (str): Column name for the case 'from' date.
        case_to_col (str): Column name for the case 'to' date.

    Returns:
        pd.DataFrame: The original dataframe with only valid filtered rows updated.
    """
    # Separate rows with non-NaT values in case_from_col and case_to_col
    df_without_na = df.dropna(subset=[case_from_col, case_to_col]).copy()  # Use .copy() to avoid SettingWithCopyWarning

    # Ensure columns are datetime for comparison
    df_without_na[insurance_from_col] = pd.to_datetime(df_without_na[insurance_from_col])
    df_without_na[insurance_to_col] = pd.to_datetime(df_without_na[insurance_to_col])
    df_without_na[case_from_col] = pd.to_datetime(df_without_na[case_from_col])
    df_without_na[case_to_col] = pd.to_datetime(df_without_na[case_to_col])
    
    # Apply filter condition
    filtered_df = df_without_na[
        (df_without_na[case_from_col] >= df_without_na[insurance_from_col]) & 
        (df_without_na[case_to_col] <= df_without_na[insurance_to_col])
    ]
    
    # Merge the filtered rows back into the original dataframe
    df_final = pd.concat([df[~df.index.isin(df_without_na.index)], filtered_df], ignore_index=True)
    
    return df_final


In [7]:
df_insurance_data

Unnamed: 0,pid,from,to,death,regional_code
0,724,2019-01-01,2019-12-31,0,9
1,183,2018-01-01,2018-12-31,0,5
2,831,2020-01-01,2020-12-31,0,16
3,111,2020-01-01,2020-12-31,0,5
4,269,2018-01-01,2018-09-01,1,5
...,...,...,...,...,...
3813,791,2018-01-01,2018-12-31,0,8
3814,83,2018-01-01,2018-03-30,0,8
3815,352,2020-07-20,2020-12-31,0,16
3816,986,2019-01-01,2019-12-31,0,5


# Insurants + Insurance Data

In [8]:
# Start with `df_insurants` as the base
df_all = df_insurants
# Incremental joins
# 2 out of 11: insurance, insurance_data
df_all_2 = pd.merge(df_all, df_insurance_data, on='pid', how='left')
df_all_2.rename(columns={"from": "insurance_from", "to": "insurance_to"}, inplace=True)

# Convert column types
df_all_2['pid'] = df_all_2['pid'].astype('Int64')
df_all_2['Year of birth'] = df_all_2['Year of birth'].astype('Int64')
df_all_2['gender'] = df_all_2['gender'].astype('Int64')
df_all_2['insurance_from'] = pd.to_datetime(df_all_2['insurance_from'])
df_all_2['insurance_to'] = pd.to_datetime(df_all_2['insurance_to'])
df_all_2['death'] = df_all_2['death'].astype('Int64')
df_all_2['regional_code'] = df_all_2['regional_code'].astype('Int64')
df_all_2

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code
0,1,1995,1,2020-11-01,2020-12-31,0,16
1,1,1995,1,2020-01-01,2020-10-31,0,16
2,1,1995,1,2019-01-01,2019-12-31,0,16
3,1,1995,1,2018-01-01,2018-12-31,0,16
4,2,1990,1,2018-10-01,2018-12-31,0,14
...,...,...,...,...,...,...,...
3846,999,1954,1,2020-01-01,2020-12-31,0,3
3847,999,1954,1,2018-01-01,2018-12-31,0,3
3848,1000,1970,1,2018-01-01,2018-12-31,0,8
3849,1000,1970,1,2019-01-01,2019-12-31,0,8


# Outpatient Cases

In [9]:
df_outpatient_cases = pd.read_csv('wig2/testdata/test.outpatient_cases.csv', sep='\t', dtype=str)
# df_outpatient_cases = pd.read_csv('wig2/testdata/test.outpatient_cases.csv', sep='\t')

df_outpatient_cases.rename(columns={"caseID": "outpatient_caseID","practice code": "outpatient_case_practice_code","from": "outpatient_case_from", 
                                    "to": "outpatient_case_to", "amount due": "outpatient_case_amout_due", "year": "outpatient_case_year",
                                    "quarter": "outpatient_case_quarter"}, inplace=True)

# Convert column types
df_outpatient_cases['pid'] = df_outpatient_cases['pid'].astype(int)
df_outpatient_cases['outpatient_caseID'] = df_outpatient_cases['outpatient_caseID'].astype('Int64')
df_outpatient_cases['outpatient_case_from'] = pd.to_datetime(df_outpatient_cases['outpatient_case_from'])
df_outpatient_cases['outpatient_case_to'] = pd.to_datetime(df_outpatient_cases['outpatient_case_to'])
df_outpatient_cases['outpatient_case_amout_due'] = df_outpatient_cases['outpatient_case_amout_due'].astype(float)
df_outpatient_cases['outpatient_case_year'] = df_outpatient_cases['outpatient_case_year'].astype('Int64')
df_outpatient_cases['outpatient_case_quarter'] = df_outpatient_cases['outpatient_case_quarter'].astype('Int64')

df_outpatient_cases

Unnamed: 0,pid,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter
0,72,76264259,720171600,2020-05-08,2020-05-08,27.381551,2020,2
1,763,21097556,823839918,2020-03-06,2020-03-06,45.813490,2020,1
2,189,166284976,786077570,2018-05-02,2018-06-20,178.931144,2018,2
3,782,154749062,105171228,2020-10-01,2020-12-05,61.119695,2020,4
4,254,147204667,514903431,2019-07-24,2019-07-24,71.476000,2019,3
...,...,...,...,...,...,...,...,...
14373,275,156219026,922806199,2018-10-07,2018-11-30,135.598197,2018,4
14374,98,70030426,358043182,2019-10-24,2019-10-24,30.537000,2019,4
14375,319,159568236,068123842,2019-07-31,2019-07-31,110.799100,2019,3
14376,405,197675847,750434802,2020-03-05,2020-03-05,39.314367,2020,1


In [10]:
df_outpatient_cases.loc[df_outpatient_cases['pid'] == 16]

Unnamed: 0,pid,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter


## Merge

In [11]:
# 3 out of 11: insurance, insurance_data, outpatient_cases
df_all_3 = pd.merge(df_all_2, df_outpatient_cases, on='pid', how='left')
df_all_3.shape

(54814, 14)

In [12]:
# Show the result of the cartesian product
df_all_3.loc[df_all_3['pid'] == 1]

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter
0,1,1995,1,2020-11-01,2020-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3
1,1,1995,1,2020-01-01,2020-10-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3
2,1,1995,1,2019-01-01,2019-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3
3,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3


In [13]:
df_all_3 = filter_by_timeframe(
    df_all_3,
    insurance_from_col="insurance_from",
    insurance_to_col="insurance_to",
    case_from_col="outpatient_case_from",
    case_to_col="outpatient_case_to"
)

# Check the filtered dataframe
print(df_all_3.shape)

(14521, 14)


## Remove
- Removing patients if they have no outpatient case entry (to less information to really consider it in an evaluation)

In [14]:
# Delete Rows if outpatient case ID is NA
df_all_3.dropna(subset=['outpatient_caseID'], inplace=True)
df_all_3.shape

(14159, 14)

In [15]:
df_all_3.loc[df_all_3['pid'] == 1]

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter
362,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3


## SQL-Try

In [16]:
# from pandasql import sqldf

# # Define a reusable function for running SQL queries
# run_query = lambda query: sqldf(query, globals())

# query = """
# SELECT *
# FROM df_all_2 AS a
# LEFT JOIN df_outpatient_cases AS opc
# USING(pid)
# WHERE opc.outpatient_case_from >= a.insurance_from
#   AND opc.outpatient_case_to <= a.insurance_to
# """

# df_all_3_test = run_query(query)
# df_all_3_test.shape

# Outpatient fees

In [17]:
df_outpatient_fees = pd.read_csv(
    'wig2/testdata/test.outpatient_fees.csv',
    sep='\t',
    dtype={
        'pid': int,
        'caseID': 'Int64',
        'physican code': str,
        'specialty code': 'Int64',
        'billing code': str,
        'quantity': 'Int64'
    },
    parse_dates=['date']  # Specify columns to be parsed as dates
)

# Rename columns
df_outpatient_fees.rename(columns={
    "caseID": "outpatient_caseID",
    "physican code": "outpatient_fees_physician_code",
    "specialty code": "outpatient_fees_specialty_code",
    "billing code": "outpatient_fees_billing_code",
    "quantity": "outpatient_fees_quantity",
    "date": "outpatient_fees_date"
}, inplace=True)
df_outpatient_fees

Unnamed: 0,pid,outpatient_caseID,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
0,63,374239,337800201,1,03061,1,2019-09-21
1,63,374239,337800201,1,32001,1,2019-09-21
2,63,374239,337800201,1,03040,1,2019-09-21
3,63,374239,337800201,1,03230,1,2019-09-21
4,63,374239,337800201,1,03060,1,2019-09-21
...,...,...,...,...,...,...,...
68298,172,1052546836,,,H0000,1,2018-09-22
68299,172,1052546836,,,H0000,1,2018-07-07
68300,172,1052546836,,,HP2,1,2018-07-06
68301,172,1052546836,,,HP3,1,2018-07-06


In [18]:
# check if outpatient_fees_physician_code has the leading zeroes
df_outpatient_fees.loc[df_outpatient_fees['pid'] == 215]

Unnamed: 0,pid,outpatient_caseID,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
40,215,9182246,47921201,1,32001,1,2019-06-12
41,215,9182246,47921201,1,3003,1,2019-06-12
42,215,9182246,47921201,1,3230,1,2019-06-12
43,215,9182246,47921201,1,3040,1,2019-06-12
67103,215,205295870,47921201,1,3230,1,2020-07-17
67104,215,205295870,47921201,1,3230,1,2020-07-10
67105,215,205295870,47921201,1,3040,1,2020-07-10
67106,215,205295870,47921201,1,3003,1,2020-07-10
67107,215,205295870,47921201,1,32001,1,2020-07-10


## Merge

In [19]:
# 4 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees
df_all_4 = pd.merge(df_all_3, df_outpatient_fees, on=['pid', 'outpatient_caseID'], how='left')
df_all_4

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
0,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3,610229105,5,06222,1,2018-09-13
1,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3,610229105,5,06220,1,2018-09-13
2,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3,610229105,5,06225E,1,2018-09-13
3,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3,610229105,5,06220T,1,2018-09-13
4,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,2018-09-13,30.075665,2018,3,610229105,5,06225T,1,2018-09-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67221,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,2020-11-08,47.837807,2020,4,636052610,10,18220,1,2020-11-08
67222,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,2020-11-08,47.837807,2020,4,636052610,10,32001,1,2020-11-08
67223,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,2020-11-08,47.837807,2020,4,636052610,10,18222,1,2020-11-08
67224,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,2020-11-08,47.837807,2020,4,636052610,10,30201,1,2020-11-08


## Filter
- liegen outpatient fees date zwischen der insurance from bis to?

In [20]:
df_all_4 = filter_by_timeframe(
    df_all_4,
    insurance_from_col="insurance_from",
    insurance_to_col="insurance_to",
    case_from_col="outpatient_fees_date",
    case_to_col="outpatient_fees_date"
)

# Check the filtered dataframe
print(df_all_4.shape)

(67225, 19)


In [21]:
# Patient 659 has 55 entries in fees
df_outpatient_fees.loc[df_outpatient_fees['pid'] == 659].shape

(55, 7)

In [22]:
# Before merging patient 659 has 9 rows
df_all_3.loc[df_all_3['pid'] == 659].shape

(9, 14)

In [23]:
# After merging everything patient 659 has 44 rows
df_all_4.loc[df_all_4['pid'] == 659].shape

(45, 19)

## SQL-Try
- Same results as above with merging and filtering afterwards, sql is a bit slower

In [25]:
# query_3_4 = """
# SELECT *
# FROM df_all_3 AS a
# LEFT JOIN df_outpatient_fees AS opf
# ON a.pid = opf.pid 
#   AND a.outpatient_caseID = opf.outpatient_caseID
# WHERE opf.outpatient_fees_date >= a.insurance_from
#   AND opf.outpatient_fees_date <= a.insurance_to
# """

# df_all_4_sql = run_query(query_3_4)
# df_all_4_sql.shape

# Outpatient Procedures

In [26]:
df_outpatient_procedures = pd.read_csv(
    'wig2/testdata/test.outpatient_procedures.csv',
    sep='\t',
    dtype={
        'pid': int,
        'caseID': int,
        'procedure code': str,
        'localisation': 'Int64'
    },
    parse_dates=['date of procedure']  # Specify the column to parse as dates
)

# Rename columns
df_outpatient_procedures.rename(columns={
    "caseID": "outpatient_caseID",
    "procedure code": "outpatient_procedure_code",
    "localisation": "outpatient_procedure_localisation",
    "date of procedure": "outpatient_procedure_date"
}, inplace=True)

df_outpatient_procedures

Unnamed: 0,pid,outpatient_caseID,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date
0,527,288793,5-142.2,1,2021-07-19
1,527,288793,5-142.2,2,2021-07-19
2,551,3765687,5-156.9,2,2020-10-21
3,393,4108489,5-385.70,1,2020-02-16
4,232,4654625,1-650.2,,2020-01-05
...,...,...,...,...,...
114,551,200377624,5-156.9,1,2021-06-14
115,551,200377624,5-156.9,2,2021-06-14
116,551,200804681,5-156.9,2,2020-02-10
117,351,202453658,5-780.6w,2,2020-10-06


In [27]:
df_outpatient_procedures.loc[df_outpatient_procedures['pid'] == 527]

Unnamed: 0,pid,outpatient_caseID,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date
0,527,288793,5-142.2,1,2021-07-19
1,527,288793,5-142.2,2,2021-07-19


## Mergen

In [28]:
# 5 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure
df_all_5 = pd.merge(df_all_4, df_outpatient_procedures, on=['pid', 'outpatient_caseID'], how='left')
df_all_5

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,outpatient_case_year,outpatient_case_quarter,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date
0,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2018,3,610229105,5,06222,1,2018-09-13,,,NaT
1,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2018,3,610229105,5,06220,1,2018-09-13,,,NaT
2,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2018,3,610229105,5,06225E,1,2018-09-13,,,NaT
3,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2018,3,610229105,5,06220T,1,2018-09-13,,,NaT
4,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2018,3,610229105,5,06225T,1,2018-09-13,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67313,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,2020,4,636052610,10,18220,1,2020-11-08,,,NaT
67314,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,2020,4,636052610,10,32001,1,2020-11-08,,,NaT
67315,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,2020,4,636052610,10,18222,1,2020-11-08,,,NaT
67316,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,2020,4,636052610,10,30201,1,2020-11-08,,,NaT


## Rausfiltern
- Check if outpatient procedure ist in der insurance timeframe

In [29]:
df_all_5 = filter_by_timeframe(
    df_all_5,
    insurance_from_col="insurance_from",
    insurance_to_col="insurance_to",
    case_from_col="outpatient_procedure_date",
    case_to_col="outpatient_procedure_date"
)

# Check the filtered dataframe
print(df_all_5.shape)

(66748, 22)


In [30]:
# Example for insurant with pid= 527 we already have 140 rows
df_all_5.loc[df_all_5['pid'] == 527].shape

(140, 22)

# Outpatient diagnosis

In [31]:
df_outpatient_diagnosis = pd.read_csv(
    'wig2/testdata/test.outpatient_diagnosis.csv',
    sep='\t',
    dtype={
        'pid': 'int',
        'caseID': int,
        'diagnosis': str,
        'qualification': str,
        'localisation': 'Int64'
    }
)

# Rename columns
df_outpatient_diagnosis.rename(columns={
    "caseID": "outpatient_caseID",
    "diagnosis": "outpatient_diagnosis",
    "qualification": "outpatient_diagnosis_qualification",
    "localisation": "outpatient_diagnosis_localisation"
}, inplace=True)

print(df_outpatient_diagnosis.shape)

(88417, 5)


In [32]:
a = df_outpatient_diagnosis['pid'].unique()
print(sorted(a))
print("numpy size of unique patients: " + str(a.size))

[np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(17), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(40), np.int64(41), np.int64(42), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(55), np.int64(56), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(66), np.int64(67), np.int64(68), np.int64(69), np.int64(70), np.int64(71), np.int64(72), np.int64(74), np.int64(75), np.int64(76), np.int64(77), np.int64(78), np.int64(79), np.int64(80), 

- theoretically there are just 892 patients that had diagnosis
- but in the end they had over 88.000 procedures combined? insane

In [33]:
# Count the number of rows for each pid
df_outpatient_diagnosis['pid'].value_counts()[:5]

pid
300    433
128    423
378    391
485    386
894    370
Name: count, dtype: int64

In [34]:
# Check example diagnosis of insurant with pid=1
df_outpatient_diagnosis.loc[df_outpatient_diagnosis['pid'] == 300]

Unnamed: 0,pid,outpatient_caseID,outpatient_diagnosis,outpatient_diagnosis_qualification,outpatient_diagnosis_localisation
1860,300,11357409,N184,G,
1861,300,11357409,M750,G,1
1862,300,11357409,M542,G,
1863,300,11357409,M509,G,
1864,300,11357409,M501,G,2
...,...,...,...,...,...
88403,300,1050241566,M542,G,
88404,300,1050241566,M509,G,
88405,300,1050241566,M501,G,2
88406,300,1050241566,M4809,G,


## Merge

In [35]:
# 6 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis
df_all_6 = pd.merge(df_all_5, df_outpatient_diagnosis, on=['pid', 'outpatient_caseID'], how='left')
df_all_6

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date,outpatient_diagnosis,outpatient_diagnosis_qualification,outpatient_diagnosis_localisation
0,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,5,06222,1,2018-09-13,,,NaT,M350,G,3
1,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,5,06222,1,2018-09-13,,,NaT,H522,G,3
2,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,5,06222,1,2018-09-13,,,NaT,H521,G,3
3,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,5,06222,1,2018-09-13,,,NaT,H311,G,3
4,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,5,06222,1,2018-09-13,,,NaT,H193,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413989,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,10,18211,1,2020-11-08,,,NaT,Q667,G,3
413990,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,10,18211,1,2020-11-08,,,NaT,M9983,G,
413991,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,10,18211,1,2020-11-08,,,NaT,M773,G,3
413992,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,10,18211,1,2020-11-08,,,NaT,M5416,G,


In [36]:
# Example for insurant with pid= 527 we already have 932 rows, after joining 6 tables
df_all_6.loc[df_all_6['pid'] == 527].shape

(932, 25)

In [37]:
# df_all_6.to_csv("df_outpatient_6.csv", sep=',', index=True, encoding='utf-8')

# Drugs

In [38]:
df_drugs = pd.read_csv(
    'wig2/testdata/test.drugs.csv',
    sep='\t',
    dtype={
        'pid': int,
        'pharma central number': str,
        'outpatient_diagnosis': str,
        'outpatient_diagnosis_qualification': str,
        'outpatient_diagnosis_localisation': 'Int64'
    },
    parse_dates=['date of prescription', 'date of dispense']  # Convert date columns
)

# Add prefix and clean column names
df_drugs = df_drugs.add_prefix('drugs_')
df_drugs.columns = df_drugs.columns.str.replace(' ', '_')

df_drugs

Unnamed: 0,drugs_pid,drugs_date_of_prescription,drugs_date_of_dispense,drugs_pharma_central_number,drugs_specialty_of_prescriber,drugs_physican_code,drugs_practice_code,drugs_quantity,drugs_amount_due,drugs_atc,drugs_ddd
0,542,2020-04-24,2020-04-26,06444040,52.0,11441752.0,462545519.0,2.0,28.30,N02BB02,8.333
1,35,2019-06-21,2019-06-23,03507952,15.0,100231815.0,118275942.0,1.0,16.27,N02BB02,16.667
2,590,2020-11-27,2020-11-27,03507952,1.0,86391701.0,675905294.0,1.0,16.27,N02BB02,16.667
3,345,2020-02-01,2020-02-02,02754708,1.0,650812501.0,861471675.0,1.0,14.54,H03AA01,50.000
4,345,2020-04-19,2020-04-23,02754708,1.0,650812501.0,861471675.0,1.0,14.54,H03AA01,50.000
...,...,...,...,...,...,...,...,...,...,...,...
9394,384,2018-12-15,2018-12-20,04596863,0.0,621898700.0,14632358.0,1.0,31.56,P01BA02,30.039
9395,814,2018-12-14,2018-12-20,04596863,31.0,768128631.0,794650148.0,1.0,31.56,P01BA02,30.039
9396,581,2018-04-07,2018-04-07,04596863,1.0,363816801.0,893595739.0,1.0,31.56,P01BA02,30.039
9397,581,2018-07-08,2018-07-18,04596863,1.0,363816801.0,893595739.0,1.0,31.56,P01BA02,30.039


## Merge

In [39]:
# 7 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs
df_all_7 = pd.merge(df_all_6, df_drugs, left_on='pid', right_on='drugs_pid', how='left')
df_all_7

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,drugs_date_of_prescription,drugs_date_of_dispense,drugs_pharma_central_number,drugs_specialty_of_prescriber,drugs_physican_code,drugs_practice_code,drugs_quantity,drugs_amount_due,drugs_atc,drugs_ddd
0,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2020-12-13,2020-12-14,07387887,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
1,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2020-12-13,2020-12-14,07387887,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
2,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2020-12-13,2020-12-14,07387887,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
3,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2020-12-13,2020-12-14,07387887,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
4,1,1995,1,2018-01-01,2018-12-31,0,16,4908567,658930651,2018-09-13,...,2020-12-13,2020-12-14,07387887,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6300823,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,2020-10-29,2020-11-05,06912972,1.0,102554301.0,176657311.0,1.0,16.96,H03AA01,116.667
6300824,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,2019-12-07,2019-12-07,06912972,1.0,102554301.0,176657311.0,1.0,16.96,H03AA01,116.667
6300825,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,2019-07-17,2019-07-18,11368832,15.0,147522015.0,408591171.0,1.0,12.76,A02BC02,28.000
6300826,1000,1970,1,2020-01-01,2020-12-31,0,8,13347761,058209611,2020-11-08,...,2018-06-03,2018-06-03,00938982,3.0,194276003.0,817162924.0,1.0,12.03,A02BC02,14.000


In [40]:
list(df_all_7)

['pid',
 'Year of birth',
 'gender',
 'insurance_from',
 'insurance_to',
 'death',
 'regional_code',
 'outpatient_caseID',
 'outpatient_case_practice_code',
 'outpatient_case_from',
 'outpatient_case_to',
 'outpatient_case_amout_due',
 'outpatient_case_year',
 'outpatient_case_quarter',
 'outpatient_fees_physician_code',
 'outpatient_fees_specialty_code',
 'outpatient_fees_billing_code',
 'outpatient_fees_quantity',
 'outpatient_fees_date',
 'outpatient_procedure_code',
 'outpatient_procedure_localisation',
 'outpatient_procedure_date',
 'outpatient_diagnosis',
 'outpatient_diagnosis_qualification',
 'outpatient_diagnosis_localisation',
 'drugs_pid',
 'drugs_date_of_prescription',
 'drugs_date_of_dispense',
 'drugs_pharma_central_number',
 'drugs_specialty_of_prescriber',
 'drugs_physican_code',
 'drugs_practice_code',
 'drugs_quantity',
 'drugs_amount_due',
 'drugs_atc',
 'drugs_ddd']

# Inpatient Cases

In [41]:
df_inpatient_cases = pd.read_csv(
    'wig2/testdata/test.inpatient_cases.csv',
    sep='\t',
    dtype={
        'pid': int,
        'caseID': int,
        'cause of admission': str,
        'cause of discharge': str,
        'outpatient treatment': 'Int64',
        'department admission': str,
        'department discharge': str
    },
    parse_dates=['date of admission', 'date of discharge']  # Convert date columns
)

# Add prefix and clean column names
df_inpatient_cases = df_inpatient_cases.add_prefix('inpatient_case_')
df_inpatient_cases.columns = df_inpatient_cases.columns.str.replace(' ', '_')
df_inpatient_cases.rename(columns={"inpatient_case_caseID": "inpatient_caseID"}, inplace=True)

df_inpatient_cases


Unnamed: 0,inpatient_case_pid,inpatient_caseID,inpatient_case_date_of_admission,inpatient_case_date_of_discharge,inpatient_case_cause_of_admission,inpatient_case_cause_of_discharge,inpatient_case_outpatient_treatment,inpatient_case_department_admission,inpatient_case_department_discharge
0,6,5556089,2019-06-14,2019-06-14,,,1,,
1,6,1568766,2019-08-30,2019-08-30,,,1,,
2,6,2714453,2019-03-30,2019-03-30,,,1,,
3,6,364774,2018-10-27,2018-10-27,,,1,,
4,6,11633323,2018-07-06,2018-07-07,0101,06,0,0100,0100
...,...,...,...,...,...,...,...,...,...
721,986,9750153,2019-11-21,2019-11-21,,,1,,
722,986,9117719,2019-10-25,2019-10-26,0107,01,0,0100,0100
723,987,6300871,2019-01-10,2019-01-13,0101,01,0,3400,3400
724,995,370981,2018-12-29,2018-12-29,,,1,,


## Merge
- 8 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs, inpatient_case

In [42]:
df_all_3_inpatient = pd.merge(df_all_2, df_inpatient_cases, left_on='pid', right_on='inpatient_case_pid', how='left')
df_all_3_inpatient

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,inpatient_case_pid,inpatient_caseID,inpatient_case_date_of_admission,inpatient_case_date_of_discharge,inpatient_case_cause_of_admission,inpatient_case_cause_of_discharge,inpatient_case_outpatient_treatment,inpatient_case_department_admission,inpatient_case_department_discharge
0,1,1995,1,2020-11-01,2020-12-31,0,16,,,NaT,NaT,,,,,
1,1,1995,1,2020-01-01,2020-10-31,0,16,,,NaT,NaT,,,,,
2,1,1995,1,2019-01-01,2019-12-31,0,16,,,NaT,NaT,,,,,
3,1,1995,1,2018-01-01,2018-12-31,0,16,,,NaT,NaT,,,,,
4,2,1990,1,2018-10-01,2018-12-31,0,14,,,NaT,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5760,999,1954,1,2020-01-01,2020-12-31,0,3,,,NaT,NaT,,,,,
5761,999,1954,1,2018-01-01,2018-12-31,0,3,,,NaT,NaT,,,,,
5762,1000,1970,1,2018-01-01,2018-12-31,0,8,,,NaT,NaT,,,,,
5763,1000,1970,1,2019-01-01,2019-12-31,0,8,,,NaT,NaT,,,,,


## Filter

In [43]:
df_all_3_inpatient = filter_by_timeframe(
    df_all_3_inpatient,
    insurance_from_col="insurance_from",
    insurance_to_col="insurance_to",
    case_from_col="inpatient_case_date_of_admission",
    case_to_col="inpatient_case_date_of_discharge"
)

df_all_3_inpatient.shape

(3798, 16)

In [44]:
# Dauert 1:34min, 2,3gb groß lol
# df_all_8.to_csv("df_all_8.csv", sep=',', index=True, encoding='utf-8')

## SQL-Try
- kernel crasht nicht nochmal machen lol

In [45]:
# from pandasql import sqldf

# df_inpatient_cases.rename(columns={"inpatient_case_pid": "pid"}, inplace=True)

# # Define a reusable function for running SQL queries
# run_query = lambda query: sqldf(query, globals())

# query = """
# SELECT *
# FROM df_all_7 AS a
# LEFT JOIN df_inpatient_cases AS ipc
# USING(pid)
# WHERE ipc.inpatient_case_date_of_admission >= a.insurance_from
#   AND ipc.inpatient_case_date_of_discharge <= a.insurance_to
# """

# df_all_8_sql = run_query(query)
# df_all_8_sql.shape

# Inpatient Diagnosis

In [46]:
df_inpatient_diagnosis = pd.read_csv(
    'wig2/testdata/test.inpatient_diagnosis.csv',
    sep='\t',
    dtype={
        'pid': int,
        'caseID': int,
        'diagnosis': str,
        'type of diagnosis': str,
        'is main diagnosis': 'Int64',
        'localisation': 'Int64'
    }
)

# Add prefix and clean column names
df_inpatient_diagnosis = df_inpatient_diagnosis.add_prefix('inpatient_diagnosis_')
df_inpatient_diagnosis.columns = df_inpatient_diagnosis.columns.str.replace(' ', '_')
df_inpatient_diagnosis.rename(columns={
    "inpatient_diagnosis_pid": "pid",
    "inpatient_diagnosis_caseID": "inpatient_caseID"
}, inplace=True)

df_inpatient_diagnosis


Unnamed: 0,pid,inpatient_caseID,inpatient_diagnosis_diagnosis,inpatient_diagnosis_type_of_diagnosis,inpatient_diagnosis_is_main_diagnosis,inpatient_diagnosis_Localisation
0,227,104770,Z019,02,1,
1,225,111066,M351,07,1,
2,225,111066,T887,08,0,
3,225,111066,R253,08,0,
4,225,111066,N028,08,0,
...,...,...,...,...,...,...
1481,936,12095459,M328,07,1,
1482,936,12095459,Z888,08,0,
1483,936,12095459,Z886,08,0,
1484,936,12095459,R768,08,0,


## Merge

In [47]:
# 9 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs, inpatient_case, inpatient_diagnosis
df_all_4_inpatient = pd.merge(df_all_3_inpatient, df_inpatient_diagnosis, on=['pid', 'inpatient_caseID'], how='left')
df_all_4_inpatient

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,inpatient_case_pid,inpatient_caseID,inpatient_case_date_of_admission,inpatient_case_date_of_discharge,inpatient_case_cause_of_admission,inpatient_case_cause_of_discharge,inpatient_case_outpatient_treatment,inpatient_case_department_admission,inpatient_case_department_discharge,inpatient_diagnosis_diagnosis,inpatient_diagnosis_type_of_diagnosis,inpatient_diagnosis_is_main_diagnosis,inpatient_diagnosis_Localisation
0,1,1995,1,2020-11-01,2020-12-31,0,16,,,NaT,NaT,,,,,,,,,
1,1,1995,1,2020-01-01,2020-10-31,0,16,,,NaT,NaT,,,,,,,,,
2,1,1995,1,2019-01-01,2019-12-31,0,16,,,NaT,NaT,,,,,,,,,
3,1,1995,1,2018-01-01,2018-12-31,0,16,,,NaT,NaT,,,,,,,,,
4,2,1990,1,2018-10-01,2018-12-31,0,14,,,NaT,NaT,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4549,995,1970,2,2020-06-06,2020-12-31,0,5,995.0,1564166.0,2020-07-23,2020-08-21,0101,01,0,2900,2900,F332,04,0,
4550,995,1970,2,2020-06-06,2020-12-31,0,5,995.0,1564166.0,2020-07-23,2020-08-21,0101,01,0,2900,2900,F332,11,0,
4551,995,1970,2,2020-06-06,2020-12-31,0,5,995.0,1564166.0,2020-07-23,2020-08-21,0101,01,0,2900,2900,F107,04,0,
4552,995,1970,2,2018-01-01,2018-12-31,0,5,995.0,370981.0,2018-12-29,2018-12-29,,,1,,,F331,02,1,


# Inpatient Fees

In [48]:
df_inpatient_fees = pd.read_csv(
    'wig2/testdata/test.inpatient_fees.csv',
    sep='\t',
    dtype={
        'pid': int,
        'caseID': int,
        'billing code': str,
        'amount due': 'float',
        'quantity': 'Int64'
    },
    parse_dates=['from', 'to']  # Convert date columns
)

# Add prefix and clean column names
df_inpatient_fees = df_inpatient_fees.add_prefix('inpatient_fees_')
df_inpatient_fees.columns = df_inpatient_fees.columns.str.replace(' ', '_')
df_inpatient_fees.rename(columns={
    "inpatient_fees_pid": "pid",
    "inpatient_fees_caseID": "inpatient_caseID"
}, inplace=True)

df_inpatient_fees


Unnamed: 0,pid,inpatient_caseID,inpatient_fees_from,inpatient_fees_to,inpatient_fees_billing_code,inpatient_fees_amount_due,inpatient_fees_quantity
0,542,1966155,2020-10-27,2020-10-27,21000000,299.99,1
1,867,6280672,2019-09-01,2019-09-01,21000010,4.55,1
2,867,6280672,2019-09-01,2019-09-01,21000000,103.30,1
3,987,6300871,2019-01-10,2019-01-12,75105002,84.14,1
4,987,6300871,2019-01-10,2019-01-12,7010J61C,2311.12,1
...,...,...,...,...,...,...,...
1341,343,11281843,2019-10-02,2019-10-02,75105002,136.66,1
1342,343,11281843,2019-10-02,2019-10-02,7310Z64D,-449.57,1
1343,343,11281843,2019-10-02,2019-10-02,7010Z64D,1321.87,1
1344,343,11281843,2019-10-02,2019-10-02,49120001,10.76,1


## Merge
- 10 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs, inpatient_case, inpatient_diagnosis, inpatient_fees

In [49]:
df_all_5_inpatient = pd.merge(df_all_4_inpatient, df_inpatient_fees, on=['pid', 'inpatient_caseID'], how='left')
df_all_5_inpatient.shape

(7081, 25)

## Filter

In [50]:
df_all_5_inpatient = filter_by_timeframe(
    df_all_5_inpatient,
    insurance_from_col="insurance_from",
    insurance_to_col="insurance_to",
    case_from_col="inpatient_fees_from",
    case_to_col="inpatient_fees_to"
)

df_all_5_inpatient.shape

(7078, 25)

# Inpatient Procedures

In [51]:
df_inpatient_procedures = pd.read_csv(
    'wig2/testdata/test.inpatient_procedures.csv',
    sep='\t',
    dtype={
        'pid': int,
        'caseID': int,
        'procedure code': str,
        'localisation': 'Int64'
    },
    parse_dates=['date of procedure']  # Convert the date column
)

# Add prefix and clean column names
df_inpatient_procedures = df_inpatient_procedures.add_prefix('inpatient_procedures_')
df_inpatient_procedures.columns = df_inpatient_procedures.columns.str.replace(' ', '_')
df_inpatient_procedures.rename(columns={
    "inpatient_procedures_pid": "pid",
    "inpatient_procedures_caseID": "inpatient_caseID",
    "inpatient_procedures_date_of_procedure": "inpatient_date_of_procedure"
}, inplace=True)
df_inpatient_procedures

Unnamed: 0,pid,inpatient_caseID,inpatient_procedures_procedure_code,inpatient_procedures_localisation,inpatient_date_of_procedure
0,225,111066,8-824,9,2018-01-27
1,225,111066,8-824,9,2018-01-28
2,225,111066,6-002.f3,9,2018-01-26
3,233,174381,8-810.w3,9,2019-01-19
4,233,174381,3-990,9,2019-01-20
...,...,...,...,...,...
339,619,11851394,1-465.0,2,2019-03-15
340,782,11962177,8-542.11,9,2019-10-18
341,936,12095459,9-411.04,9,2020-01-23
342,936,12095459,9-410.05,9,2020-01-25


## Merge
- 6 out of 11: insurance, insurance_data inpatient_case, inpatient_diagnosis, inpatient_fees, inpatient_procedures

In [52]:

df_all_6_inpatient = pd.merge(df_all_5_inpatient, df_inpatient_procedures, on=['pid', 'inpatient_caseID'], how='left')
df_all_6_inpatient

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,inpatient_case_pid,inpatient_caseID,inpatient_case_date_of_admission,...,inpatient_diagnosis_is_main_diagnosis,inpatient_diagnosis_Localisation,inpatient_fees_from,inpatient_fees_to,inpatient_fees_billing_code,inpatient_fees_amount_due,inpatient_fees_quantity,inpatient_procedures_procedure_code,inpatient_procedures_localisation,inpatient_date_of_procedure
0,1,1995,1,2020-11-01,2020-12-31,0,16,,,NaT,...,,,NaT,NaT,,,,,,NaT
1,1,1995,1,2020-01-01,2020-10-31,0,16,,,NaT,...,,,NaT,NaT,,,,,,NaT
2,1,1995,1,2019-01-01,2019-12-31,0,16,,,NaT,...,,,NaT,NaT,,,,,,NaT
3,1,1995,1,2018-01-01,2018-12-31,0,16,,,NaT,...,,,NaT,NaT,,,,,,NaT
4,2,1990,1,2018-10-01,2018-12-31,0,14,,,NaT,...,,,NaT,NaT,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9940,995,1970,2,2020-06-06,2020-12-31,0,5,995.0,1564166.0,2020-07-23,...,0,,2020-07-23,2020-08-21,A6200003,0.91,1,9-649.81,9,2020-08-06
9941,995,1970,2,2020-06-06,2020-12-31,0,5,995.0,1564166.0,2020-07-23,...,0,,2020-07-23,2020-08-21,A6200003,0.91,1,9-649.80,9,2020-08-13
9942,995,1970,2,2020-06-06,2020-12-31,0,5,995.0,1564166.0,2020-07-23,...,0,,2020-07-23,2020-08-21,A6200000,87.86,1,9-980.0,9,2020-07-23
9943,995,1970,2,2020-06-06,2020-12-31,0,5,995.0,1564166.0,2020-07-23,...,0,,2020-07-23,2020-08-21,A6200000,87.86,1,9-649.81,9,2020-08-06


## Filter

In [53]:
df_all_6_inpatient = filter_by_timeframe(
    df_all_6_inpatient,
    insurance_from_col="insurance_from",
    insurance_to_col="insurance_to",
    case_from_col="inpatient_date_of_procedure",
    case_to_col="inpatient_date_of_procedure"
)

df_all_6_inpatient.shape

(9945, 28)

In [None]:
list(df_all_6_inpatient)

['pid',
 'Year of birth',
 'gender',
 'insurance_from',
 'insurance_to',
 'death',
 'regional_code',
 'inpatient_case_pid',
 'inpatient_caseID',
 'inpatient_case_date_of_admission',
 'inpatient_case_date_of_discharge',
 'inpatient_case_cause_of_admission',
 'inpatient_case_cause_of_discharge',
 'inpatient_case_outpatient_treatment',
 'inpatient_case_department_admission',
 'inpatient_case_department_discharge',
 'inpatient_diagnosis_diagnosis',
 'inpatient_diagnosis_type_of_diagnosis',
 'inpatient_diagnosis_is_main_diagnosis',
 'inpatient_diagnosis_Localisation',
 'inpatient_fees_from',
 'inpatient_fees_to',
 'inpatient_fees_billing_code',
 'inpatient_fees_amount_due',
 'inpatient_fees_quantity',
 'inpatient_procedures_procedure_code',
 'inpatient_procedures_localisation',
 'inpatient_date_of_procedure']

- Remove columns that are duplicated in the outpatient dataframe as well from insurant + insurance data

In [None]:
df_all_6_inpatient.drop(['Year of birth', 'gender', 'insurance_from', 'insurance_to', 'death', 'regional_code'], axis=1, inplace=True)

KeyError: "['Year of birth', 'gender', 'insurance_from', 'insurance_to', 'death', 'regional_code'] not found in axis"

In [57]:
list(df_all_6_inpatient)

['pid',
 'inpatient_case_pid',
 'inpatient_caseID',
 'inpatient_case_date_of_admission',
 'inpatient_case_date_of_discharge',
 'inpatient_case_cause_of_admission',
 'inpatient_case_cause_of_discharge',
 'inpatient_case_outpatient_treatment',
 'inpatient_case_department_admission',
 'inpatient_case_department_discharge',
 'inpatient_diagnosis_diagnosis',
 'inpatient_diagnosis_type_of_diagnosis',
 'inpatient_diagnosis_is_main_diagnosis',
 'inpatient_diagnosis_Localisation',
 'inpatient_fees_from',
 'inpatient_fees_to',
 'inpatient_fees_billing_code',
 'inpatient_fees_amount_due',
 'inpatient_fees_quantity',
 'inpatient_procedures_procedure_code',
 'inpatient_procedures_localisation',
 'inpatient_date_of_procedure']

# Chunkwise Merge

In [None]:
# import pandas as pd

# # Function to merge dataframes in chunks
# # def merge_in_chunks(df1, df2, merge_key, columns_to_keep, chunk_size=10000):
# def merge_in_chunks(df1, df2, merge_key, chunk_size=10000):
#     """
#     Merges two dataframes chunkwise to handle large datasets.

#     Parameters:
#         df1 (pd.DataFrame): The first dataframe.
#         df2 (pd.DataFrame): The second dataframe.
#         merge_key (str): The key to merge on (e.g., 'pid').
#         columns_to_keep (list): List of columns to keep in the final dataframe (e.g., deduplicated columns).
#         chunk_size (int): Number of rows per chunk.

#     Returns:
#         pd.DataFrame: The merged dataframe.
#     """
#     merged_results = []

#     # Iterate through df1 in chunks
#     for start_row in range(0, len(df1), chunk_size):
#         end_row = start_row + chunk_size
#         chunk = df1.iloc[start_row:end_row]

#         # Merge the chunk with df2
#         merged_chunk = chunk.merge(
#             df2,
#             on=merge_key,
#             how="left",  # Adjust join type if needed
#         )
        
#         # Keep only the specified columns
#         # merged_chunk = merged_chunk[columns_to_keep]
        
#         # Append the merged chunk to the results
#         merged_results.append(merged_chunk)

#     # Concatenate all chunks together
#     final_merged_df = pd.concat(merged_results, ignore_index=True)
#     return final_merged_df


In [None]:
# Columns to keep in the final dataframe
# columns_to_keep = [
#     'pid', 'Year of birth', 'gender', 'insurance_from', 'insurance_to', 'death', 'regional_code',
#     'inpatient_caseID', 'inpatient_case_date_of_admission', 'inpatient_case_date_of_discharge',
#     'inpatient_case_cause_of_admission', 'inpatient_case_cause_of_discharge', 'inpatient_case_outpatient_treatment',
#     'inpatient_case_department_admission', 'inpatient_case_department_discharge', 'inpatient_diagnosis_diagnosis',
#     'inpatient_diagnosis_type_of_diagnosis', 'inpatient_diagnosis_is_main_diagnosis', 'inpatient_diagnosis_Localisation',
#     'inpatient_fees_from', 'inpatient_fees_to', 'inpatient_fees_billing_code', 'inpatient_fees_amount_due',
#     'inpatient_fees_quantity', 'inpatient_procedures_procedure_code', 'inpatient_procedures_localisation',
#     'inpatient_date_of_procedure', 'outpatient_caseID', 'outpatient_case_practice_code', 'outpatient_case_from',
#     'outpatient_case_to', 'outpatient_case_amout_due', 'outpatient_case_year', 'outpatient_case_quarter',
#     'outpatient_fees_physician_code', 'outpatient_fees_specialty_code', 'outpatient_fees_billing_code',
#     'outpatient_fees_quantity', 'outpatient_fees_date', 'outpatient_procedure_code', 'outpatient_procedure_localisation',
#     'outpatient_procedure_date', 'outpatient_diagnosis', 'outpatient_diagnosis_qualification',
#     'outpatient_diagnosis_localisation', 'drugs_date_of_prescription', 'drugs_date_of_dispense',
#     'drugs_pharma_central_number', 'drugs_specialty_of_prescriber', 'drugs_physican_code', 'drugs_practice_code',
#     'drugs_quantity', 'drugs_amount_due', 'drugs_atc', 'drugs_ddd'
# ]

# Perform the chunkwise merge
# final_merged_df = merge_in_chunks(df_all_6_inpatient, df_all_7, merge_key="pid", columns_to_keep=columns_to_keep)
# final_merged_df = merge_in_chunks(df_all_6_inpatient, df_all_7, merge_key="pid")

# Save or inspect the resulting dataframe
# print(final_merged_df.shape)

: 

In [None]:
import os
import dask.dataframe as dd

def merge_and_save_with_size_limit(
    df1, df2, merge_key, columns_to_keep, output_path, max_size_mb, chunk_size=100000
):
    """
    Merges two large dataframes with a size limit on the output file.

    Parameters:
        df1 (pd.DataFrame): The first dataframe.
        df2 (pd.DataFrame): The second dataframe.
        merge_key (str): The key to merge on.
        columns_to_keep (list): List of columns to keep in the final dataframe.
        output_path (str): Path to save the output CSV file.
        max_size_mb (int): Maximum allowed size of the output file in MB.
        chunk_size (int): Number of rows to process per chunk.

    Returns:
        None
    """
    # Convert pandas to Dask dataframes
    df1_dask = dd.from_pandas(df1, npartitions=10)
    df2_dask = dd.from_pandas(df2, npartitions=10)

    # Merge the Dask dataframes
    merged_dask = dd.merge(df1_dask, df2_dask, on=merge_key, how="left")
    merged_dask = merged_dask[columns_to_keep]

    # Write output in chunks and monitor file size
    with open(output_path, "w") as f:
        # Write the header for the CSV
        merged_dask.head(0).to_csv(f, index=False)

    # Write chunks
    for i, partition in enumerate(merged_dask.to_delayed()):
        if os.path.exists(output_path) and os.path.getsize(output_path) > max_size_mb * 1024 * 1024:
            print(f"Output file size exceeded {max_size_mb} MB. Stopping...")
            break

        # Convert partition to pandas and append it to the file
        partition_df = partition.compute()
        partition_df.to_csv(output_path, mode="a", header=False, index=False)

    print(f"Merge completed. File saved to {output_path}")

# Define parameters
output_path = "merged_output.csv"
max_size_mb = 50000  # Maximum file size in MB
columns_to_keep = [
    'pid', 'Year of birth', 'gender', 'insurance_from', 'insurance_to', 'death', 'regional_code',
    'inpatient_caseID', 'inpatient_case_date_of_admission', 'inpatient_case_date_of_discharge',
    'inpatient_case_cause_of_admission', 'inpatient_case_cause_of_discharge', 'inpatient_case_outpatient_treatment',
    'inpatient_case_department_admission', 'inpatient_case_department_discharge', 'inpatient_diagnosis_diagnosis',
    'inpatient_diagnosis_type_of_diagnosis', 'inpatient_diagnosis_is_main_diagnosis', 'inpatient_diagnosis_Localisation',
    'inpatient_fees_from', 'inpatient_fees_to', 'inpatient_fees_billing_code', 'inpatient_fees_amount_due',
    'inpatient_fees_quantity', 'inpatient_procedures_procedure_code', 'inpatient_procedures_localisation',
    'inpatient_date_of_procedure', 'outpatient_caseID', 'outpatient_case_practice_code', 'outpatient_case_from',
    'outpatient_case_to', 'outpatient_case_amout_due', 'outpatient_case_year', 'outpatient_case_quarter',
    'outpatient_fees_physician_code', 'outpatient_fees_specialty_code', 'outpatient_fees_billing_code',
    'outpatient_fees_quantity', 'outpatient_fees_date', 'outpatient_procedure_code', 'outpatient_procedure_localisation',
    'outpatient_procedure_date', 'outpatient_diagnosis', 'outpatient_diagnosis_qualification',
    'outpatient_diagnosis_localisation', 'drugs_date_of_prescription', 'drugs_date_of_dispense',
    'drugs_pharma_central_number', 'drugs_specialty_of_prescriber', 'drugs_physican_code', 'drugs_practice_code',
    'drugs_quantity', 'drugs_amount_due', 'drugs_atc', 'drugs_ddd'
]

# Run the function
merge_and_save_with_size_limit(df_all_6_inpatient, df_all_7, "pid", columns_to_keep, output_path, max_size_mb)
