In [5]:
import pandas as pd

In [6]:
import pwd

pwd

<module 'pwd' (built-in)>

In [7]:
df_drugs = pd.read_csv('wig2/testdata/test.drugs.csv', sep='\t')
df_insurants = pd.read_csv('wig2/testdata/test.insurants.csv', sep='\t')
df_insurance_data = pd.read_csv('wig2/testdata/test.insurance_data.csv', sep='\t')
df_inpatient_cases = pd.read_csv('wig2/testdata/test.inpatient_cases.csv', sep='\t')
df_inpatient_diagnosis = pd.read_csv('wig2/testdata/test.inpatient_diagnosis.csv', sep='\t')
df_inpatient_fees = pd.read_csv('wig2/testdata/test.inpatient_fees.csv', sep='\t')
df_inpatient_procedures = pd.read_csv('wig2/testdata/test.inpatient_procedures.csv', sep='\t')
df_outpatient_cases = pd.read_csv('wig2/testdata/test.outpatient_cases.csv', sep='\t')
df_outpatient_diagnosis = pd.read_csv('wig2/testdata/test.outpatient_diagnosis.csv', sep='\t')
df_outpatient_fees = pd.read_csv('wig2/testdata/test.outpatient_fees.csv', sep='\t')
df_outpatient_procedures = pd.read_csv('wig2/testdata/test.outpatient_procedures.csv', sep='\t')


In [8]:
def filter_by_timeframe(df, insurance_from_col, insurance_to_col, case_from_col, case_to_col):
    """
    Filters rows from the dataframe where the outpatient/inpatient case timeframe is outside the insurance timeframe.

    Parameters:
        df (pd.DataFrame): The dataframe to filter.
        insurance_from_col (str): Column name for the insurance 'from' date.
        insurance_to_col (str): Column name for the insurance 'to' date.
        case_from_col (str): Column name for the case 'from' date.
        case_to_col (str): Column name for the case 'to' date.

    Returns:
        pd.DataFrame: The filtered dataframe.
    """
    # Ensure columns are datetime for comparison
    df[insurance_from_col] = pd.to_datetime(df[insurance_from_col])
    df[insurance_to_col] = pd.to_datetime(df[insurance_to_col])
    df[case_from_col] = pd.to_datetime(df[case_from_col])
    df[case_to_col] = pd.to_datetime(df[case_to_col])
    
    # Apply filter condition
    filtered_df = df[
        (df[case_from_col] >= df[insurance_from_col]) & 
        (df[case_to_col] <= df[insurance_to_col])
    ]
    return filtered_df

In [9]:
# Start with `df_insurants` as the base
df_universal = df_insurants
# Incremental joins
df_universal = pd.merge(df_universal, df_insurance_data, on='pid', how='left')

Insurants-Rows: 1000
Insurance_Data-Rows: 3818
After merging both:

In [10]:
df_universal.shape

(3851, 7)

In [11]:
# 2 out of 11: insurance, insurance_data
df_universal.rename(columns={"from": "insurance_from", "to": "insurance_to"}, inplace=True)

In [12]:
df_outpatient_cases.rename(columns={"caseID": "outpatient_caseID","practice code": "outpatient_case_practice_code","from": "outpatient_case_from", 
                                    "to": "outpatient_case_to", "amount due": "outpatient_case_amout_due", "year": "outpatient_case_year",
                                    "quarter": "outpatient_case_quarter"}, inplace=True)
df_outpatient_cases

Unnamed: 0,pid,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter
0,72,76264259,720171600.0,2020-05-08,2020-05-08,27.381551,2020,2
1,763,21097556,823839918.0,2020-03-06,2020-03-06,45.813490,2020,1
2,189,166284976,786077570.0,2018-05-02,2018-06-20,178.931144,2018,2
3,782,154749062,105171228.0,2020-10-01,2020-12-05,61.119695,2020,4
4,254,147204667,514903431.0,2019-07-24,2019-07-24,71.476000,2019,3
...,...,...,...,...,...,...,...,...
14373,275,156219026,922806199.0,2018-10-07,2018-11-30,135.598197,2018,4
14374,98,70030426,358043182.0,2019-10-24,2019-10-24,30.537000,2019,4
14375,319,159568236,68123842.0,2019-07-31,2019-07-31,110.799100,2019,3
14376,405,197675847,750434802.0,2020-03-05,2020-03-05,39.314367,2020,1


In [13]:
# 3 out of 11: insurance, insurance_data, outpatient_cases
df_universal = pd.merge(df_universal, df_outpatient_cases, on='pid', how='left')

In [14]:
df_universal.shape

(54814, 14)

In [15]:
# Show the result of the cartesian product
df_universal.loc[df_universal['pid'] == 1]

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter
0,1,1995,1,2020-11-01,2020-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0
1,1,1995,1,2020-01-01,2020-10-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0
2,1,1995,1,2019-01-01,2019-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0
3,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0


In [16]:
df_universal = filter_by_timeframe(
    df_universal,
    insurance_from_col="insurance_from",
    insurance_to_col="insurance_to",
    case_from_col="outpatient_case_from",
    case_to_col="outpatient_case_to"
)

# Check the filtered dataframe
print(df_universal.shape)

(14159, 14)


In [17]:
df_universal.loc[df_universal['pid'] == 1]

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter
3,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0


In [18]:
df_outpatient_fees = pd.read_csv('wig2/testdata/test.outpatient_fees.csv', sep='\t')
df_outpatient_fees.rename(columns={"caseID": "outpatient_caseID","physican code": "outpatient_fees_physician_code", "specialty code": "outpatient_fees_specialty_code",
                                   "billing code": "outpatient_fees_billing_code", "quantity": "outpatient_fees_quantity", "date": "outpatient_fees_date"}, inplace=True)
df_outpatient_fees

Unnamed: 0,pid,outpatient_caseID,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
0,63,374239,337800201.0,1.0,03061,1.0,2019-09-21
1,63,374239,337800201.0,1.0,32001,1.0,2019-09-21
2,63,374239,337800201.0,1.0,03040,1.0,2019-09-21
3,63,374239,337800201.0,1.0,03230,1.0,2019-09-21
4,63,374239,337800201.0,1.0,03060,1.0,2019-09-21
...,...,...,...,...,...,...,...
68298,172,1052546836,,,H0000,1.0,2018-09-22
68299,172,1052546836,,,H0000,1.0,2018-07-07
68300,172,1052546836,,,HP2,1.0,2018-07-06
68301,172,1052546836,,,HP3,1.0,2018-07-06


In [19]:
# 4 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees
df_universal = pd.merge(df_universal, df_outpatient_fees, on=['pid', 'outpatient_caseID'], how='left')

In [20]:
df_universal

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
0,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06222,1.0,2018-09-13
1,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06220,1.0,2018-09-13
2,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06225E,1.0,2018-09-13
3,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06220T,1.0,2018-09-13
4,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06225T,1.0,2018-09-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67221,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,2020-11-08,47.837807,2020.0,4.0,636052610.0,10.0,18220,1.0,2020-11-08
67222,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,2020-11-08,47.837807,2020.0,4.0,636052610.0,10.0,32001,1.0,2020-11-08
67223,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,2020-11-08,47.837807,2020.0,4.0,636052610.0,10.0,18222,1.0,2020-11-08
67224,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,2020-11-08,47.837807,2020.0,4.0,636052610.0,10.0,30201,1.0,2020-11-08


In [21]:
df_outpatient_fees.loc[df_outpatient_fees['pid'] == 1]

Unnamed: 0,pid,outpatient_caseID,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
38058,1,4908567,610229105.0,5.0,06222,1.0,2018-09-13
38059,1,4908567,610229105.0,5.0,06220,1.0,2018-09-13
38060,1,4908567,610229105.0,5.0,06225E,1.0,2018-09-13
38061,1,4908567,610229105.0,5.0,06220T,1.0,2018-09-13
38062,1,4908567,610229105.0,5.0,06225T,1.0,2018-09-13


In [22]:
df_universal.loc[df_universal['pid'] == 1]

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,outpatient_case_to,outpatient_case_amout_due,outpatient_case_year,outpatient_case_quarter,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date
0,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06222,1.0,2018-09-13
1,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06220,1.0,2018-09-13
2,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06225E,1.0,2018-09-13
3,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06220T,1.0,2018-09-13
4,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,2018-09-13,30.075665,2018.0,3.0,610229105.0,5.0,06225T,1.0,2018-09-13


In [23]:
df_outpatient_procedures = pd.read_csv('wig2/testdata/test.outpatient_procedures.csv', sep='\t')
df_outpatient_procedures.rename(columns={"caseID": "outpatient_caseID", "procedure code": "outpatient_procedure_code", "localisation": "outpatient_procedure_localisation",
                                         "date of procedure": "outpatient_procedure_date"}, inplace=True)
df_outpatient_procedures

Unnamed: 0,pid,outpatient_caseID,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date
0,527,288793,5-142.2,1.0,2021-07-19
1,527,288793,5-142.2,2.0,2021-07-19
2,551,3765687,5-156.9,2.0,2020-10-21
3,393,4108489,5-385.70,1.0,2020-02-16
4,232,4654625,1-650.2,,2020-01-05
...,...,...,...,...,...
114,551,200377624,5-156.9,1.0,2021-06-14
115,551,200377624,5-156.9,2.0,2021-06-14
116,551,200804681,5-156.9,2.0,2020-02-10
117,351,202453658,5-780.6w,2.0,2020-10-06


In [24]:
df_outpatient_procedures.loc[df_outpatient_procedures['pid'] == 527]

Unnamed: 0,pid,outpatient_caseID,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date
0,527,288793,5-142.2,1.0,2021-07-19
1,527,288793,5-142.2,2.0,2021-07-19


In [25]:
# 5 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure
df_universal = pd.merge(df_universal, df_outpatient_procedures, on=['pid', 'outpatient_caseID'], how='left')
df_universal

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,outpatient_case_year,outpatient_case_quarter,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date
0,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2018.0,3.0,610229105.0,5.0,06222,1.0,2018-09-13,,,
1,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2018.0,3.0,610229105.0,5.0,06220,1.0,2018-09-13,,,
2,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2018.0,3.0,610229105.0,5.0,06225E,1.0,2018-09-13,,,
3,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2018.0,3.0,610229105.0,5.0,06220T,1.0,2018-09-13,,,
4,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2018.0,3.0,610229105.0,5.0,06225T,1.0,2018-09-13,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67314,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,2020.0,4.0,636052610.0,10.0,18220,1.0,2020-11-08,,,
67315,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,2020.0,4.0,636052610.0,10.0,32001,1.0,2020-11-08,,,
67316,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,2020.0,4.0,636052610.0,10.0,18222,1.0,2020-11-08,,,
67317,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,2020.0,4.0,636052610.0,10.0,30201,1.0,2020-11-08,,,


In [26]:
# Example for insurant with pid= 527 we already have 160 rows
df_universal.loc[df_universal['pid'] == 527]

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,outpatient_case_year,outpatient_case_quarter,outpatient_fees_physician_code,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date
35824,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,2018.0,2.0,248584703.0,3.0,03230,1.0,2018-05-09,,,
35825,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,2018.0,2.0,248584703.0,3.0,03230,1.0,2018-04-25,,,
35826,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,2018.0,2.0,248584703.0,3.0,03362,1.0,2018-04-25,,,
35827,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,2018.0,2.0,248584703.0,3.0,03221,1.0,2018-04-25,,,
35828,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,2018.0,2.0,248584703.0,3.0,03230,1.0,2018-04-01,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35969,527,1944,1,2019-01-01,2019-12-31,0.0,14.0,169078112.0,830095166.0,2019-09-26,...,2019.0,3.0,264712410.0,10.0,30420A,1.0,2019-09-26,,,
35970,527,1944,1,2019-01-01,2019-12-31,0.0,14.0,169078112.0,830095166.0,2019-09-26,...,2019.0,3.0,264712410.0,10.0,30201,1.0,2019-09-26,,,
35971,527,1944,1,2019-01-01,2019-12-31,0.0,14.0,169078112.0,830095166.0,2019-09-26,...,2019.0,3.0,264712410.0,10.0,18227,1.0,2019-09-26,,,
35972,527,1944,1,2019-01-01,2019-12-31,0.0,14.0,169078112.0,830095166.0,2019-09-26,...,2019.0,3.0,264712410.0,10.0,18212,1.0,2019-09-26,,,


In [27]:
df_outpatient_diagnosis = pd.read_csv('wig2/testdata/test.outpatient_diagnosis.csv', sep='\t')
df_outpatient_diagnosis.rename(columns={"caseID": "outpatient_caseID", "diagnosis": "outpatient_diagnosis", "qualification": "outpatient_diagnosis_qualification",
                                "localisation": "outpatient_diagnosis_localisation"}, inplace=True)

In [28]:
# Check example diagnosis of insurant with pid=1
df_outpatient_diagnosis.loc[df_outpatient_diagnosis['pid'] == 1]

Unnamed: 0,pid,outpatient_caseID,outpatient_diagnosis,outpatient_diagnosis_qualification,outpatient_diagnosis_localisation
7208,1,4908567,M350,G,3.0
7209,1,4908567,H522,G,3.0
7210,1,4908567,H521,G,3.0
7211,1,4908567,H311,G,3.0
7212,1,4908567,H193,G,
7213,1,4908567,H041,G,3.0


In [29]:
# 6 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis
df_universal_6 = pd.merge(df_universal, df_outpatient_diagnosis, on=['pid', 'outpatient_caseID'], how='left')
df_universal_6

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date,outpatient_diagnosis,outpatient_diagnosis_qualification,outpatient_diagnosis_localisation
0,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,5.0,06222,1.0,2018-09-13,,,,M350,G,3.0
1,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,5.0,06222,1.0,2018-09-13,,,,H522,G,3.0
2,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,5.0,06222,1.0,2018-09-13,,,,H521,G,3.0
3,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,5.0,06222,1.0,2018-09-13,,,,H311,G,3.0
4,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,5.0,06222,1.0,2018-09-13,,,,H193,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417755,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,10.0,18211,1.0,2020-11-08,,,,Q667,G,3.0
417756,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,10.0,18211,1.0,2020-11-08,,,,M9983,G,
417757,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,10.0,18211,1.0,2020-11-08,,,,M773,G,3.0
417758,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,10.0,18211,1.0,2020-11-08,,,,M5416,G,


In [30]:
# Example for insurant with pid= 527 we already have 1022 rows, after joining 6 tables
df_universal_6.loc[df_universal_6['pid'] == 527]

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,outpatient_fees_specialty_code,outpatient_fees_billing_code,outpatient_fees_quantity,outpatient_fees_date,outpatient_procedure_code,outpatient_procedure_localisation,outpatient_procedure_date,outpatient_diagnosis,outpatient_diagnosis_qualification,outpatient_diagnosis_localisation
221100,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,3.0,03230,1.0,2018-05-09,,,,Z907,G,
221101,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,3.0,03230,1.0,2018-05-09,,,,U5000,G,
221102,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,3.0,03230,1.0,2018-05-09,,,,R54,G,
221103,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,3.0,03230,1.0,2018-05-09,,,,M939,G,
221104,527,1944,1,2018-01-01,2018-12-31,0.0,14.0,50626975.0,843745567.0,2018-04-01,...,3.0,03230,1.0,2018-05-09,,,,M8199,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222117,527,1944,1,2019-01-01,2019-12-31,0.0,14.0,169078112.0,830095166.0,2019-09-26,...,10.0,32001,1.0,2019-09-26,,,,M4292,G,
222118,527,1944,1,2019-01-01,2019-12-31,0.0,14.0,169078112.0,830095166.0,2019-09-26,...,10.0,32001,1.0,2019-09-26,,,,M4212,G,
222119,527,1944,1,2019-01-01,2019-12-31,0.0,14.0,169078112.0,830095166.0,2019-09-26,...,10.0,32001,1.0,2019-09-26,,,,M0608,G,
222120,527,1944,1,2019-01-01,2019-12-31,0.0,14.0,169078112.0,830095166.0,2019-09-26,...,10.0,32001,1.0,2019-09-26,,,,L930,G,


In [159]:
df_universal_6.to_csv("df_outpatient_6.csv", sep=',', index=True, encoding='utf-8')

In [None]:

df_universal = pd.merge(df_universal, df_outpatient_diagnosis, on=['pid', 'caseID'], how='left')

df_universal = pd.merge(df_universal, df_inpatient_cases, on='pid', how='left')
df_universal = pd.merge(df_universal, df_inpatient_diagnosis, on=['pid', 'caseID'], how='left')
df_universal = pd.merge(df_universal, df_inpatient_fees, on=['pid', 'caseID'], how='left')
df_universal = pd.merge(df_universal, df_inpatient_procedures, on=['pid', 'caseID'], how='left')
df_universal = pd.merge(df_universal, df_drugs, on='pid', how='left')

In [31]:
a = df_outpatient_procedures['pid'].unique()
print(sorted(a))

[np.int64(10), np.int64(12), np.int64(15), np.int64(66), np.int64(76), np.int64(84), np.int64(88), np.int64(90), np.int64(91), np.int64(96), np.int64(112), np.int64(115), np.int64(128), np.int64(135), np.int64(166), np.int64(179), np.int64(194), np.int64(227), np.int64(232), np.int64(239), np.int64(261), np.int64(268), np.int64(315), np.int64(316), np.int64(320), np.int64(322), np.int64(327), np.int64(328), np.int64(330), np.int64(338), np.int64(351), np.int64(353), np.int64(390), np.int64(393), np.int64(396), np.int64(399), np.int64(412), np.int64(420), np.int64(428), np.int64(483), np.int64(517), np.int64(519), np.int64(521), np.int64(527), np.int64(541), np.int64(548), np.int64(551), np.int64(575), np.int64(586), np.int64(592), np.int64(596), np.int64(597), np.int64(634), np.int64(635), np.int64(648), np.int64(675), np.int64(696), np.int64(697), np.int64(716), np.int64(719), np.int64(731), np.int64(763), np.int64(776), np.int64(779), np.int64(807), np.int64(809), np.int64(842), np.i

In [32]:
df_drugs

Unnamed: 0,pid,date of prescription,date of dispense,pharma central number,specialty of prescriber,physican code,practice code,quantity,amount due,atc,ddd
0,542,2020-04-24,2020-04-26,6444040,52.0,11441752.0,462545519.0,2.0,28.30,N02BB02,8.333
1,35,2019-06-21,2019-06-23,3507952,15.0,100231815.0,118275942.0,1.0,16.27,N02BB02,16.667
2,590,2020-11-27,2020-11-27,3507952,1.0,86391701.0,675905294.0,1.0,16.27,N02BB02,16.667
3,345,2020-02-01,2020-02-02,2754708,1.0,650812501.0,861471675.0,1.0,14.54,H03AA01,50.000
4,345,2020-04-19,2020-04-23,2754708,1.0,650812501.0,861471675.0,1.0,14.54,H03AA01,50.000
...,...,...,...,...,...,...,...,...,...,...,...
9394,384,2018-12-15,2018-12-20,4596863,0.0,621898700.0,14632358.0,1.0,31.56,P01BA02,30.039
9395,814,2018-12-14,2018-12-20,4596863,31.0,768128631.0,794650148.0,1.0,31.56,P01BA02,30.039
9396,581,2018-04-07,2018-04-07,4596863,1.0,363816801.0,893595739.0,1.0,31.56,P01BA02,30.039
9397,581,2018-07-08,2018-07-18,4596863,1.0,363816801.0,893595739.0,1.0,31.56,P01BA02,30.039


In [34]:
df_drugs = df_drugs.add_prefix('drugs_')
df_drugs.columns = df_drugs.columns.str.replace(' ', '_')

In [35]:
df_drugs

Unnamed: 0,drugs_pid,drugs_date_of_prescription,drugs_date_of_dispense,drugs_pharma_central_number,drugs_specialty_of_prescriber,drugs_physican_code,drugs_practice_code,drugs_quantity,drugs_amount_due,drugs_atc,drugs_ddd
0,542,2020-04-24,2020-04-26,6444040,52.0,11441752.0,462545519.0,2.0,28.30,N02BB02,8.333
1,35,2019-06-21,2019-06-23,3507952,15.0,100231815.0,118275942.0,1.0,16.27,N02BB02,16.667
2,590,2020-11-27,2020-11-27,3507952,1.0,86391701.0,675905294.0,1.0,16.27,N02BB02,16.667
3,345,2020-02-01,2020-02-02,2754708,1.0,650812501.0,861471675.0,1.0,14.54,H03AA01,50.000
4,345,2020-04-19,2020-04-23,2754708,1.0,650812501.0,861471675.0,1.0,14.54,H03AA01,50.000
...,...,...,...,...,...,...,...,...,...,...,...
9394,384,2018-12-15,2018-12-20,4596863,0.0,621898700.0,14632358.0,1.0,31.56,P01BA02,30.039
9395,814,2018-12-14,2018-12-20,4596863,31.0,768128631.0,794650148.0,1.0,31.56,P01BA02,30.039
9396,581,2018-04-07,2018-04-07,4596863,1.0,363816801.0,893595739.0,1.0,31.56,P01BA02,30.039
9397,581,2018-07-08,2018-07-18,4596863,1.0,363816801.0,893595739.0,1.0,31.56,P01BA02,30.039


In [36]:
# 7 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs
df_universal_7 = pd.merge(df_universal_6, df_drugs, left_on='pid', right_on='drugs_pid', how='left')
df_universal_7

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,drugs_date_of_prescription,drugs_date_of_dispense,drugs_pharma_central_number,drugs_specialty_of_prescriber,drugs_physican_code,drugs_practice_code,drugs_quantity,drugs_amount_due,drugs_atc,drugs_ddd
0,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2020-12-13,2020-12-14,7387887.0,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
1,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2020-12-13,2020-12-14,7387887.0,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
2,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2020-12-13,2020-12-14,7387887.0,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
3,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2020-12-13,2020-12-14,7387887.0,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
4,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,2020-12-13,2020-12-14,7387887.0,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6352857,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,2020-10-29,2020-11-05,6912972.0,1.0,102554301.0,176657311.0,1.0,16.96,H03AA01,116.667
6352858,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,2019-12-07,2019-12-07,6912972.0,1.0,102554301.0,176657311.0,1.0,16.96,H03AA01,116.667
6352859,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,2019-07-17,2019-07-18,11368832.0,15.0,147522015.0,408591171.0,1.0,12.76,A02BC02,28.000
6352860,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,2018-06-03,2018-06-03,938982.0,3.0,194276003.0,817162924.0,1.0,12.03,A02BC02,14.000


In [39]:
df_inpatient_cases = pd.read_csv('wig2/testdata/test.inpatient_cases.csv', sep='\t')
df_inpatient_cases = df_inpatient_cases.add_prefix('inpatient_case_')
df_inpatient_cases.columns = df_inpatient_cases.columns.str.replace(' ', '_')
df_inpatient_cases.rename(columns={"inpatient_case_caseID": "inpatient_caseID"}, inplace=True)
df_inpatient_cases

Unnamed: 0,inpatient_case_pid,inpatient_caseID,inpatient_case_date_of_admission,inpatient_case_date_of_discharge,inpatient_case_cause_of_admission,inpatient_case_cause_of_discharge,inpatient_case_outpatient_treatment,inpatient_case_department_admission,inpatient_case_department_discharge
0,6,5556089,2019-06-14,2019-06-14,,,1,,
1,6,1568766,2019-08-30,2019-08-30,,,1,,
2,6,2714453,2019-03-30,2019-03-30,,,1,,
3,6,364774,2018-10-27,2018-10-27,,,1,,
4,6,11633323,2018-07-06,2018-07-07,101.0,6.0,0,100.0,100.0
...,...,...,...,...,...,...,...,...,...
721,986,9750153,2019-11-21,2019-11-21,,,1,,
722,986,9117719,2019-10-25,2019-10-26,107.0,1.0,0,100.0,100.0
723,987,6300871,2019-01-10,2019-01-13,101.0,1.0,0,3400.0,3400.0
724,995,370981,2018-12-29,2018-12-29,,,1,,


In [40]:
# 8 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs, inpatient_case
df_universal_8 = pd.merge(df_universal_7, df_inpatient_cases, left_on='pid', right_on='inpatient_case_pid', how='left')
df_universal_8

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,drugs_ddd,inpatient_case_pid,inpatient_caseID,inpatient_case_date_of_admission,inpatient_case_date_of_discharge,inpatient_case_cause_of_admission,inpatient_case_cause_of_discharge,inpatient_case_outpatient_treatment,inpatient_case_department_admission,inpatient_case_department_discharge
0,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,3.333,,,,,,,,,
1,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,3.333,,,,,,,,,
2,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,3.333,,,,,,,,,
3,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,3.333,,,,,,,,,
4,1,1995,1,2018-01-01,2018-12-31,0.0,16.0,4908567.0,658930651.0,2018-09-13,...,3.333,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18080870,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,116.667,,,,,,,,,
18080871,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,116.667,,,,,,,,,
18080872,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,28.000,,,,,,,,,
18080873,1000,1970,1,2020-01-01,2020-12-31,0.0,8.0,13347761.0,58209611.0,2020-11-08,...,14.000,,,,,,,,,


In [41]:
df_universal_8 = filter_by_timeframe(
    df_universal_8,
    insurance_from_col="insurance_from",
    insurance_to_col="insurance_to",
    case_from_col="inpatient_case_date_of_admission",
    case_to_col="inpatient_case_date_of_discharge"
)

df_universal_8

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,drugs_ddd,inpatient_case_pid,inpatient_caseID,inpatient_case_date_of_admission,inpatient_case_date_of_discharge,inpatient_case_cause_of_admission,inpatient_case_cause_of_discharge,inpatient_case_outpatient_treatment,inpatient_case_department_admission,inpatient_case_department_discharge
7131,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,50.000,6.0,5556089.0,2019-06-14,2019-06-14,,,1.0,,
7132,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,50.000,6.0,1568766.0,2019-08-30,2019-08-30,,,1.0,,
7133,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,50.000,6.0,2714453.0,2019-03-30,2019-03-30,,,1.0,,
7136,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,50.000,6.0,10361408.0,2019-10-05,2019-10-05,,,1.0,,
7138,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,30.039,6.0,5556089.0,2019-06-14,2019-06-14,,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18050319,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,50.000,987.0,6300871.0,2019-01-10,2019-01-13,101.0,1.0,0.0,3400.0,3400.0
18050320,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,200.000,987.0,6300871.0,2019-01-10,2019-01-13,101.0,1.0,0.0,3400.0,3400.0
18050321,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,30.039,987.0,6300871.0,2019-01-10,2019-01-13,101.0,1.0,0.0,3400.0,3400.0
18050322,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,30.039,987.0,6300871.0,2019-01-10,2019-01-13,101.0,1.0,0.0,3400.0,3400.0


In [45]:
df_inpatient_diagnosis = pd.read_csv('wig2/testdata/test.inpatient_diagnosis.csv', sep='\t')
df_inpatient_diagnosis = df_inpatient_diagnosis.add_prefix('inpatient_diagnosis_')
df_inpatient_diagnosis.columns = df_inpatient_diagnosis.columns.str.replace(' ', '_')
df_inpatient_diagnosis.rename(columns={"inpatient_diagnosis_pid": "pid", "inpatient_diagnosis_caseID": "inpatient_caseID"}, inplace=True)
df_inpatient_diagnosis

Unnamed: 0,pid,inpatient_caseID,inpatient_diagnosis_diagnosis,inpatient_diagnosis_type_of_diagnosis,inpatient_diagnosis_is_main_diagnosis,inpatient_diagnosis_Localisation
0,227,104770,Z019,2,1,
1,225,111066,M351,7,1,
2,225,111066,T887,8,0,
3,225,111066,R253,8,0,
4,225,111066,N028,8,0,
...,...,...,...,...,...,...
1481,936,12095459,M328,7,1,
1482,936,12095459,Z888,8,0,
1483,936,12095459,Z886,8,0,
1484,936,12095459,R768,8,0,


In [46]:
# 9 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs, inpatient_case, inpatient_diagnosis
df_universal_9 = pd.merge(df_universal_8, df_inpatient_diagnosis, on=['pid', 'inpatient_caseID'], how='left')
df_universal_9

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,inpatient_case_date_of_discharge,inpatient_case_cause_of_admission,inpatient_case_cause_of_discharge,inpatient_case_outpatient_treatment,inpatient_case_department_admission,inpatient_case_department_discharge,inpatient_diagnosis_diagnosis,inpatient_diagnosis_type_of_diagnosis,inpatient_diagnosis_is_main_diagnosis,inpatient_diagnosis_Localisation
0,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,2019-06-14,,,1.0,,,Z019,2,1,
1,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,2019-08-30,,,1.0,,,Z019,2,1,
2,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,2019-03-30,,,1.0,,,Z019,2,1,
3,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,2019-10-05,,,1.0,,,M328,2,1,
4,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,2019-10-05,,,1.0,,,Z34,2,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7998468,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,2019-01-13,101.0,1.0,0.0,3400.0,3400.0,L932,7,1,
7998469,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,2019-01-13,101.0,1.0,0.0,3400.0,3400.0,N390,8,0,
7998470,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,2019-01-13,101.0,1.0,0.0,3400.0,3400.0,L932,4,0,
7998471,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,2019-01-13,101.0,1.0,0.0,3400.0,3400.0,L932,11,0,


In [47]:
df_inpatient_fees = pd.read_csv('wig2/testdata/test.inpatient_fees.csv', sep='\t')
df_inpatient_fees = df_inpatient_fees.add_prefix('inpatient_fees_')
df_inpatient_fees.columns = df_inpatient_fees.columns.str.replace(' ', '_')
df_inpatient_fees.rename(columns={"inpatient_fees_pid": "pid", "inpatient_fees_caseID": "inpatient_caseID"}, inplace=True)
df_inpatient_fees

Unnamed: 0,pid,inpatient_caseID,inpatient_fees_from,inpatient_fees_to,inpatient_fees_billing_code,inpatient_fees_amount_due,inpatient_fees_quantity
0,542,1966155,2020-10-27,2020-10-27,21000000,299.99,1.0
1,867,6280672,2019-09-01,2019-09-01,21000010,4.55,1.0
2,867,6280672,2019-09-01,2019-09-01,21000000,103.30,1.0
3,987,6300871,2019-01-10,2019-01-12,75105002,84.14,1.0
4,987,6300871,2019-01-10,2019-01-12,7010J61C,2311.12,1.0
...,...,...,...,...,...,...,...
1341,343,11281843,2019-10-02,2019-10-02,75105002,136.66,1.0
1342,343,11281843,2019-10-02,2019-10-02,7310Z64D,-449.57,1.0
1343,343,11281843,2019-10-02,2019-10-02,7010Z64D,1321.87,1.0
1344,343,11281843,2019-10-02,2019-10-02,49120001,10.76,1.0


In [48]:
# 10 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs, inpatient_case, inpatient_diagnosis, inpatient_fees
df_universal_10 = pd.merge(df_universal_9, df_inpatient_fees, on=['pid', 'inpatient_caseID'], how='left')
df_universal_10

Unnamed: 0,pid,Year of birth,gender,insurance_from,insurance_to,death,regional_code,outpatient_caseID,outpatient_case_practice_code,outpatient_case_from,...,inpatient_case_department_discharge,inpatient_diagnosis_diagnosis,inpatient_diagnosis_type_of_diagnosis,inpatient_diagnosis_is_main_diagnosis,inpatient_diagnosis_Localisation,inpatient_fees_from,inpatient_fees_to,inpatient_fees_billing_code,inpatient_fees_amount_due,inpatient_fees_quantity
0,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,,Z019,2,1,,2019-06-14,2019-06-14,25101101,203.29,1.0
1,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,,Z019,2,1,,2019-06-14,2019-06-14,21000010,4.72,1.0
2,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,,Z019,2,1,,2019-08-30,2019-08-30,25101101,203.29,1.0
3,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,,Z019,2,1,,2019-08-30,2019-08-30,21000010,4.72,1.0
4,6,1979,1,2019-01-01,2019-12-18,0.0,8.0,205185163.0,712978181.0,2019-04-10,...,,Z019,2,1,,2019-03-30,2019-03-30,25101101,203.29,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18885522,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,3400.0,L718,8,0,,2019-01-10,2019-01-12,75105002,84.14,1.0
18885523,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,3400.0,L718,8,0,,2019-01-10,2019-01-12,7010J61C,2311.12,1.0
18885524,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,3400.0,L718,8,0,,2019-01-10,2019-01-12,7010J61C,-1651.74,1.0
18885525,987,1978,1,2019-01-01,2019-12-31,0.0,5.0,94755127.0,570365619.0,2019-08-18,...,3400.0,L718,8,0,,2019-01-10,2019-01-12,48000001,1.30,1.0


In [51]:
df_inpatient_procedures = pd.read_csv('wig2/testdata/test.inpatient_procedures.csv', sep='\t')
df_inpatient_procedures = df_inpatient_procedures.add_prefix('inpatient_procedures_')
df_inpatient_procedures.columns = df_inpatient_procedures.columns.str.replace(' ', '_')
df_inpatient_procedures.rename(columns={"inpatient_procedures_pid": "pid", "inpatient_procedures_caseID": "inpatient_caseID",
                                        "inpatient_procedures_date_of_procedure": "inpatient_date_of_procedure"}, inplace=True)
df_inpatient_procedures

Unnamed: 0,pid,inpatient_caseID,inpatient_procedures_procedure_code,inpatient_procedures_localisation,inpatient_date_of_procedure
0,225,111066,8-824,9,2018-01-27
1,225,111066,8-824,9,2018-01-28
2,225,111066,6-002.f3,9,2018-01-26
3,233,174381,8-810.w3,9,2019-01-19
4,233,174381,3-990,9,2019-01-20
...,...,...,...,...,...
339,619,11851394,1-465.0,2,2019-03-15
340,782,11962177,8-542.11,9,2019-10-18
341,936,12095459,9-411.04,9,2020-01-23
342,936,12095459,9-410.05,9,2020-01-25


In [52]:
# 11 out of 11: insurance, insurance_data, outpatient_cases, outpatient_fees, outpatient_procedure, outpatient_diagnosis, drugs, inpatient_case, inpatient_diagnosis,
# inpatient_fees, inpatient_procedures
df_universal_11 = pd.merge(df_universal_10, df_inpatient_procedures, on=['pid', 'inpatient_caseID'], how='left')
df_universal_11

: 

In [109]:
df_drugs.loc[df_drugs['pid'] == 1]


Unnamed: 0,pid,date of prescription,date of dispense,pharma central number,specialty of prescriber,physican code,practice code,quantity,amount due,atc,ddd
2002,1,2020-12-13,2020-12-14,7387887,1.0,999393101.0,793529553.0,1.0,12.42,N02BB02,3.333
