**Allergies**

In [47]:
import pandas as pd

# Build one row per patient holding how many times each allergy was recorded,
# and save the result to 'processed_allergies.csv'.

# Load the allergies data from the CSV file
file_path = '10k_synthea_covid19_csv/allergies.csv'  # Adjust the path if necessary
df = pd.read_csv(file_path)

# Ensure the DESCRIPTION column doesn't have leading/trailing spaces
df['DESCRIPTION'] = df['DESCRIPTION'].str.strip()

# Distinct allergy descriptions in order of first appearance. Missing
# descriptions are excluded so they cannot turn into an all-zero column
# whose name is NaN.
unique_allergies = df['DESCRIPTION'].dropna().unique()

# One 0/1 indicator column per allergy, built in a single vectorized call
# rather than one Python-level pass over the frame per allergy. Selecting by
# `unique_allergies` keeps the columns in first-appearance order
# (get_dummies sorts them otherwise).
allergy_indicators = pd.get_dummies(df['DESCRIPTION'], dtype='int64')[list(unique_allergies)]
allergy_columns = list(allergy_indicators.columns)

# Consolidate the multiple rows of a patient into one row.
# 'sum' (not 'max') counts how many times each allergy appears.
df_patient = allergy_indicators.groupby(df['PATIENT']).sum().reset_index()

# View the processed DataFrame (grouped by patient)
print("\nProcessed DataFrame (grouped by patient with allergy counts):")
print(df_patient.head())

# Save the resulting DataFrame to a new CSV file
output_path = 'processed_allergies.csv'
df_patient.to_csv(output_path, index=False)

print(f"Processed data has been saved to {output_path}")



Processed DataFrame (grouped by patient with allergy counts):
                                PATIENT  Allergy to bee venom  \
0  00049ee8-5953-4edd-a277-b9c1b1a7f16b                     1   
1  00093cdd-a9f0-4ad8-87e9-53534501f008                     0   
2  001683f0-8546-4ac2-9153-dd1a9ffe29cd                     0   
3  001890f9-3149-4347-ad12-739ad6db59cd                     0   
4  001d5d6d-0818-4f61-8dfa-48dc1abe4c95                     0   

   Allergy to grass pollen  Allergy to tree pollen  Allergy to fish  \
0                        0                       1                0   
1                        1                       0                0   
2                        0                       0                0   
3                        0                       0                0   
4                        0                       1                1   

   Allergy to nut  Allergy to mould  Dander (animal) allergy  \
0               0                 1                    

**Care Plans**

In [48]:
# Build one row per patient counting each care plan and each care-plan reason,
# and save the result to 'processed_careplans.csv'.

# Load the care plans data from the CSV file
file_path = '10k_synthea_covid19_csv/careplans.csv'  # Adjust the path if necessary
df = pd.read_csv(file_path)

# View the first few rows to understand the structure of the data
print("Original DataFrame:")
print(df.head())

# Ensure the DESCRIPTION and REASONDESCRIPTION columns don't have leading/trailing spaces
df['DESCRIPTION'] = df['DESCRIPTION'].str.strip()
df['REASONDESCRIPTION'] = df['REASONDESCRIPTION'].str.strip()

# Distinct values in order of first appearance. Missing values are excluded
# so a care plan without a reason does not produce an all-zero column whose
# name is NaN.
unique_careplans = df['DESCRIPTION'].dropna().unique()
unique_reasons = df['REASONDESCRIPTION'].dropna().unique()

# One 0/1 indicator column per care plan / reason, built with a single
# vectorized call each instead of one Python-level pass per distinct value.
# Rows with a missing value get all zeros. Indexing by the unique arrays keeps
# first-appearance column order (get_dummies sorts columns otherwise).
careplan_indicators = pd.get_dummies(df['DESCRIPTION'], dtype='int64')[list(unique_careplans)]
reason_indicators = pd.get_dummies(df['REASONDESCRIPTION'], dtype='int64')[list(unique_reasons)]

# A reason whose text equals a care-plan description would otherwise share
# (and previously overwrote) that care plan's column. Only such clashing
# reason columns are renamed; all other column names are unchanged.
shared_names = set(unique_careplans) & set(unique_reasons)
reason_indicators = reason_indicators.rename(
    columns={name: f"{name} (reason)" for name in shared_names}
)

# Care-plan columns first, then reason columns.
encoded = pd.concat([careplan_indicators, reason_indicators], axis=1)
careplan_columns = list(encoded.columns)

# Consolidate the multiple care plans of a patient into one row;
# 'sum' counts the occurrences of each care plan and reason.
df_patient = encoded.groupby(df['PATIENT']).sum().reset_index()

# View the processed DataFrame (patient + care plan count encoding)
print("\nProcessed DataFrame (grouped by patient and care plans only):")
print(df_patient.head())

# Save the resulting DataFrame to a new CSV file
output_path = 'processed_careplans.csv'
df_patient.to_csv(output_path, index=False)

print(f"Processed data has been saved to {output_path}")


Original DataFrame:
                                     Id       START        STOP  \
0  fea43343-7312-423f-bb82-b2f5ae71a260  2020-03-01  2020-03-01   
1  cbcade35-42bf-4807-8154-3f7f847221e0  2020-03-01  2020-03-30   
2  51dd78df-2b01-486a-8b33-1fbcd9cec211  2020-02-12  2020-02-26   
3  8aa5055b-cddc-4170-9e31-e71e5552502a  2020-03-13  2020-03-13   
4  976d369a-2b71-488d-ba20-8674fc272be0  2020-03-13  2020-04-14   

                                PATIENT                             ENCOUNTER  \
0  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271  681c380b-3c84-4c55-80a6-db3d9ea12fee   
1  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271  681c380b-3c84-4c55-80a6-db3d9ea12fee   
2  067318a4-db8f-447f-8b6e-f2f61e9baaa5  adedca64-700b-4fb9-82f1-9cbb658abb73   
3  067318a4-db8f-447f-8b6e-f2f61e9baaa5  1ea74a77-3ad3-4948-a9cc-3084462035d6   
4  067318a4-db8f-447f-8b6e-f2f61e9baaa5  1ea74a77-3ad3-4948-a9cc-3084462035d6   

        CODE                                     DESCRIPTION   REASONCODE  \
0  736376001 

**Conditions**

In [49]:
# Build one row per patient holding how many times each condition was
# recorded, and save the result to 'processed_conditions.csv'.

# Load the conditions data from the CSV file
file_path = '10k_synthea_covid19_csv/conditions.csv'  # Adjust the path if necessary
df = pd.read_csv(file_path)

# View the first few rows to understand the structure of the data
print("Original DataFrame:")
print(df.head())

# Ensure the DESCRIPTION column doesn't have leading/trailing spaces
df['DESCRIPTION'] = df['DESCRIPTION'].str.strip()

# Distinct condition descriptions in order of first appearance. Missing
# descriptions are excluded so they cannot become a column named NaN.
unique_conditions = df['DESCRIPTION'].dropna().unique()

# One 0/1 indicator column per condition from a single vectorized call.
# This avoids a full Python-level pass over the frame per condition and the
# column-by-column inserts that fragment the DataFrame when there are many
# distinct conditions. Indexing by `unique_conditions` keeps first-appearance
# column order (get_dummies sorts columns otherwise).
condition_indicators = pd.get_dummies(df['DESCRIPTION'], dtype='int64')[list(unique_conditions)]
condition_columns = list(condition_indicators.columns)

# Consolidate the multiple conditions of a patient into one row;
# 'sum' (instead of 'max') counts the occurrences of each condition.
df_patient = condition_indicators.groupby(df['PATIENT']).sum().reset_index()

# View the processed DataFrame (patient + condition count encoding)
print("\nProcessed DataFrame (grouped by patient and conditions only):")
print(df_patient.head())

# Save the resulting DataFrame to a new CSV file
output_path = 'processed_conditions.csv'
df_patient.to_csv(output_path, index=False)

print(f"Processed data has been saved to {output_path}")


Original DataFrame:
        START        STOP                               PATIENT  \
0  2019-02-15  2019-08-01  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
1  2019-10-30  2020-01-30  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
2  2020-03-01  2020-03-30  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
3  2020-03-01  2020-03-01  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
4  2020-03-01  2020-03-30  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   

                              ENCOUNTER       CODE         DESCRIPTION  
0  d5ee30a9-362f-429e-a87a-ee38d999b0a5   65363002        Otitis media  
1  8bca6d8a-ab80-4cbf-8abb-46654235f227   65363002        Otitis media  
2  681c380b-3c84-4c55-80a6-db3d9ea12fee  386661006     Fever (finding)  
3  681c380b-3c84-4c55-80a6-db3d9ea12fee  840544004  Suspected COVID-19  
4  681c380b-3c84-4c55-80a6-db3d9ea12fee  840539006            COVID-19  

Processed DataFrame (grouped by patient and conditions only):
                                PATIENT  Otitis media  Fever (finding)  \
0 

**Immunizations**

In [50]:
# Build one row per patient holding how many times each immunization was
# given, and save the result to 'processed_immunizations.csv'.

# Define the input and output file paths
input_file_path = '10k_synthea_covid19_csv/immunizations.csv'  # Input file path
output_file_path = 'processed_immunizations.csv'  # Output file path

# Load the immunizations data from the CSV file
df = pd.read_csv(input_file_path)

# View the first few rows to understand the structure of the data
print("Original DataFrame:")
print(df.head())

# Ensure the DESCRIPTION column doesn't have leading/trailing spaces
df['DESCRIPTION'] = df['DESCRIPTION'].str.strip()

# Distinct immunization descriptions in order of first appearance. Missing
# descriptions are excluded so they cannot become a column named NaN.
unique_immunizations = df['DESCRIPTION'].dropna().unique()

# One 0/1 indicator column per immunization from a single vectorized call,
# instead of one Python-level pass over the frame per immunization. Indexing
# by `unique_immunizations` keeps first-appearance column order
# (get_dummies sorts columns otherwise).
immunization_indicators = pd.get_dummies(df['DESCRIPTION'], dtype='int64')[list(unique_immunizations)]
immunization_columns = list(immunization_indicators.columns)

# Consolidate the multiple immunizations of a patient into one row;
# 'sum' (instead of 'max') counts the occurrences of each immunization.
df_patient = immunization_indicators.groupby(df['PATIENT']).sum().reset_index()

# View the processed DataFrame (patient + immunization count encoding)
print("\nProcessed DataFrame (grouped by patient and immunizations only):")
print(df_patient.head())

# Save the resulting DataFrame to a new CSV file
df_patient.to_csv(output_file_path, index=False)

print(f"Processed data has been saved to {output_file_path}")


Original DataFrame:
         DATE                               PATIENT  \
0  2019-08-01  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
1  2020-01-30  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
2  2019-07-08  067318a4-db8f-447f-8b6e-f2f61e9baaa5   
3  2019-10-15  ae9efba3-ddc4-43f9-a781-f72019388548   
4  2020-02-21  199c586f-af16-4091-9998-ee4cfc02ee7a   

                              ENCOUNTER  CODE  \
0  6a74fdef-2287-44bf-b9e7-18012376faca   140   
1  821e57ac-9304-46a9-9f9b-83daf60e9e43    83   
2  9aa748b8-3b44-4e34-b7a8-2e56f2ca3ca2   140   
3  6f9b301a-2b06-4868-b968-4d24faac576b   140   
4  5844b770-504a-4eb4-a655-8483881dafb1   140   

                                         DESCRIPTION  BASE_COST  
0  Influenza  seasonal  injectable  preservative ...     140.52  
1                            Hep A  ped/adol  2 dose     140.52  
2  Influenza  seasonal  injectable  preservative ...     140.52  
3  Influenza  seasonal  injectable  preservative ...     140.52  
4  Influenza  seasonal  

**Medications**

In [4]:

# Build one row per patient with total DISPENSES per medication and per
# medication reason, and save the result to 'processed_medications.csv'.

# Define the input and output file paths
input_file_path = '10k_synthea_covid19_csv/medications.csv'  # Input file path
output_file_path = 'processed_medications.csv'  # Output file path

# Load the medications data from the CSV file
df = pd.read_csv(input_file_path)

# Ensure the DESCRIPTION and REASONDESCRIPTION columns don't have leading/trailing spaces
df['DESCRIPTION'] = df['DESCRIPTION'].str.strip()
df['REASONDESCRIPTION'] = df['REASONDESCRIPTION'].str.strip()

# Create separate pivot tables for DESCRIPTION and REASONDESCRIPTION
description_pivot = df.pivot_table(
    index='PATIENT',  # Rows represent unique patients
    columns='DESCRIPTION',  # Columns represent unique medications
    values='DISPENSES',  # Values are DISPENSES
    aggfunc='sum',  # Aggregate by summing DISPENSES if needed
    fill_value=0  # Fill missing values with 0
)

reason_description_pivot = df.pivot_table(
    index='PATIENT',  # Rows represent unique patients
    columns='REASONDESCRIPTION',  # Columns represent unique reasons
    values='DISPENSES',  # Values are DISPENSES
    aggfunc='sum',  # Aggregate by summing DISPENSES if needed
    fill_value=0  # Fill missing values with 0
)

# Prefix the columns so medication and reason names cannot clash
description_pivot.columns = [f"DESCRIPTION_{col}" for col in description_pivot.columns]
reason_description_pivot.columns = [f"REASONDESCRIPTION_{col}" for col in reason_description_pivot.columns]

# Combine the two tables on PATIENT. `fill_value=0` above only fills gaps
# inside each pivot; a patient missing from one pivot entirely (e.g. no row
# with a reason) would come out of an outer merge as NaN and turn that
# table's columns into floats. Reindexing both pivots to the union of patients
# with fill_value=0 keeps every cell a count.
all_patients = description_pivot.index.union(reason_description_pivot.index)
processed_df = (
    description_pivot.reindex(all_patients, fill_value=0)
    .join(reason_description_pivot.reindex(all_patients, fill_value=0))
    .rename_axis('PATIENT')  # guarantee the key column name after reset_index
    .reset_index()
)

# Save the processed DataFrame to a new CSV file
processed_df.to_csv(output_file_path, index=False)

# View the processed DataFrame
print("Processed DataFrame:")
print(processed_df.head())

print(f"Processed data has been saved to {output_file_path}")


Processed DataFrame:
                                PATIENT  \
0  00049ee8-5953-4edd-a277-b9c1b1a7f16b   
1  000769a6-23a7-426e-a264-cb0e509b2da2   
2  00079a57-24a8-430f-b4f8-a1cf34f90060   
3  0008a63c-c95c-46c2-9ef3-831d68892019   
4  00093cdd-a9f0-4ad8-87e9-53534501f008   

   DESCRIPTION_0.25 ML Leuprolide Acetate 30 MG/ML Prefilled Syringe  \
0                                                  0                   
1                                                  0                   
2                                                  0                   
3                                                  0                   
4                                                  0                   

   DESCRIPTION_0.3 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe  \
0                                                  0                  
1                                                  0                  
2                                                  0                  
3  

**Observations**

In [10]:
import pandas as pd

# Build one row per patient with one column per observation DESCRIPTION
# (saved to 'processed_observations.csv'), plus a DESCRIPTION -> UNITS legend
# (saved to 'observation_legend.csv') when the UNITS column is present.

# Define the input and output file paths
input_file_path = '10k_synthea_covid19_csv/observations.csv'  # Input file path
output_file_path = 'processed_observations.csv'  # Output file path

# Load the observations data from the CSV file
df = pd.read_csv(input_file_path)

# View the first few rows to understand the structure of the data
print("Original DataFrame:")
print(df.head())

# Ensure the DESCRIPTION column doesn't have leading/trailing spaces
df['DESCRIPTION'] = df['DESCRIPTION'].str.strip()

# Report (but do not remove) patients that have the same DESCRIPTION more than once
duplicates = df[df.duplicated(subset=['PATIENT', 'DESCRIPTION'], keep=False)]
if not duplicates.empty:
    print("\nDuplicate entries (patient ID with the same DESCRIPTION):")
    print(duplicates[['PATIENT', 'DESCRIPTION']])

# Create a pivot table where each row is a unique patient, and columns are the descriptions.
# With aggfunc='first', repeated (PATIENT, DESCRIPTION) pairs keep a single VALUE.
# NOTE(review): that is presumably the earliest row in file order - confirm this
# is the intended choice rather than the most recent measurement.
pivot_df = df.pivot_table(index='PATIENT', columns='DESCRIPTION', values='VALUE', aggfunc='first')

# Replace NaN values with 0 (for missing descriptions for a patient)
pivot_df = pivot_df.fillna(0)

# Replace the named column index produced by the pivot with plain labels
pivot_df.columns = list(pivot_df.columns)

# Save the resulting DataFrame to a new CSV file (PATIENT is the index, so keep it)
pivot_df.to_csv(output_file_path, index=True)

# Print out the processed DataFrame
print("\nProcessed DataFrame (grouped by patient and description values):")
print(pivot_df.head())

# Create and save a legend table for descriptions and units.
# The UNITS check is done once, so `legend_df` is only used where it is
# defined, and the message names the column that is actually checked.
legend_output_file_path = 'observation_legend.csv'
if 'UNITS' in df.columns:
    legend_df = df[['DESCRIPTION', 'UNITS']].drop_duplicates()
    legend_df.to_csv(legend_output_file_path, index=False)
    print(f"\nLegend table has been saved to {legend_output_file_path}")
else:
    print("\n'UNITS' column not found. Available columns are:")
    print(df.columns)
    print("No 'UNITS' column found, so legend table is not created.")

print(f"Processed data has been saved to {output_file_path}")


Original DataFrame:
         DATE                               PATIENT  \
0  2019-08-01  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
1  2019-08-01  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
2  2019-08-01  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
3  2019-08-01  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   
4  2019-08-01  f0f3bc8d-ef38-49ce-a2bd-dfdda982b271   

                              ENCOUNTER     CODE  \
0  6a74fdef-2287-44bf-b9e7-18012376faca   8302-2   
1  6a74fdef-2287-44bf-b9e7-18012376faca  72514-3   
2  6a74fdef-2287-44bf-b9e7-18012376faca  29463-7   
3  6a74fdef-2287-44bf-b9e7-18012376faca  77606-2   
4  6a74fdef-2287-44bf-b9e7-18012376faca   9843-4   

                                         DESCRIPTION VALUE    UNITS     TYPE  
0                                        Body Height  82.7       cm  numeric  
1  Pain severity - 0-10 verbal numeric rating [Sc...   2.0  {score}  numeric  
2                                        Body Weight  12.6       kg  numeric  
3                  W

**Patients**

In [9]:
from datetime import datetime

# Read the raw patient table.
file_path = '10k_synthea_covid19_csv/patients.csv'
df = pd.read_csv(file_path)

# Reference "now", used by calculate_age when a patient has no valid death date.
current_date = datetime.now()

# Columns to discard (STATE included); names absent from the file are
# skipped instead of raising.
columns_to_remove = [
    'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX', 'SUFFIX', 'ZIP',
    'LON', 'LAT', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE',
    'ADDRESS', 'CITY', 'COUNTY', 'STATE'
]
df_cleaned = df.drop(columns=columns_to_remove, errors='ignore')

# Categorical columns to integer-encode (BIRTHPLACE is used in place of STATE).
categorical_columns = ['RACE', 'MARITAL', 'ETHNICITY', 'GENDER', 'BIRTHPLACE']
encoding_legend = []

for column in categorical_columns:
    if column not in df_cleaned.columns:
        continue

    # Codes start at 1 and follow the order in which values first appear;
    # missing values get no code and stay missing after the mapping.
    distinct_values = df_cleaned[column].dropna().unique()
    encoding_map = dict(zip(distinct_values, range(1, len(distinct_values) + 1)))
    df_cleaned[column] = df_cleaned[column].map(encoding_map)

    # Record every (column, value, code) triple for the legend file.
    encoding_legend.extend(
        {'Column': column, 'Original Value': original, 'Encoded Value': code}
        for original, code in encoding_map.items()
    )

# Convert BIRTHDATE to age
def calculate_age(birthdate, death_date=None):
    """Return a person's age in completed calendar years.

    The age is measured from ``birthdate`` up to ``death_date`` when a valid
    death date is given, otherwise up to the module-level ``current_date``.
    Completed years are counted from the calendar (year difference, minus one
    if the birthday has not yet been reached in the end year). Dividing elapsed
    days by 365 ignores leap days and overstates the age shortly before a
    birthday.

    Args:
        birthdate: Anything ``pd.to_datetime`` can parse; unparseable or
            missing values yield ``None``.
        death_date: Optional date of death. ``None``, NaN/NaT/NA and
            unparseable values are treated as "no death date".

    Returns:
        The age as an ``int``, or ``None`` when ``birthdate`` is missing or
        cannot be parsed.
    """
    birthdate = pd.to_datetime(birthdate, errors='coerce')
    if pd.isna(birthdate):
        return None

    # Prefer the date of death as the end of the age interval when it is valid.
    end_date = None
    if death_date is not None and not pd.isna(death_date):
        parsed_death = pd.to_datetime(death_date, errors='coerce')
        if not pd.isna(parsed_death):
            end_date = parsed_death

    # Fall back to "now" only when needed, so a valid death date never
    # depends on the current clock.
    if end_date is None:
        end_date = current_date

    birthday_reached = (end_date.month, end_date.day) >= (birthdate.month, birthdate.day)
    return end_date.year - birthdate.year - (0 if birthday_reached else 1)

def _row_age(row):
    """Age for one patient row, passing DEATHDATE along when the row has that field."""
    return calculate_age(row['BIRTHDATE'], row.get('DEATHDATE'))


# AGE is derived row by row from BIRTHDATE (and DEATHDATE, if any).
df_cleaned['AGE'] = df_cleaned.apply(_row_age, axis=1)

# DECEASED is 1 when a DEATHDATE is recorded and 0 otherwise.
df_cleaned['DECEASED'] = (~df_cleaned['DEATHDATE'].isna()).astype(int)

# The raw date columns have been replaced by AGE and DECEASED.
df_cleaned = df_cleaned.drop(columns=['BIRTHDATE', 'DEATHDATE'], errors='ignore')

# Output locations for the encoding legend and the cleaned patient table.
cleaned_data_path = 'processed_patients.csv'
encoding_legend_path = 'patient_legend.csv'

# Write the legend first, then the cleaned data.
legend_df = pd.DataFrame(encoding_legend)
legend_df.to_csv(encoding_legend_path, index=False)
df_cleaned.to_csv(cleaned_data_path, index=False)

# Report where the files were written.
print(f"Encoding legend saved to: {encoding_legend_path}")
print(f"Cleaned patient data saved to: {cleaned_data_path}")


Encoding legend saved to: patient_legend.csv
Cleaned patient data saved to: processed_patients.csv


In [None]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so
# none of it executes. The quoted code is an experiment that filters each CSV
# to a single patient ID in chunks, outer-merges the per-file frames on
# PATIENT with Dask, and stores the result in an HDF5 file.
""" import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Define chunk size
chunk_size = 100000  # Adjust based on memory

# Function to clean columns and rename
def clean_columns(df, prefix):
    df.columns = df.columns.str.strip()  # Strip column names of leading/trailing spaces
    if 'PATIENT' not in df.columns:
        raise ValueError(f"PATIENT column missing in {prefix} dataframe")
    df = df.rename(columns=lambda x: f"{prefix}_{x}" if x != "PATIENT" else x)
    return df

# Initialize an empty Dask dataframe for merging
merged_df = dd.from_pandas(pd.DataFrame(), npartitions=1)

# Function to filter and merge data for a single patient
def merge_patient_data(file_path, prefix, patient_id):
    chunk_list = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Clean and filter for the specific patient ID
        chunk = clean_columns(chunk, prefix)
        chunk = chunk[chunk['PATIENT'] == patient_id]  # Filter for the patient
        chunk_dd = dd.from_pandas(chunk, npartitions=4)
        chunk_list.append(chunk_dd)
    
    # Concatenate all chunks for the given patient
    patient_df = dd.concat(chunk_list, axis=0, interleave_partitions=True)
    return patient_df

# Specify the patient ID you want to process
patient_id_to_process = 'df6b563d-1ff4-4833-9af8-84431e641e9c'  # Example patient ID

# Process the allergies dataset
allergies_df = merge_patient_data('10k_synthea_covid19_csv/allergies.csv', 'allergies', patient_id_to_process)

# Process other datasets (you can adjust the file paths as needed)
careplans_df = merge_patient_data('10k_synthea_covid19_csv/careplans.csv', 'careplans', patient_id_to_process)
merged_df = dd.merge(allergies_df, careplans_df, on="PATIENT", how="outer")

conditions_df = merge_patient_data('10k_synthea_covid19_csv/conditions.csv', 'conditions', patient_id_to_process)
merged_df = dd.merge(merged_df, conditions_df, on="PATIENT", how="outer")

immunizations_df = merge_patient_data('10k_synthea_covid19_csv/immunizations.csv', 'immunizations', patient_id_to_process)
merged_df = dd.merge(merged_df, immunizations_df, on="PATIENT", how="outer")

medications_df = merge_patient_data('10k_synthea_covid19_csv/medications.csv', 'medications', patient_id_to_process)
merged_df = dd.merge(merged_df, medications_df, on="PATIENT", how="outer")

# Optional: observations dataset, if needed
observations_df = merge_patient_data('10k_synthea_covid19_csv/observations.csv', 'observations', patient_id_to_process)
merged_df = dd.merge(merged_df, observations_df, on="PATIENT", how="outer")

# Repartition the merged data for better memory handling
merged_df = merged_df.repartition(npartitions=10)

# Define output path
output_path = 'E:/Downloads/merged_patient_data.h5'

# Save to HDF5 format, ensuring it's a single file by using HDFStore
with ProgressBar():
    # Use pandas HDFStore to save to a single HDF5 file
    with pd.HDFStore(output_path, mode='w') as store:
        store.put('df', merged_df.compute(), index=False)

# If you want to monitor the progress or print the first few rows, compute the head
merged_df_computed = merged_df.head()
print(merged_df_computed)
 """

In [5]:
# DISABLED CELL: the code below is wrapped in a triple-quoted string, so it
# does not run; evaluating the cell only echoes the string as its output.
# The quoted code loads all seven CSV files with pandas and shows the head of each.
""" import pandas as pd
import dask.dataframe as dd


conditions_df = pd.read_csv('10k_synthea_covid19_csv/conditions.csv')
immunizations_df = pd.read_csv('10k_synthea_covid19_csv/immunizations.csv')
medications_df = pd.read_csv('10k_synthea_covid19_csv/medications.csv')
observations_df = pd.read_csv('10k_synthea_covid19_csv/observations.csv')
patients_df = pd.read_csv('10k_synthea_covid19_csv/patients.csv')
allergies_df = pd.read_csv(r"10k_synthea_covid19_csv/allergies.csv")
careplans_df = pd.read_csv('10k_synthea_covid19_csv/careplans.csv')

# Display the first few rows of each dataframe to understand their structure
{
    "allergies": allergies_df.head(),
    "careplans": careplans_df.head(),
    "conditions": conditions_df.head(),
    "immunizations": immunizations_df.head(),
    "medications": medications_df.head(),
    "observations": observations_df.head(),
    "patients": patients_df.head()
}
 """

' import pandas as pd\nimport dask.dataframe as dd\n\n\nconditions_df = pd.read_csv(\'10k_synthea_covid19_csv/conditions.csv\')\nimmunizations_df = pd.read_csv(\'10k_synthea_covid19_csv/immunizations.csv\')\nmedications_df = pd.read_csv(\'10k_synthea_covid19_csv/medications.csv\')\nobservations_df = pd.read_csv(\'10k_synthea_covid19_csv/observations.csv\')\npatients_df = pd.read_csv(\'10k_synthea_covid19_csv/patients.csv\')\nallergies_df = pd.read_csv(r"10k_synthea_covid19_csv/allergies.csv")\ncareplans_df = pd.read_csv(\'10k_synthea_covid19_csv/careplans.csv\')\n\n# Display the first few rows of each dataframe to understand their structure\n{\n    "allergies": allergies_df.head(),\n    "careplans": careplans_df.head(),\n    "conditions": conditions_df.head(),\n    "immunizations": immunizations_df.head(),\n    "medications": medications_df.head(),\n    "observations": observations_df.head(),\n    "patients": patients_df.head()\n}\n '

In [6]:
# DISABLED CELL: the code below is wrapped in a triple-quoted string, so it
# does not run; evaluating the cell only echoes the string as its output.
# The quoted code prefixes every non-PATIENT column with its table name and
# outer-merges the tables on PATIENT using dd.merge.
""" # Rename columns with Dask for proper annotation
allergies_df = allergies_df.rename(columns=lambda x: f"allergies_{x}" if x != "PATIENT" else x)
careplans_df = careplans_df.rename(columns=lambda x: f"careplans_{x}" if x != "PATIENT" else x)
conditions_df = conditions_df.rename(columns=lambda x: f"conditions_{x}" if x != "PATIENT" else x)
immunizations_df = immunizations_df.rename(columns=lambda x: f"immunizations_{x}" if x != "PATIENT" else x)
medications_df = medications_df.rename(columns=lambda x: f"medications_{x}" if x != "PATIENT" else x)
observations_df = observations_df.rename(columns=lambda x: f"observations_{x}" if x != "PATIENT" else x)

# Merge datasets using Dask
merged_df = dd.merge(allergies_df, careplans_df, on="PATIENT", how="outer")
merged_df = dd.merge(merged_df, conditions_df, on="PATIENT", how="outer")
merged_df = dd.merge(merged_df, immunizations_df, on="PATIENT", how="outer")
merged_df = dd.merge(merged_df, medications_df, on="PATIENT", how="outer")
merged_df = dd.merge(merged_df, observations_df, on="PATIENT", how="outer")

# Compute the result to load into memory (you can persist or save it if needed)
merged_df_computed = merged_df.compute()

# Display the first few rows of the computed dataframe to the user
merged_df_computed.head()
 """

' # Rename columns with Dask for proper annotation\nallergies_df = allergies_df.rename(columns=lambda x: f"allergies_{x}" if x != "PATIENT" else x)\ncareplans_df = careplans_df.rename(columns=lambda x: f"careplans_{x}" if x != "PATIENT" else x)\nconditions_df = conditions_df.rename(columns=lambda x: f"conditions_{x}" if x != "PATIENT" else x)\nimmunizations_df = immunizations_df.rename(columns=lambda x: f"immunizations_{x}" if x != "PATIENT" else x)\nmedications_df = medications_df.rename(columns=lambda x: f"medications_{x}" if x != "PATIENT" else x)\nobservations_df = observations_df.rename(columns=lambda x: f"observations_{x}" if x != "PATIENT" else x)\n\n# Merge datasets using Dask\nmerged_df = dd.merge(allergies_df, careplans_df, on="PATIENT", how="outer")\nmerged_df = dd.merge(merged_df, conditions_df, on="PATIENT", how="outer")\nmerged_df = dd.merge(merged_df, immunizations_df, on="PATIENT", how="outer")\nmerged_df = dd.merge(merged_df, medications_df, on="PATIENT", how="outer")\

In [None]:
# DISABLED CELL: the code below is wrapped in a triple-quoted string, so it
# does not run. The quoted code reads key 'df' from 'merged_patient_data.h5'
# and writes it out as 'merged_patient_data.csv'.
""" output_path = 'merged_patient_data.h5'  # Update this path as needed

# Load the HDF5 file into a Pandas DataFrame
df = pd.read_hdf(output_path, key='df')

# View the first few rows of the DataFrame
csv_output_path = 'merged_patient_data.csv'  # Update this path as needed

# Save the DataFrame to CSV
df.to_csv(csv_output_path, index=False)
 """