# Consolidate data (Batch1)

<div style="text-align: right"> Author: Vassil Dimitrov </div>
<div style="text-align: right"> Date: 2023-08-01 </div>

Batch 1 of the data will be prepared by merging the patients_batch1 data with the corresponding patient data from all other available tables. Note that some preprocessing will be required for some of the tables encluding `medications`, `observations` and `procedures`.

## Prep

### Load libraries

In [None]:
import numpy as np
import pandas as pd

---

### Load patients data

In [None]:
b = pd.read_pickle('patients_batch1.pkl')

In [None]:
b.shape

In [None]:
display(b.head())

Obtain column names from the patients dataframe for downstream analyses.

In [None]:
patients_dat_cols = list(b.columns)
print(patients_dat_cols)

---

## Functions

### `PATIENT` to index & explode

In [None]:
def patient_subset_index_explode(df1, df2):
    # Subset data
    print('subsetting...')
    tab = df2[df2['PATIENT'].isin(df1.index)]
    del df2
    print('Subset table dimensions:', tab.shape)
   
    # Create a new column 'encounter' to differentiate between duplicate rows for each patient
    print('Counting number of encounters per patient...')
    tab['encounter_n'] = tab.groupby('PATIENT').cumcount()+1
    print('Pivoting table and setting "PATIENT" as index')
    tab = tab.pivot_table(index='PATIENT',
                          columns='encounter_n',
                          aggfunc='first')
    print(tab.shape)
    # Flatten the MultiIndex columns
    tab.columns = [f'{col[0]}_{col[1]}' for col in tab.columns]
    # Fill in nulls with 0
    print('Filling in nulls with 0s')
    tab.fillna(0, inplace=True)
    # Convert to uint32
    tab = tab.astype('uint32')
    # Return processed and subset table
    return tab

### Batch merge

In [None]:
def batch_merge (df1, tab):
    # Merge the data with consolidated dataframe
    print('Merging...')
    df1 = df1.merge(tab, 
                  left_index=True, 
                  right_index=True, 
                  how='left')
    print('Merged!')
    print('New dimensions:', df1.shape)
    return df1

### Load data

In [None]:
def load_table (tab_name):
    to_merge = pd.read_pickle(tab_name)
    print(f'Dimensions for {tab_name}:')
    print(to_merge.shape)
    return to_merge

### Subset batch

In [None]:
def subset_batch_merge (df1, df2):
    print('Merging...')
    df_b = df2[df2.index.isin(df1.index)]
    df1 = df1.merge(df_b, 
                  left_index=True, 
                  right_index=True, 
                  how='left')
    print('Merged!')
    return df1

### Tidy up

In [None]:
def tidy_up (df, cols2exclude):
    print('Filling up with 0s')
    df = df.fillna(0)
    cols = [col for col in list(df.columns) if col not in cols2exclude]
    for col in cols:
        print(f'Processing column {col}...')
        if pd.api.types.is_sparse(df[col]):
            df[col] = df[col].sparse.to_dense().astype(np.uint32)
        else:
            df[col] = df[col].astype(np.uint32)
        print(f'Processed column {col}.')
    return df

### Process and merge
Uses the three other functions defined above: `load_table`, `subset_batch_merge` and `tidy_up`.

In [None]:
def process_n_merge_processed (df_a, tab_name_b):
    cols2exclude=list(df_a.columns)
    df_b = load_table(tab_name_b)
    df_a = subset_batch_merge(df_a, df_b)
    df_a = tidy_up(df_a, cols2exclude)
    return df_a

---
---

---
---

## Allergies

`allergies` is already preprocessed so it will essentially be loaded, the corresponding patients will be identified based on the `patients_batch1` information, and their respecive data will be merged to the patients data based on the indexes of both tables, keeping all batch1 patients.

### Read table

In [None]:
to_merge = pd.read_pickle('allergies_encounters.pkl')
# Sanity check
print('dimensions:', to_merge.shape)
display(to_merge.head())

### Subset for batch1 patients only

In [None]:
to_merge_b = to_merge[to_merge.index.isin(b.index)]
# Sanity check
print('dimensions:', to_merge_b.shape)
display(to_merge_b.head())

A bit over 12,000 patients from batch1 also have information about allergies.

### Merge `allergies` to batch1

In [None]:
batch = b.merge(to_merge_b, 
                  left_index=True, 
                  right_index=True, 
                  how='left')
# Sanity check:
print(batch.shape)
display(batch.head())

In [None]:
# Clean up memory
del b, to_merge, to_merge_b

### Tidy up table

Replace nulls with 0 (no allergy encounter with a physician).

In [None]:
batch.fillna(0, inplace=True)
display(batch.head())

Check the variable types for each column.

In [None]:
batch.dtypes

The allergy types were transformed into `float64` type. They will be converted to `uint32` to save memory.

In [None]:
for col in batch.columns:
    if col not in patients_dat_cols:
        print('Converting', col)
        batch[col] = batch[col].astype('uint32')
    else:
        continue

In [None]:
batch.dtypes

The types for the merged dataframe are of appropriate value.  
  
  The column names will be saved to exclude for downstream data manipulation.

In [None]:
cols2exclude = list(batch.columns)
print(cols2exclude)

---
---

## Careplans

In [None]:
careplans = load_table('careplans_encounters.pkl')

In [None]:
careplans.dtypes

In [None]:
print(batch.shape)
cols2exclude = list(batch.columns)
batch = subset_batch_merge(batch, careplans)
print(batch.shape)

In [None]:
batch.dtypes

In [None]:
batch = tidy_up(batch, cols2exclude)
batch.dtypes

In [None]:
print('Dimensions so far:', batch.shape)

In [None]:
# Clean up memory
del careplans, cols2exclude

---
---

## Conditions

In [None]:
batch = process_n_merge_processed (batch, 'conditions_encounters.pkl')
print('Dimensions so far:', batch.shape)

---
---

## Immunizations

In [None]:
batch = process_n_merge_processed (batch, 'immunizations_encounters.pkl')
print('Dimensions so far:', batch.shape)

---
---

## Save

Delete the batch table so far to liberate memory for processing the next table - `observations`.

In [None]:
batch.to_pickle('batch1.pkl')
del batch

## Observations

In [None]:
observations = load_table('observations_encounters.pkl')

In [None]:
display(observations.head())

In [None]:
batch = pd.read_pickle('batch1.pkl')

In [None]:
cols2exclude = list(batch.columns)

In [None]:
df2 = patient_subset_index_explode(batch, observations)

In [None]:
del observations

In [None]:
print('Current consolidated table dimensions:', batch.shape)

In [None]:
batch = batch_merge (batch, df2)

In [None]:
del df2

In [None]:
batch = tidy_up (batch, cols2exclude)

In [None]:
batch.dtypes

In [None]:
batch.to_pickle('batch1.pkl')

---
---

## Procedures

In [140]:
procedures = pd.read_pickle('procedure_encounters.pkl')

In [141]:
display(procedures.head())

Unnamed: 0,Admission to burn unit,Admission to long stay hospital,Admission to neurosurgical department,Admission to orthopedic department,Admission to trauma surgery department,Allergy screening test,Amputation of left arm,Amputation of left foot,Amputation of left hand,Amputation of left leg,...,Surgical manipulation of joint of knee,Surgical manipulation of shoulder joint,Suture open wound,Thoracentesis (procedure),Throat culture (procedure),Total knee replacement,Total replacement of hip,Transplant of lung (procedure),Vasectomy,PATIENT
1516584,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,33f33990-ae8b-4be8-938f-e47ad473abfe
1600279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36d131ee-dd5b-4acb-acbe-19961c32c099
1516585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,33f33990-ae8b-4be8-938f-e47ad473abfe
1600280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36d131ee-dd5b-4acb-acbe-19961c32c099
2987976,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,660bec03-9e58-47f2-98b9-2f1c564f3838


In [142]:
batch = pd.read_pickle('batch1.pkl')

In [143]:
cols2exclude = list(batch.columns)

In [None]:
df2 = patient_subset_index_explode(batch, procedures)

subsetting...
Subset table dimensions: (562787, 84)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tab['encounter_n'] = tab.groupby('PATIENT').cumcount()+1


In [None]:
del procedures

In [None]:
print('Current consolidated table dimensions:', batch.shape)

In [None]:
batch = batch_merge (batch, df2)

In [None]:
del df2

In [None]:
batch = tidy_up (batch, cols2exclude)

In [None]:
batch.dtypes

In [None]:
batch.to_pickle('batch1.pkl')