# Data cleaning of height, weight, BMI

1 March 2023

---

## Description

Notebook that carries out the cleaning steps for the height, weight and BMI observations in the NPS data.

## Steps

### 1. Initial cleaning of weights

- Removing non-numeric characters
- Corrections based on units

### 2. Initial cleaning of heights

- Removing non-numeric characters
- Corrections based on units

### 3. Initial cleaning of BMI

- Removing non-numeric characters

### 4. Extreme weights

- All those that are > 400

### 5. Extreme heights

- All those that are > 300 or < 140

### 6. Extreme BMI

- This is incomplete as it is not yet necessary

### 7. Combined height, weight, BMI cleaning

- Correct for errors where the GP has interchanged weight and height values, resulting in inconsistent BMI values.

### 8. Any final cleaning

- Scale weights < 15

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re

sns.set()
%matplotlib inline

## Config

In [2]:
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 100)

## Load up raw data

In [3]:
source_folder2 = "M:/Working/AL/projects/weight_loss/outputs"
source_folder = 'M:/Working/DataAnalysis/CleanAndStructure/DataFiles'

filename_observation = "NPS_Observation_202005.parquet"
filename_patient = "NPS_Patient_202107.parquet"
filename_uwl_cohort = "uwl_cohort_020322.parquet"

filename_observation = "NPS_Observation_202005.csv"
filename_patient = "NPS_Patient_202107.csv"
filename_uwl_cohort = "uwl_cohort_020322.csv"

In [4]:
#observation = pd.read_parquet(f'{source_folder}/{filename_observation}')
observation = pd.read_csv(f'{source_folder}/{filename_observation}')

In [2]:
#patient = pd.read_parquet(f'{source_folder}/{filename_patient}')
patient = pd.read_csv(f'{source_folder}/{filename_patient}')

In [6]:
#cohort = pd.read_parquet(f'{source_folder2}/{filename_uwl_cohort}')

In [7]:
# include the usi in the observations
# Series.map with a dict is vectorised and yields NaN for a patientid that is
# missing from the patient table, where the per-row dict lookup raised KeyError;
# rows with a null usi are handled separately in correct_height_weight_swapped
patientid_to_usi = dict(patient[['patientid', 'usi']].values)
observation['usi'] = observation['patientid'].map(patientid_to_usi)

In [8]:
# restrict the observations to the UWL cohort for now
#usi_uwl = cohort['usi'].unique()
#observation = observation[observation['usi'].isin(usi_uwl)].reset_index().iloc[:, 1:]

## Filter to weight, height and BMI observations

In [9]:
# .copy() gives each subset its own data, so the cleaning cells that follow can
# add and overwrite columns without pandas raising SettingWithCopyWarning
weights = observation[observation['observation_type'] == 'WEIGHT'].copy()
heights = observation[observation['observation_type'] == 'HEIGHT'].copy()
bmis = observation[observation['observation_type'] == 'BMI'].copy()

## 1. Cleaning weights

Remove unnecessary characters to get numeric values, using the regular expressions defined below in `patterns_weight`. The types of corrections these make are described in the Confluence documentation; overall they remove unnecessary characters, correct unusual decimal points and return only numeric values.

In [51]:
# Ordered regex -> replacement pairs used to reduce raw weight strings to numbers.
# They are applied one after another (dict order), so each pattern sees the output
# of the previous ones. Replacement strings are raw literals so that "\g<n>" group
# references are not parsed as (invalid) string escape sequences.
patterns_weight = {r'(.)[^0-9\.]+$': r'\g<1>',
                   r"^([0-9]+)\s?[\,\:\;\'\`]\s?([0-9]+)$": r'\g<1>.\g<2>',
                   r'^([0-9]+)\-{1,2}([0-9]+)$': r'\g<1>',
                   r'^([0-9]+)[\.\,\s/\(\)]+([0-9]+)$': r'\g<1>.\g<2>',
                   r'^([0-9]+)[\.\,\s/]+$': r'\g<1>',
                   r'^[^0-9]+([0-9]+)$': r'\g<1>',
                   r'^[^0-9]+$': '0',
                   r"^([0-9]+)[\.\,\;\:\/=abc\-\'\`]+([0-9]+)": r'\g<1>.\g<2>',
                   r'^[^0-9]+([0-9]+)\.([0-9]+)[^0-9]+$': r'\g<1>.\g<2>',
                   r'^[^0-9]+([0-9]+)\.([0-9]+)$': r'\g<1>.\g<2>',
                   r'^([0-9]+)\.([0-9]+)[^0-9]+$': r'\g<1>.\g<2>',
                   r'^([0-9]+)[^0-9]+$': r'\g<1>',
                   r'^([0-9]+)\s?kg.': r'\g<1>',
                   r'^([0-9])\.([0-9])\.([0-9])$': r'\g<1>0\g<2>.\g<3>',
                   r'^([0-9])\s([0-9])(.)': r'\g<1>\g<2>\g<3>',
                   r'^([0-9])\+([0-9])$': r'\g<1>\g<2>',
                   r'^8st10$': '51',
                   r'^([0-9])t([0-9])$': r'\g<1>\g<2>',
                   r'^([0-9]+)\.l([0-9]+)$': r'\g<1>.\g<2>',
                   r'6y2': '0',
                   r'\+n\s[0-9]': '0'}

In [99]:
# create a new column for cleaned data
weights.loc[:, 'observation_value_cleaned'] = weights.loc[:, 'observation_value']

In [100]:
# carry out replacements based on regexes
for key, value in patterns_weight.items():
    weights.loc[:, 'observation_value_cleaned'] = weights.loc[:, 'observation_value_cleaned'].str.replace(key, value, regex=True)

In [101]:
# replace empty values with 0 so these can be converted to floats
weights.loc[:, 'observation_value_cleaned'] = weights['observation_value_cleaned'].str.replace(r'^$', '0', regex=True)

In [102]:
# convert to float
# assign the column directly: `.loc[:, col] = ...` writes into the existing
# object-dtype column in place, which can leave the values stored as objects
weights['observation_value_cleaned'] = weights['observation_value_cleaned'].astype(float)

## 2. Cleaning heights

- Remove unnecessary characters
- Convert units where necessary

In [97]:
def is_numeric(value):
    """
    Check to see if value can be converted to float

    Args
    ----
    value (object): candidate value, e.g. a raw observation string

    Return
    ------
    (bool): True if ``float(value)`` succeeds, False otherwise
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or other non-convertible objects
        # ValueError: text that does not parse as a number
        # OverflowError: integers too large to be represented as a float
        return False
    return True
    

def convert_imperial(value, imperial_patterns):
    """
    Convert a value in imperial units to metric

    Args
    ----
    value (str): string representing imperial value (feet and inches)
    imperial_patterns (dict): regex -> replacement pairs, applied in dict order,
        that rewrite the value into the form "<feet>.<inches>"

    Return
    ------
    value_tidy (float): the height in cm

    Raises
    ------
    ValueError: if the tidied value is not of the form "<feet>.<inches>"
    """
    value_tidy = value

    for pattern, replacement in imperial_patterns.items():
        value_tidy = re.sub(pattern, replacement, value_tidy)

    # the part after the '.' is a count of inches, not a decimal fraction of a foot
    feet, inches = value_tidy.split('.')

    # 1 inch = 2.54 cm, consistent with 1 foot = 12 inches = 30.48 cm
    value_tidy = float(feet) * 30.48 + float(inches) * 2.54
    return value_tidy


# Each dict below maps a regex to its replacement; the pairs are applied one after
# another in dict order. Replacement strings are raw literals so that "\g<n>" group
# references are not parsed as (invalid) string escape sequences.

# patterns to tidy up data that is metric but has characters
patterns_height_metric = {r'^([0-9]{3})\s{0,1}c{0,1}m{0,1}s{0,1}': r'\g<1>',
                          r'^([0-9]{3}\.[0-9])\s{0,1}cm{0,1}s{0,1}': r'\g<1>',
                          r'^[^0-9]+$': '0',
                          r'^([0-9]{3})\,([0-9])$': r'\g<1>.\g<2>',
                          r'^([0-9]{3}).+': r'\g<1>',
                          r'^\@\s{0,1}([0-9]{3})$': r'\g<1>',
                          r'^\`([0-9]{3})$': r'\g<1>',
                          # NOTE(review): "{1,2,3}" is not a valid quantifier, so `re` reads
                          # it as literal text and this pattern never matches real data.
                          # Left unchanged on purpose: "1.7m" / "1.75m" are converted by the
                          # last two patterns of this dict, and a working version placed here
                          # would turn "1.7m" into "17" before they run.
                          r'^(1)\.([0-9]{1,2,3})\s{0,1}m{0,1}$': r'\g<1>\g<2>',
                          r'1\`([0-9]{2})[^0-9]*': r'1\g<1>',
                          r'^1\s{0,1}m([0-9]{2})\s{0,1}c{0,1}m{0,1}$': r'1\g<1>',
                          r'^([0-9]{2})\s{0,1}cm{0,1}s{0,1}$': r'1\g<1>',
                          r'^([0-9])\.([0-9]{1})\s{0,1}m$': r'\g<1>\g<2>0',
                          r'^([0-9])\.([0-9]{2})\s{0,1}m$': r'\g<1>\g<2>'}


# tidy up data in imperial units that has characters
# (every replacement produces "<feet>.<inches>", the form convert_imperial splits on)
patterns_height_imperial = {r'^([0-9])\'\s{0,1}([0-9]{1,2})\s{0,1}\'{0,1}$': r'\g<1>.\g<2>',
                            r'^([0-9])\`([0-9]{1,2})\`{0,1}$': r'\g<1>.\g<2>',
                            r'^([0-9])\s{0,1}ft\s{0,1}[^0-9]+$': r'\g<1>.0',
                            # inches may have two digits: capturing a single digit turned
                            # "5ft 10" into "5.1" (5 ft 1 in) before the later patterns ran
                            r'^([0-9])\s{0,1}ft\s{0,1}([0-9]{1,2}).*$': r'\g<1>.\g<2>',
                            r'^([0-9])\s{0,1}/([0-9]{1,2}).*$': r'\g<1>.\g<2>',
                            r'^([0-9])\s([0-9]{1,2})$': r'\g<1>.\g<2>',
                            r'^([0-9])\s{0,1}-\s{0,1}([0-9]{1,2})$': r'\g<1>.\g<2>',
                            r'^([0-9])\s{0,1}\.\s{0,1}([0-9]{1,2})\"$': r'\g<1>.\g<2>',
                            r'^([0-9])\s{0,1}(\-|\.|\,|\')\s{0,1}([0-9]{1,2})(\"{0,1}|\'\')$': r'\g<1>.\g<3>',
                            r'^([0-9])\s{0,1}ft$': r'\g<1>.0',
                            r'^([0-9])\s{0,1}(ft|f)\s{0,1}([0-9]{1,2})$': r'\g<1>.\g<3>',
                            r'^([0-9])\s{0,1}\.\s{0,1}([0-9]{1,2})(ft|\'|\"|\`|in)$': r'\g<1>.\g<2>',
                            r'^([0-9])\'$': r'\g<1>.0',
                            r'^([0-9]{2})\s{0,1}inches$': r'0.\g<1>',
                            r'^([456])\'([0-9])\.5$': r'\g<1>.\g<2>'
                           }


# additional transformations for cleaning
patterns_height = {r'([0-9]+)[^0-9\.]+$': r'\g<1>', # number followed by non-numbers
            r'^([0-9]+)\s?[\,\:\;\'\`]\s?([0-9]+)$': r'\g<1>.\g<2>', # decimals with another character rather than decimal
            r'^([0-9]+)\-{1,2}([0-9]+)$': r'\g<1>', # integers followed by trailing - characters then number
            r'^([0-9]+)[\.\,\s/]+([0-9]+)$': r'\g<1>.\g<2>', # decimals with multiple decimal point like characters
            r'^([0-9]+)[\.\,\s/]+$': r'\g<1>', # decimals with multiple decimal point like characters
            r'^[^0-9]+([0-9]+)$': r'\g<1>', # letters followed by integer
            r'^[^0-9]+$': ''} # no numbers in expression


# regex to determine if a number is in imperial units
# (a single alternation of every imperial pattern above)
pattern_height_imperial = r'|'.join(patterns_height_imperial.keys())

In [103]:
# create a new column to store the cleaned values
heights.loc[:, 'observation_value_cleaned'] = heights.loc[:, 'observation_value']

In [104]:
# number of non-numeric ones
heights['observation_value_cleaned'].apply(lambda v: is_numeric(v) == False).sum()

In [105]:
# filter to non-numeric values
non_numeric1 = heights[heights['observation_value'].apply(lambda v: is_numeric(v) == False)]

# clean the first set of non-numeric: lowercase, remove additional spaces, 
non_numeric1.loc[:, 'observation_value_cleaned'] = non_numeric1.loc[:, 'observation_value'].str.lower().str.strip().str.replace(r'\s+', ' ', regex=True)
    
for pattern, replacement in patterns_height_metric.items():
    non_numeric1.loc[:, 'observation_value_cleaned'] = non_numeric1.loc[:, 'observation_value_cleaned'].str.replace(pattern, 
                                                                                                             replacement, regex=True)
    
# update the height values based on the first cleaned set
heights.loc[non_numeric1.index, 'observation_value_cleaned'] = non_numeric1.loc[:, 'observation_value_cleaned']

In [106]:
# how many are left to clean?
heights['observation_value_cleaned'].apply(lambda v: is_numeric(v) == False).sum()

In [107]:
# clean the second set of non-numeric (based on the values being in imperial)
non_numeric2 = heights[heights['observation_value_cleaned'].apply(lambda v: 
                                                                       is_numeric(v) == False)]

non_numeric2.loc[:, 'is_imperial'] = False
non_numeric2.loc[:, 'is_imperial'] = non_numeric2['observation_value_cleaned'].str.match(pattern_height_imperial)
non_numeric2 = non_numeric2[non_numeric2['is_imperial'] == True]

In [108]:
# update the height values with cleaned ones (converted from imperial to metric)
heights.loc[non_numeric2.index, 'observation_value_cleaned'] = non_numeric2.loc[:, 'observation_value_cleaned'].apply(lambda v: 
                                                                                                            convert_imperial(v, patterns_height_imperial))

In [109]:
# how many left to clean?
heights['observation_value_cleaned'].apply(lambda v: is_numeric(v) == False).sum()

In [65]:
# heights recorded as "<number> kg" are really weights: strip the unit and
# relabel them as WEIGHT observations
height_is_weight = r'^[0-9]+\s{0,1}kg$'

non_numeric3 = heights[heights['observation_value_cleaned'].apply(lambda v: is_numeric(v) == False)]
non_numeric3 = non_numeric3[non_numeric3['observation_value_cleaned'].str.contains(height_is_weight, regex=True)]
non_numeric3.loc[:, 'observation_value_cleaned'] = non_numeric3['observation_value_cleaned'].str.replace('kg', '').str.strip()

# NOTE: the relabelled rows remain in the `heights` frame; only their
# observation_type changes
heights.loc[non_numeric3.index, 'observation_value_cleaned'] = non_numeric3['observation_value_cleaned']
heights.loc[non_numeric3.index, 'observation_type'] = "WEIGHT"

In [110]:
# fix remaining values
non_numeric4 = heights[heights['observation_value_cleaned'].apply(lambda v: is_numeric(v) == False)]

for pattern, replacement in patterns_height.items():
    non_numeric4.loc[:, 'observation_value_cleaned'] = non_numeric4.loc[:, 'observation_value_cleaned'].str.replace(pattern, 
                                                                                                             replacement, regex=True)
# update the height values with cleaned ones
heights.loc[non_numeric4.index, 'observation_value_cleaned'] = non_numeric4.loc[:, 'observation_value_cleaned']

In [67]:
# assume the remainder are junk - set to 0
non_numeric5 = heights[heights['observation_value_cleaned'].apply(lambda v: is_numeric(v) == False)]
heights.loc[non_numeric5.index, 'observation_value_cleaned'] = 0

In [111]:
# convert to float
# assign the column directly: `.loc[:, col] = ...` writes into the existing
# object-dtype column in place, which can leave the values stored as objects
heights['observation_value_cleaned'] = heights['observation_value_cleaned'].astype(float)

## 3. Clean BMI

In [112]:
# split into numeric and non-numeric
# how many non-numeric?
bmis[bmis['observation_value'].apply(lambda v: is_numeric(v) == False)].shape[0]

In [70]:
# keep only the BMI observations whose raw value is numeric
# (the non-numeric ones are all of the form "**.*" or similar and are ignored)
# .copy() so a cleaned-value column can be added later without SettingWithCopyWarning
bmis_nice = bmis[bmis['observation_value'].apply(is_numeric)].copy()

## 4. Transform extreme weights

- Weights > 400

In [71]:
def correct_large_weights(weight):
    """
    For those weights > 400. Based on inspecting these
    likely due to additional factor of 10, 100 or 1000

    Args
    ----
    weight (float): cleaned weight value

    Return
    ------
    weight_new (float): weight divided by 10, 100 or 1000 depending on its
        magnitude; values <= 400, values >= 99999 and NaN are returned unchanged
    """
    # the ranges are contiguous: exactly 3000 and 30000 used to fall between
    # the strict inequalities and were left unscaled
    if 400 < weight < 3000:
        return weight / 10
    if 3000 <= weight < 30000:
        return weight / 100
    if 30000 <= weight < 99999:
        return weight / 1000
    return weight

In [113]:
weights.loc[:, 'observation_value_cleaned'] = weights.loc[:, 'observation_value_cleaned'].apply(lambda w: correct_large_weights(w))

In [114]:
weights['observation_value_cleaned'].agg([np.min, np.max])

In [39]:
# remove nan values
weights = weights[weights.observation_value_cleaned.isnull() == False]

In [115]:
np.quantile(weights['observation_value_cleaned'].values, [0.01, 0.05, 0.1, 0.2, 0.25, 0.5, 0.75, 0.8, 0.9, 0.95, 0.99])

## 5. Transform extreme heights

In the following order:

- Heights < 2.5
- Heights > 300
- Heights < 140

In [41]:
# convert those that are likely in metres
bad_in_metres = heights[heights['observation_value_cleaned'] < 2.5]['observation_value_cleaned']
heights.loc[bad_in_metres.index, 'observation_value_cleaned'] = bad_in_metres * 100

In [116]:
heights['observation_value_cleaned'].agg([np.min, np.max])

In [45]:
# remove nan values
heights = heights[heights.observation_value_cleaned.isnull() == False]

In [117]:
np.quantile(heights['observation_value_cleaned'].values, 
            [0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.8, 0.9, 0.95, 0.99, 0.999])

In [118]:
# how many are greater than 300?
(heights['observation_value_cleaned'] > 300).sum()

In [119]:
# what do these look like?
heights[heights['observation_value_cleaned'] > 300].sort_values(by=['observation_value_cleaned'])

In [49]:
# correct large height values
def correct_large_heights(value):
    """
    Rescale a height > 300 that looks like a mis-keyed value

    The value is truncated to an integer and corrected according to its size
    and leading digits:

    - below 1140: divided by 10
    - 4 digits starting 11 or 12: the second digit is dropped (1175 -> 175)
    - 4 digits starting 14-19 or 21: divided by 10
    - 5 digits starting 1: divided by 100
    - 5 digits starting 3-9: divided by 1000
    - anything else: returned as the truncated integer

    Args
    ----
    value (float): height value to correct

    Return
    ------
    value_new (float or int): the corrected height
    """
    value = int(value)
    if value < 1140:
        return value / 10

    digits = str(value)
    if len(digits) == 4:
        if digits[:2] in ('11', '12'):
            # return a number, not the joined string, so the column stays numeric
            return float(digits[0] + digits[2:])
        if digits[:2] in ('14', '15', '16', '17', '18', '19', '21'):
            return value / 10
    elif len(digits) == 5:
        if digits[0] == '1':
            return value / 100
        # '8' and '9' carried a leading space in the list, so they never matched
        if digits[0] in ('3', '4', '5', '6', '7', '8', '9'):
            return value / 1000
    return value

In [120]:
heights['observation_value_cleaned'] = heights['observation_value_cleaned'].astype(float)

extreme_values = heights[heights['observation_value_cleaned'] > 300].index
heights.loc[extreme_values, 'observation_value_cleaned'] = heights.loc[extreme_values, 'observation_value_cleaned'].apply(lambda v: correct_large_heights(v))

## 6. Transform extreme BMI values

- Those less than 5 or greater than 60
- *This is currently not used*

In [51]:
bmi_values = bmis_nice['observation_value'].astype(float)

In [121]:
bmi_values.agg([np.min, np.max, np.median])

In [122]:
np.quantile(bmi_values, [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

In [54]:
# how many extreme? < 15 or > 70
# NOTE(review): the section text above mentions 5 and 60 as the bounds, while the
# thresholds used here are 15 and 70 - confirm which are intended
bmi_small = bmi_values[(bmi_values < 15)]
bmi_large = bmi_values[(bmi_values > 70)]

In [123]:
bmi_large.shape[0], bmi_small.shape[0]

In [124]:
bmis_nice['observation_value_cleaned'] = bmis_nice['observation_value'].astype(float)

In [125]:
heights['observation_value_cleaned'] = heights['observation_value_cleaned'].astype(float)

In [126]:
# how many small heights are there?
(heights['observation_value_cleaned'] < 140).sum()

In [59]:
hwbmi_nice = pd.concat([heights, weights, bmis_nice])

In [60]:
hwbmi_nice['observation_value_cleaned'] = hwbmi_nice['observation_value_cleaned'].astype(float)

In [61]:
hwbmi_nice = hwbmi_nice.sort_values(by=['usi', 'observation_dte', 'observation_type'])

In [127]:
hwbmi_nice.query('observation_type == "HEIGHT"')['observation_value_cleaned'].agg([np.min, np.max, np.median, np.mean])

## 7. Correct interchanged heights and weights

Height corrections for those < 140cm

1. if two heights on same day, with BMI and weight also reported
    - Calculate BMI for each height, keep one closest to reported BMI
2. weight > 30, no sensible BMI
    - calculate BMI from height, weight. If > 60, remove
3. height and weight same day, no BMI
    - calculate BMI, if outside (15, 60) swap height and weight
4. At least 2 heights, 2 weights same day, with BMI
    - calculate BMI for each pair. Swap if outside (15, 60). Then keep pair closest to reported BMI
5. weight low but no height reported. 
    - calculate median height and weight over set of observations. Calculate BMI for this weight as a weight and height. If reasonable as height then rename as height.
   

### Cleaning functions

In [63]:
def calculate_bmi(weight, height):
    """
    Body mass index: weight divided by the square of height

    Args
    ----
    weight (float): weight value
    height (float): height value (callers in this notebook pass height / 100)

    Return
    ------
    (float): weight / height**2
    """
    height_squared = height ** 2
    return weight / height_squared


def hw_candidates(height, weight):
    """
    Decide whether two values plausibly form a height / weight pair

    The pair is accepted either as recorded (height >= 100 and weight >= 30)
    or with the two values interchanged (height > 30 and weight >= 100).

    Args
    ----
    height (float): value recorded as the height
    weight (float): value recorded as the weight

    Return
    ------
    (bool): True if the pair is plausible, False otherwise
    """
    plausible_as_recorded = (height >= 100) & (weight >= 30)
    plausible_if_swapped = (height > 30) & (weight >= 100)
    if plausible_as_recorded or plausible_if_swapped:
        return True
    return False

In [64]:
def height_weight_comparison_same_day(observation_day):
    """
    Compare the heights and weights for a given patient on the same day
    Handles the case where there is at least one weight and height observation
    and BMI is optional

    When the day has at least one height, at least one weight and either exactly
    one BMI or no BMI, every (height, weight) pairing is tried in turn and a single
    height row and a single weight row are returned (plus the BMI row, if any).
    The search stops at the first plausible pairing that agrees with the BMI
    (or, without a BMI, gives a BMI strictly between 15 and 60), either as recorded
    or with the two values interchanged; if none does, the last pairing tried is
    returned as recorded. For any other combination of counts, all height, weight
    and BMI rows of the day are returned.

    Args
    -----
    observation_day (pd.DataFrame): input dataframe for a patient on a particular day
        (reads the 'observation_type' and 'observation_value_cleaned' columns)

    Returns
    -------
    obs_final (pd.DataFrame): the cleaned up dataframe
    """
    heights_day = observation_day.query('observation_type == "HEIGHT"')
    weights_day = observation_day.query('observation_type == "WEIGHT"')
    bmis_day = observation_day.query('observation_type == "BMI"')

    num_heights = heights_day.shape[0]
    num_weights = weights_day.shape[0]
    num_bmis = bmis_day.shape[0]

    # if there is a BMI use that to compare heights and weights are valid
    if (num_heights >= 1) & (num_weights >= 1) & (num_bmis == 1):
        bmi_value = bmis_day['observation_value_cleaned'].values[0]

        # every (height row, weight row) combination, heights in the outer loop
        for n, m in ((num1, num2) for num1 in range(0, num_heights) for num2 in range(0, num_weights)):
            # one-row frames, so the rows can be concatenated into the result
            heights_temp = heights_day.iloc[n:n+1]
            weights_temp = weights_day.iloc[m:m+1]
            # NOTE(review): h_index and w_index are not used below
            h_index = heights_temp.index
            w_index = weights_temp.index
            height_value = heights_temp['observation_value_cleaned'].values[0]
            weight_value = weights_temp['observation_value_cleaned'].values[0]
            # BMI with the values as recorded (height / 100 - presumably cm to m)
            bmi_temp = calculate_bmi(weight_value, height_value / 100)
            # BMI with the two values interchanged
            bmi_rev_temp = calculate_bmi(height_value, weight_value / 100)

            # heights and weights should be plausible
            if hw_candidates(height_value, weight_value):
                if np.round(bmi_temp) == np.round(bmi_value):
                    # the pair as recorded reproduces the reported BMI
                    obs_final = pd.concat([heights_temp, weights_temp, bmis_day])
                    break
                elif np.round(bmi_rev_temp) == np.round(bmi_value):
                    # the interchanged pair reproduces the reported BMI:
                    # swap the two rows by relabelling their observation_type
                    weights_temp.loc[:, 'observation_type'] = "HEIGHT"
                    heights_temp.loc[:, 'observation_type'] = "WEIGHT"
                    obs_final = pd.concat([heights_temp, weights_temp, bmis_day])
                    break
                else:
                    # no agreement: remember this pair and keep searching
                    obs_final = pd.concat([heights_temp, weights_temp, bmis_day])
            else:
                obs_final = pd.concat([heights_temp, weights_temp, bmis_day])

    elif (num_heights >= 1) & (num_weights >= 1) & (num_bmis == 0):
        for n, m in ((num1, num2) for num1 in range(0, num_heights) for num2 in range(0, num_weights)):
            heights_temp = heights_day.iloc[n:n+1]
            weights_temp = weights_day.iloc[m:m+1]
            # NOTE(review): h_index and w_index are not used below
            h_index = heights_temp.index
            w_index = weights_temp.index
            height_value = heights_temp['observation_value_cleaned'].values[0]
            weight_value = weights_temp['observation_value_cleaned'].values[0]
            bmi_temp = calculate_bmi(weight_value, height_value / 100)
            bmi_rev_temp = calculate_bmi(height_value, weight_value / 100)

            # heights and weights should be plausible
            if hw_candidates(height_value, weight_value):
                if 15 < bmi_temp < 60:
                    obs_final = pd.concat([heights_temp, weights_temp])
                    break
                elif 15 < bmi_rev_temp < 60:
                    # NOTE(review): here the swap exchanges the cleaned values, whereas
                    # the BMI branch above exchanges the observation_type labels
                    weights_temp.loc[:, 'observation_value_cleaned'] = height_value
                    heights_temp.loc[:, 'observation_value_cleaned'] = weight_value
                    obs_final = pd.concat([heights_temp, weights_temp])
                    break
                else:
                    obs_final = pd.concat([heights_temp, weights_temp])
            else:
                # bmis_day has no rows in this branch (num_bmis == 0)
                obs_final = pd.concat([heights_temp, weights_temp, bmis_day])
    else:
        # missing height or weight, or more than one BMI: keep everything
        obs_final = pd.concat([heights_day, weights_day, bmis_day])

    return obs_final

def height_weight_comparison_same_days(observation_days):
    """
    Apply height_weight_comparison_same_day to each day of observations

    Args
    ----
    observation_days (pd.DataFrame): observations with an 'observation_dte'
        column (called with the observations of a single patient)

    Returns
    -------
    df_final (pd.DataFrame): the per-day results concatenated, with the days in
        order of first appearance; an empty frame with the input's columns if
        there is no dated observation
    """
    # groupby splits the frame in a single pass and works for any date dtype,
    # instead of re-scanning the frame with a string-built query for every day
    dfs_temp = [height_weight_comparison_same_day(observation_day)
                for _, observation_day in observation_days.groupby('observation_dte', sort=False)]

    if not dfs_temp:
        # pd.concat raises on an empty list
        return observation_days.iloc[0:0]

    df_final = pd.concat(dfs_temp)
    return df_final

In [65]:
def replace_weight_with_height(observation_patient):
    """
    Correct errors in heights and weights across the patient's history

    A WEIGHT observation whose value lies within 2 (inclusive) of the patient's
    median height is relabelled as a HEIGHT. The median is taken over HEIGHT
    observations > 120 only; with no such height nothing is relabelled.

    NOTE(review): HEIGHT observations <= 120 are left out of the returned frame,
    as in the original implementation - confirm that dropping them is intended.

    Args
    ----
    observation_patient (pd.DataFrame): observations for one patient, with
        'observation_type' and 'observation_value_cleaned' columns

    Returns
    -------
    obs_final (pd.DataFrame): other observation types, relabelled weights,
        remaining weights and heights > 120, concatenated in that order
    """
    is_height = observation_patient['observation_type'] == 'HEIGHT'
    is_weight = observation_patient['observation_type'] == 'WEIGHT'
    obs_other_patient = observation_patient[~(is_height | is_weight)]

    # only consider those that are > 120
    heights_patient = observation_patient[is_height]
    heights_patient = heights_patient[heights_patient['observation_value_cleaned'] > 120]
    height_median_patient = heights_patient['observation_value_cleaned'].median()

    # if weight within 2cm of median height then relabel as height
    # (a NaN median, i.e. no usable heights, makes every comparison False)
    weights_patient = observation_patient[is_weight]
    looks_like_height = weights_patient['observation_value_cleaned'].between(
        height_median_patient - 2, height_median_patient + 2)

    # copy before relabelling so the assignment does not target a slice of the input
    weights_bad_patient = weights_patient[looks_like_height].copy()
    weights_bad_patient['observation_type'] = "HEIGHT"
    weights_good_patient = weights_patient[~looks_like_height]

    obs_final = pd.concat([obs_other_patient, weights_bad_patient, weights_good_patient, heights_patient])

    return obs_final

In [128]:
observation.query('usi.isnull() == False')

For each patient:

- Same day cases (BMI present or not) -> one height weight observation max per day
- All pairs at different dates -> relabel weights and heights if mixed up
- Flag dummy patients

In [101]:
def correct_height_weight_swapped(observations):
    """
    Correct abnormal height and weights values for each of the patients in observations

    Each patient (rows sharing a non-null 'usi') is passed through
    height_weight_comparison_same_days and then replace_weight_with_height.
    Rows with a null usi are not corrected and are appended at the end.

    Args
    ----
    observations (pd.DataFrame): height, weight and BMI observations with a
        'usi' column identifying the patient

    Returns
    -------
    observations_tidy (pd.DataFrame): corrected observations, patients in order
        of first appearance, followed by the null-usi rows
    """
    # separate out the null USI values
    has_usi = observations['usi'].notnull()
    observations_null_usi = observations[~has_usi]
    observations_not_null_usi = observations[has_usi]

    dfs_patients = []

    # groupby splits the frame in a single pass and works for any usi dtype; a
    # string-built query per patient only matched string usi values and
    # re-scanned the whole frame for every patient
    for _, observation_patient in observations_not_null_usi.groupby('usi', sort=False):
        df_temp = height_weight_comparison_same_days(observation_patient)
        df_temp = replace_weight_with_height(df_temp)
        dfs_patients.append(df_temp)

    # including the null-usi frame in the list keeps pd.concat from raising
    # when no row has a usi
    df_final = pd.concat(dfs_patients + [observations_null_usi])

    return df_final

In [67]:
# remove 0 values
hwbmi_nice = hwbmi_nice.query('observation_value_cleaned > 0')

In [68]:
# all patients with at least one recording of a height between 0 and 140
small_heights_usi = hwbmi_nice.query('observation_type == "HEIGHT"').query('observation_value_cleaned < 140')['usi']
has_small_height = hwbmi_nice['usi'].isin(small_heights_usi)
# .copy() because the usi column of this subset is re-typed in a later cell
small_heights = hwbmi_nice[has_small_height].copy()

# those that have heights only within the normal range
# select by patient usi: comparing row index labels with the index of the small
# height rows only excluded those individual rows, so every other observation of
# the same patients ended up in both subsets
non_small_heights = hwbmi_nice[~has_small_height]

In [129]:
small_heights['usi'] = small_heights.usi.astype(str)

In [130]:
# this can take a while (> 15 mins)
# NOTE(review): this previously referenced `small_heights_temp` and
# `small_heights_patientid`, neither of which is defined in the cells above;
# the frame of small-height patients built above is passed instead - confirm
# that no extra patientid filter was intended
small_heights_corrected = correct_height_weight_swapped(small_heights)

In [171]:
small_heights_corrected = small_heights_corrected.sort_values(by=['usi', 'observation_dte'])

# concatenate corrected and normal patient id entries
hwbmi = pd.concat([small_heights_corrected, non_small_heights]).drop_duplicates()

weights_cleaned = hwbmi.query('observation_type == "WEIGHT"')
heights_cleaned = hwbmi.query('observation_type == "HEIGHT"')
bmi_cleaned = hwbmi.query('observation_type == "BMI"')

# only want to have a single observation for a given patient and given observation type at each day
heights_deduped = heights_cleaned.groupby(['usi', 'patientid', 'observation_dte', 'observation_type'])['observation_value_cleaned'].median().reset_index()
weights_deduped = weights_cleaned.groupby(['usi', 'patientid', 'observation_dte', 'observation_type'])['observation_value_cleaned'].median().reset_index()
bmi_deduped = bmi_cleaned.groupby(['usi', 'patientid', 'observation_dte', 'observation_type'])['observation_value_cleaned'].median().reset_index()

In [172]:
# recombine the de-duplicated heights, weights and BMIs, order them by
# patient / date / type and renumber the rows from 0
hwbmi_parts = [heights_deduped, weights_deduped, bmi_deduped]
hwbmi = pd.concat(hwbmi_parts)
hwbmi = hwbmi.sort_values(by=['usi', 'observation_dte', 'observation_type'])
hwbmi = hwbmi.reset_index(drop=True)

In [173]:
# remove any rows that are exact duplicates across all columns
hwbmi = hwbmi.drop_duplicates()

## 8. Final cleaning

- Small weights

In [174]:
# finally, clean any small weights
# if weight < 15: multiply by 10
weights_tiny = hwbmi.query('observation_type == "WEIGHT"').query('observation_value_cleaned < 15')

In [175]:
hwbmi.loc[weights_tiny.index, "observation_value_cleaned"] = hwbmi.loc[weights_tiny.index, "observation_value_cleaned"] * 10

## 9. Summary stats

What do the BMI, height and weight values look like?

### BMI

In [131]:
hwbmi.query('observation_type == "BMI"')['observation_value_cleaned'].agg([np.min, np.max, np.median, np.mean])

In [132]:
np.quantile(hwbmi.query('observation_type == "BMI"')['observation_value_cleaned'], 
            [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

In [133]:
plt.figure(figsize=(12, 8))
ax = sns.boxplot(hwbmi.query('observation_type == "BMI"')['observation_value_cleaned'])
plt.show()

### Height

In [134]:
hwbmi.query('observation_type == "HEIGHT"')['observation_value_cleaned'].agg([np.min, np.max, np.median, np.mean])

In [135]:
np.quantile(hwbmi.query('observation_type == "HEIGHT"')['observation_value_cleaned'], 
            [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

In [136]:
plt.figure(figsize=(12, 8))
ax = sns.boxplot(hwbmi.query('observation_type == "HEIGHT"')['observation_value_cleaned'])
plt.show()

### Weight

In [137]:
hwbmi.query('observation_type == "WEIGHT"')['observation_value_cleaned'].agg([np.min, np.max, np.median, np.mean])

In [183]:
weights = hwbmi.query('observation_type == "WEIGHT"')

In [138]:
np.quantile(hwbmi.query('observation_type == "WEIGHT"')['observation_value_cleaned'], 
            [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

In [139]:
plt.figure(figsize=(12, 8))
ax = sns.boxplot(hwbmi.query('observation_type == "WEIGHT"')['observation_value_cleaned'])
plt.xlabel('weight (kg)')
plt.title('Distribution of patient weights')
plt.show()

## 10. Output the data

In [187]:
output_folder = 'M:/Working/AL/projects/weight_loss/outputs'

In [188]:
hwbmi.to_csv(f'{output_folder}/height_weight_observations_cleaned_040123.csv', index=False)