# In [None]:
# Work 25: Extracting and Summarizing BMI Values from Text Data:
#  [W25.BMI.4.Create_BMI_column.ipynb] 

# "This Jupyter notebook script extracts BMI values from electronic health record text data,
#  filters out improbable values, and computes cohort summaries, saving the results to a CSV file."

########################################################################################################
#  Sequence list
########################################################################################################

# 1: Load the data
# 2: Define a function to extract specific term values from the text
# 3: Extract values for each term and compute summaries
# 4: Print summaries for each term
# 5: Define a function to extract BMI value from the text
# 6: Apply the function to extract BMI values to a single column
# 7: Drop rows where BMI could not be extracted
# 8: Select relevant columns and save to a new CSV
# 9: Function to compute cohort summaries
# 10: Compute and print cohort summaries

########################################################################################################
########################################################################################################

import pandas as pd
import re

# 1: Set the input/output locations and read the pipe-delimited records
data_path = '/home/work/BMI_records.csv'
output_path = '/home/work/BMI_extracted_full.csv'
data = pd.read_csv(data_path, delimiter='|')

print("1: Data loaded.")

# 2: Define a function to extract specific term values from the text
def extract_term_value(text, term, min_value=10, max_value=70):
    """Return the number that follows the first occurrence of *term* in *text*.

    The term is matched case-insensitively at a word boundary and may be
    followed by optional whitespace and an optional ':' or '=' before the
    number (e.g. "BMI: 25" or "BMI = 25").

    Args:
        text: Free text to search. Anything that is not a ``str`` yields None.
        term: Literal label to look for. It is escaped before being placed in
            the regular expression, so characters such as '.', '(' or '+' are
            matched literally instead of being interpreted as regex syntax.
        min_value: Smallest value accepted as plausible (inclusive).
        max_value: Largest value accepted as plausible (inclusive).

    Returns:
        The matched number as a float when it lies within
        ``[min_value, max_value]``; otherwise None. Only the first occurrence
        of the term is considered.
    """
    if not isinstance(text, str):
        return None
    # re.escape keeps metacharacters in the term from altering the pattern
    # (or shifting which group holds the number).
    pattern = rf'\b{re.escape(term)}\s*[:=]?\s*(\d+\.?\d*)'
    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None
    value = float(match.group(1))
    # Filter out improbable values (defaults: a reasonable range for BMI)
    if min_value <= value <= max_value:
        return value
    return None

print("2: Defined a function.")

# 3: Pull each term's value into its own column, then summarise the rows
#    in which that term produced a value
terms = ['BMI', 'painoindeksi', 'PI', 'Paino-pituussuhde']
summaries = {}

for term in terms:
    # `args` forwards the current term as the second positional argument
    data[term] = data['data'].apply(extract_term_value, args=(term,))
    valid_data = data.dropna(subset=[term])
    term_values = valid_data[term]
    summaries[term] = {
        stat: getattr(term_values, stat)()
        for stat in ('min', 'max', 'mean', 'std', 'count')
    }

# 4: Report the summary of every term, one statistic per line
for term, summary in summaries.items():
    print(
        f"Cohort summaries for {term} records:",
        f"Min {term}: {summary['min']}",
        f"Max {term}: {summary['max']}",
        f"Mean {term}: {summary['mean']}",
        f"Standard Deviation {term}: {summary['std']}",
        f"Count of {term} records: {summary['count']}",
        sep="\n",
    )

# 5: Define a function to extract BMI value from the text
# Patterns are tried in this order; each one captures the numeric value in
# group 1. They are compiled once here rather than on every call.
# NOTE(review): the number pattern only accepts '.' as decimal separator, so
# a value written as "25,5" is read as 25 — confirm whether that is intended.
_BMI_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\bBMI\s*[:=]?\s*(\d+\.?\d*)',  # e.g., "BMI: 25" or "BMI = 25"
        r'\bpainoindeksi\s*[:=]?\s*(\d+\.?\d*)',  # e.g., "painoindeksi: 25" or "painoindeksi = 25"
        r'\bPI\s*[:=]?\s*(\d+\.?\d*)',  # e.g., "PI: 25" or "PI = 25"
        r'\bPaino-pituussuhde\s*[:=]?\s*(\d+\.?\d*)',  # e.g., "Paino-pituussuhde: 25" or "Paino-pituussuhde = 25"
        r'\b(\d+\.?\d*)\s*kg/m²',  # e.g., "25.5 kg/m²"
        r'\b(\d+\.?\d*)\s*kg/m2',  # e.g., "25.5 kg/m2"
        r'\bBMI\s*\(<PUHELINNUMERO>\):\s*(\d+\.?\d*)',  # e.g., "BMI (<PUHELINNUMERO>): 29.69"
        r'\b(?:painoindeksi|BMI)\s*[:=]?\s*(\d+\.?\d*)',  # e.g., "painoindeksi: 25" or "BMI = 25"
        r'\s*::::\s*(\d+\.?\d*)\s*kg/m²',  # e.g., ":::: 25.5 kg/m²"
        r'BMI\s+(\d+\.?\d*)',  # e.g., "BMI 28"
        r'BMI\s+(\d+\.?\d*)\s*kg/m²',  # e.g., "BMI 28 kg/m²"
        r'\s*(\d+\.?\d*)\s*kg/m²',  # e.g., "25.5 kg/m²"
        r'\s*(\d+\.?\d*)\s*kg/m2',  # e.g., "25.5 kg/m2"
        r'BMI\s*[:=]?\s*(\d+\.?\d*)\s*kg/m2',  # e.g., "BMI: 25 kg/m2"
        r'BMI\s*[:=]?\s*(\d+\.?\d*)\s*kg/m²',  # e.g., "BMI: 25 kg/m²"
        # This pattern previously had no capture group, so reaching it with a
        # match raised IndexError from match.group(1).
        r'\s*BMI\s*(\d+\.?\d*)\s+kg/m2\s*',  # e.g., "BMI 25 kg/m2"
    )
)


def extract_bmi(text):
    """Extract a plausible BMI value from free text.

    Each pattern is searched in order (case-insensitively). For a pattern that
    matches, only its first match in the text is examined; if that number lies
    outside the accepted range the next pattern is tried.

    Args:
        text: Free text to search. Anything that is not a ``str`` yields None.

    Returns:
        The first value between 10 and 70 (inclusive) found this way, as a
        float, or None when no pattern yields a value in that range.
    """
    if not isinstance(text, str):
        return None
    for pattern in _BMI_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue
        bmi_value = float(match.group(1))
        # Filter out improbable BMI values
        if 10 <= bmi_value <= 70:  # Reasonable range for BMI
            return bmi_value
    return None

print("5: Defined a function to extract BMI value.")

# 6: Run the extractor over the free-text column and keep the result in 'BMI'
data['BMI'] = data['data'].apply(extract_bmi)

print("6: Applied the function to extract BMI values to a single column.")

# 7: Keep only the rows for which a BMI value was found
has_bmi = data['BMI'].notna()
data = data[has_bmi]

print("7: Dropped rows where BMI could not be extracted.")

# 8: Write the selected columns to the output CSV (without the index)
export_columns = ['Potilas_ID', 'BMI', 'kirjaus_pvm', 'suorituspaikka', 'otsikko']
data[export_columns].to_csv(output_path, index=False)

print(f"8: Selected relevant columns and saved extracted BMI values to {output_path}")

# 9: Function to compute cohort summaries
def compute_cohort_summaries(data, column='BMI'):
    """Summarise one column of *data*.

    Args:
        data: Table indexable by column name, whose columns provide
            ``min``, ``max``, ``mean``, ``std`` and ``count`` methods.
        column: Name of the column to summarise.

    Returns:
        A dict with the keys 'min', 'max', 'mean', 'std' and 'count', in that
        order, each holding the result of the same-named method.
    """
    values = data[column]
    return {
        stat: getattr(values, stat)()
        for stat in ('min', 'max', 'mean', 'std', 'count')
    }

# 10: Summarise the extracted BMI column and report it, one statistic per line
bmi_summary = compute_cohort_summaries(data)
print(
    "Cohort summaries for BMI records:",
    f"Min BMI: {bmi_summary['min']}",
    f"Max BMI: {bmi_summary['max']}",
    f"Mean BMI: {bmi_summary['mean']}",
    f"Standard Deviation BMI: {bmi_summary['std']}",
    f"Count of BMI records: {bmi_summary['count']}",
    sep="\n",
)


########################################################################################################
########################################################################################################

# 1: Data loaded.
#
# Cohort summaries for BMI records:
#
# Min BMI: 13.0
# Max BMI: 55.0
# Mean BMI: 30.10604460966543
# Standard Deviation BMI: 6.45867034140111
# Count of BMI records: 2690
#
# Cohort summaries for painoindeksi records:
#
# Min painoindeksi: 16.0
# Max painoindeksi: 52.0
# Mean painoindeksi: 28.92083333333333
# Standard Deviation painoindeksi: 5.952542097045846
# Count of painoindeksi records: 192
#
# Cohort summaries for PI records:
#
# Min PI: 23.1
# Max PI: 23.1
# Mean PI: 23.1
# Standard Deviation PI: nan
# Count of PI records: 1
#
# Cohort summaries for Paino-pituussuhde records:
#
# Min Paino-pituussuhde: nan
# Max Paino-pituussuhde: nan
# Mean Paino-pituussuhde: nan
# Standard Deviation Paino-pituussuhde: nan
# Count of Paino-pituussuhde records: 0
#
# Defined a function to extract BMI value.
# Applied the function to extract BMI values to a single column.
# Dropped rows where BMI could not be extracted.
#
# Cohort summaries for BMI records:
#
# Min BMI: 13.0
# Max BMI: 55.0
# Mean BMI: 30.024682622268468
# Standard Deviation BMI: 6.4321971110526075
# Count of BMI records: 2883

