# In [None]:
# Work23: BMI Calculation from Extracted Height and Weight Data: A Jupyter Notebook Script: 
# [W23.BMI.2.Calculate_BMI_Height_Weight.ipynb] 

# "This notebook extracts height and weight data from records, calculates BMI, and saves the results to a CSV file."

########################################################################################################
#  Sequence list
########################################################################################################

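# 1: Set the file paths and load the filtered data
# 2: Print column names for debugging
# 3: Count missing Potilas_ID values
# 4-7: Define the regular expressions and the helper that extracts height and weight
# 8: Extract height and weight for each record
# 9: Create a new DataFrame with the extracted measurements
# 10: Print a sample of the extracted values for debugging
# 11: Calculate BMI for records with both height and weight
# 12: Filter out outlier BMI values
# 13: Save the results
# 14-17: Select the patients whose summed BMI exceeds 60 and save their records
# 18: Report the elapsed time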

########################################################################################################
########################################################################################################

import pandas as pd
import re
from tqdm import tqdm
import time

# Wall-clock reference used by the elapsed-time report at the end of the script.
start_time = time.time()

# 1: File locations. The input is a pipe-delimited export of the records;
#    the calculated file receives the same rows plus the derived columns.
input_path = '/home/work/BMI_records.csv'
calculated_path = '/home/work/BMI_calculated.csv'
filtered_data = pd.read_csv(input_path, sep='|')

# 2: Show which columns were loaded (later steps read 'Potilas_ID' and 'data').
print("2: Columns in the loaded data:", filtered_data.columns)

# 3: Report how many rows have no patient identifier.
missing_id_count = filtered_data['Potilas_ID'].isna().sum()
print("3: Number of missing Potilas_IDs:", missing_id_count)

# 4: Regular expressions for height and weight mentions in free text.
#
# Both patterns share the same layout:
#   group 1 - optional context word (ignored by the extraction, but it lets a
#             value glued to its label such as "Paino80kg" still match),
#   group 2 - the 2-3 digit integer value.
#
# Details that matter for correctness:
#   * (?<!\d[.,])  - the digits must not be the fractional part of a decimal,
#                    so "80,75 kg" is not read as 75 kg.
#   * trailing \b  - applies to every unit spelling, so "cm"/"kg" are not
#                    matched as the beginning of another token ("cmH2O").
#   * (?!.../m...) - "kg/m2", "kg/m²" and "kg/m^2" are BMI units, not a weight.
# Values written with a decimal part (e.g. "72,5 kg") are not extracted.
height_regex = re.compile(
    r'\b(?:(pituus|pit|korkeus)?\s*[:=]?\s*)?'
    r'(?<!\d[.,])(\d{2,3})\s*(?:cm|senttimetriä?)\b',
    re.IGNORECASE
)
weight_regex = re.compile(
    r'\b(?:(paino|pai|massa|<NIMI>)?\s*[:=]?\s*)?'
    r'(?<!\d[.,])(\d{2,3})\s*(?:kg|kilogram(?:maa)?)\b'
    r'(?!\s*/\s*m(?:[2²^]|\b))',
    re.IGNORECASE
)

# Plausibility limits (inclusive) applied to every candidate value.
HEIGHT_RANGE_CM = (100, 250)
WEIGHT_RANGE_KG = (20, 300)


def _first_in_range(pattern, text, low, high):
    """Return the first value matched by *pattern* with low <= value <= high, else None."""
    for match in pattern.finditer(text):
        value = int(match.group(2))
        if low <= value <= high:
            return value
    return None


def extract_measurements(text):
    """Extract the first plausible height and weight from a free-text record.

    Args:
        text: The record text to search. Anything that is not a ``str``
            (for example ``None``) is treated as containing no measurements.

    Returns:
        A ``(height_cm, weight_kg)`` tuple of ints. Each element is the first
        match in reading order that lies inside ``HEIGHT_RANGE_CM`` /
        ``WEIGHT_RANGE_KG``, or ``None`` when no such match exists. Height and
        weight are searched independently of each other.
    """
    if not isinstance(text, str):
        return None, None

    # 5-7: Scan the matches in order, keep only the numeric group and stop at
    #      the first value that passes the plausibility range.
    height = _first_in_range(height_regex, text, *HEIGHT_RANGE_CM)
    weight = _first_in_range(weight_regex, text, *WEIGHT_RANGE_KG)

    return height, weight

print("4-7: Created adjusted regular expressions.")

# 8: Extract height and weight for each record.
# str() turns non-string cells (e.g. NaN for an empty field) into text before
# the regex search runs.
text_column = 'data'
measurements = filtered_data[text_column].apply(lambda x: extract_measurements(str(x)))

print("8: Extracted height and weight for each record.")

# 9: Create a new DataFrame with the extracted measurements.
# Building it on the source index guarantees the column-wise concat pairs each
# measurement with the row it came from, whatever index filtered_data carries.
measurements_df = pd.DataFrame(
    measurements.tolist(),
    columns=['Height_cm', 'Weight_kg'],
    index=filtered_data.index,
)
filtered_data = pd.concat([filtered_data, measurements_df], axis=1)

# 10: Check extracted data
print("10: Debugging extracted heights and weights:")
print(filtered_data[['Potilas_ID', 'data', 'Height_cm', 'Weight_kg']].head(20))

# 11: Calculate BMI = weight [kg] / (height [m]) ** 2.
# to_numeric maps missing measurements to NaN, so rows lacking either value get
# a NaN BMI and the column stays numeric even when no row has both values
# (an all-None object column would make the comparison in step 12 raise).
height_m = pd.to_numeric(filtered_data['Height_cm']) / 100
weight_kg = pd.to_numeric(filtered_data['Weight_kg'])
filtered_data['BMI'] = weight_kg / height_m ** 2

print("11: Calculated BMI for records.")

# 12: Keep only rows with 10 <= BMI <= 70 (bounds inclusive); rows with a
#     missing BMI are dropped as well because NaN never satisfies the test.
filtered_data = filtered_data[filtered_data['BMI'].between(10, 70)]

print("12: Filtered out outlier BMI values.")

# 13: Save the results
filtered_data.to_csv(calculated_path, index=False, sep='|')
print(f"13: Results with BMI calculations saved to file {calculated_path}")

# 14: Group by "Potilas_ID" and calculate the sum of BMI.
# NOTE(review): this sums BMI over all of a patient's records, so the result
# grows with the number of records - confirm that a sum (rather than a mean or
# a maximum) is what the selection in step 15 is meant to use.
bmi_sum = filtered_data.groupby('Potilas_ID')['BMI'].sum().reset_index()
print("14: Did the Group by 'Potilas_ID' and calculated the sum of BMI.")

# 15: Filter for "Potilas_ID" with BMI sum > 60
filtered_bmi_sum = bmi_sum[bmi_sum['BMI'] > 60]
print("15: Created a Filter for 'Potilas_ID' with BMI sum > 60.")

# 16: Merge back with the original data to get the 'data' column.
# Both frames carry a 'BMI' column, so the merge suffixes them (_x/_y); only
# 'Potilas_ID' and 'data' are used afterwards.
result_data = filtered_data.merge(filtered_bmi_sum, on='Potilas_ID', how='inner')
print("16: Did a Merge back with the original data to get the 'data' column")

# 17: Save the "data" column to a CSV file.
# NOTE(review): output_path was referenced here without ever being defined,
# which ended the script with a NameError after all the work was done. The
# file name below is a placeholder next to the other outputs - confirm it.
output_path = '/home/work/BMI_sum_over_60.csv'
result_data[['Potilas_ID', 'data']].to_csv(output_path, index=False, sep='|')
print(f"17: 'BMI sum > 60' Data saved to {output_path}")

# 18: Report the wall-clock time since start_time was taken.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"18: Script completed in {elapsed_time:.2f} seconds")


