# In [None]:
# Work 26: BMI Data Fusion and Cleanup: Combining Extracted and Calculated BMI Values:
#  [W26.BMI.5.Combine_BMI.ipynb]

# "This notebook merges BMI data from two sources, fills missing values, and saves the cleaned data to a new file."

########################################################################################################
#  Sequence list
########################################################################################################

# 1: Define file paths (ensure these are correctly set)
# 2: Load the data
# 3: Ensure 'Potilas_ID' column exists in both DataFrames
# 4: Ensure 'BMI' column exists in calculated_df
# 5: Debug print to see the contents of calculated_df
# 6: Merge the two DataFrames
# 7: Ensure columns are correctly named after the merge
# 8: Extra check to ensure columns are present before combining
# 9: Debug print to see the contents of merged_df
# 10: Fill any missing BMI values with the ones from the calculated DataFrame
# 11: Drop the intermediate BMI columns
# 12: Save the merged DataFrame to a new file

########################################################################################################
########################################################################################################

import pandas as pd
import time

# Reference timestamp for the elapsed-time report printed at the end of the script.
start_time = time.time()

# 1: Input and output locations (adjust to the local environment before running)
original_path = "/home/work/BMI_extracted_full.csv"
calculated_path = "/home/work/BMI_calculated.csv"
output_path = "/home/work/BMI_combined.csv"

# 2: Read both inputs into DataFrames.
# NOTE(review): the extracted file is parsed with pandas' default delimiter,
# whereas the calculated file is parsed as pipe-delimited - confirm that the
# two files really use different separators.
calculated_df = pd.read_csv(calculated_path, sep="|")
records_df = pd.read_csv(original_path)

print("2: Defined and loaded the data")

# 3: The join key 'Potilas_ID' has to be present in both inputs; stop otherwise.
if not all("Potilas_ID" in frame.columns for frame in (records_df, calculated_df)):
    raise KeyError("3: 'Potilas_ID' column is missing in one of the DataFrames.")
print("3: 'Potilas_ID' column found in both DataFrames")

# 4: The calculated input must additionally provide the 'BMI' column.
if "BMI" in calculated_df.columns:
    print("4: 'BMI' column found in the calculated DataFrame")
else:
    raise KeyError("4: 'BMI' column is missing in the calculated DataFrame.")

# 5: Show the first rows of the calculated data as a quick sanity check.
print("5: Contents of calculated_df:", calculated_df.head(), sep="\n")

# 5.1: List the column names of both inputs before they are joined on 'Potilas_ID'.
print(f"5.1: left columns: {records_df.columns.tolist()}")
print(f"5.1: right columns: {calculated_df.columns.tolist()}")

# 6: Attach the calculated BMI to every extracted record.
# how="left" keeps all rows of records_df; only the join key and 'BMI' are
# taken from calculated_df. pandas applies the suffixes solely when BOTH
# inputs carry a 'BMI' column, so record that fact up front instead of
# guessing from the merged column names afterwards (a pre-existing column
# that merely happens to be called 'BMI_original' or 'BMI_calculated' must
# not be mistaken for a merge product).
records_has_bmi = "BMI" in records_df.columns
merged_df = pd.merge(
    records_df,
    calculated_df[["Potilas_ID", "BMI"]],
    on="Potilas_ID",
    how="left",
    suffixes=("_original", "_calculated"),
)
print("5.1: merged columns:", merged_df.columns.tolist())

# A left join emits one output row per matching right-hand row, so repeated
# 'Potilas_ID' values in calculated_df silently multiply records. Report it
# instead of aborting, so existing runs still produce their output file.
if len(merged_df) != len(records_df):
    print(
        f"5.1: WARNING: merge changed the row count from {len(records_df)} to "
        f"{len(merged_df)}; 'Potilas_ID' is not unique in calculated_df"
    )

# 10: Compose the final BMI column.
if records_has_bmi:
    # The calculated value wins; the extracted value only fills rows where
    # the calculated one is missing.
    # NOTE(review): step 10 of the sequence list in the file header describes
    # the opposite direction (extracted first, calculated as the fill value)
    # - confirm which precedence is intended.
    merged_df["BMI"] = merged_df["BMI_calculated"].combine_first(merged_df["BMI_original"])

    # 11: The suffixed helper columns are no longer needed once 'BMI' exists.
    merged_df = merged_df.drop(columns=["BMI_original", "BMI_calculated"])
# Otherwise records_df had no 'BMI' of its own, no suffixes were applied, and
# the merged 'BMI' column already holds the calculated values unchanged.

# Quick check: how many rows still have missing BMI after combining
missing_count = merged_df["BMI"].isna().sum()
print(f"After combining BMI, missing BMI count: {missing_count} / {len(merged_df)}")

# 12: Write the combined table as a pipe-delimited file without the index column.
merged_df.to_csv(output_path, sep="|", index=False)
print("12: Combined results saved to file {}".format(output_path))

# 13: Report the run time measured from start_time.
end_time = time.time()
elapsed_time = end_time - start_time
print("13: Script completed in {:.2f} seconds".format(elapsed_time))