In [None]:
# Work 26: BMI Data Fusion and Cleanup: Combining Extracted and Calculated BMI Values
#  [W26.BMI.5.Combine_BMI.ipynb] 

# "This notebook merges BMI data from two sources, fills missing values, and saves the cleaned data to a new file."

########################################################################################################
#  Sequence list
########################################################################################################

# 1: Define file paths (ensure these are correctly set)
# 2: Load the data
# 3: Ensure 'Potilas_ID' column exists in both DataFrames
# 4: Ensure 'BMI' column exists in calculated_df
# 5: Debug print to see the contents of calculated_df
# 6: Merge the two DataFrames
# 7: Ensure columns are correctly named after the merge
# 8: Extra check to ensure columns are present before combining
# 9: Debug print to see the contents of merged_df
# 10: Fill any missing BMI values with the ones from the calculated DataFrame
# 11: Drop the intermediate BMI columns (currently disabled in the code below)
# 12: Save the merged DataFrame to a new file

########################################################################################################
########################################################################################################

import pandas as pd
import re
from tqdm import tqdm
import time

# Wall-clock reference for the duration report printed at the end of the script.
start_time = time.time()

# 1: File locations -- the two inputs first, then the combined output
original_path = '/home/work/BMI_extracted_full.csv'
calculated_path = '/home/work/BMI_calculated.csv'
output_path = '/home/work/BMI_combined.csv'

# 2: Read both inputs into DataFrames
# NOTE(review): the extracted file is parsed with pandas' default separator,
# while the calculated file (and the output written later) use '|' -- confirm
# that the extracted file really is comma-separated.
records_df = pd.read_csv(filepath_or_buffer=original_path)
calculated_df = pd.read_csv(filepath_or_buffer=calculated_path, sep='|')

print("2: Defined and loaded the data")

# 3: The join key 'Potilas_ID' has to be present in both inputs; abort otherwise
if not ('Potilas_ID' in records_df.columns and 'Potilas_ID' in calculated_df.columns):
    raise KeyError("3: 'Potilas_ID' column is missing in one of the DataFrames.")
print("3: 'Potilas_ID' column found in both DataFrames")

# 4: The calculated input has to provide the 'BMI' column that is merged in later
if 'BMI' not in calculated_df.columns:
    raise KeyError("4: 'BMI' column is missing in the calculated DataFrame.")
print("4: 'BMI' column found in the calculated DataFrame")

# 5: Debug output -- show the first rows of calculated_df
print("5: Contents of calculated_df:")
print(calculated_df.head())

# 6: Left-join the calculated BMI onto the extracted records by 'Potilas_ID'.
# Every row of records_df is kept; when both sides carry a 'BMI' column the
# suffixes turn them into 'BMI_original' and 'BMI_calculated'.
merged_df = records_df.merge(
    calculated_df[['Potilas_ID', 'BMI']],
    how='left',
    on='Potilas_ID',
    suffixes=('_original', '_calculated'),
)

# 7: Suffixes are only applied to overlapping column names, so if records_df
# had no 'BMI' column the calculated one arrives as plain 'BMI'; give it the
# expected name in that case.
if 'BMI' in merged_df.columns and 'BMI_calculated' not in merged_df.columns:
    merged_df = merged_df.rename(columns={'BMI': 'BMI_calculated'})
    print("7: Renamed 'BMI' to 'BMI_calculated'.")

# 8: Both BMI columns are required by the combining step; stop if either is absent
if not {'BMI_calculated', 'BMI_original'}.issubset(merged_df.columns):
    raise KeyError("8: Expected columns 'BMI_calculated' or 'BMI_original' are missing after merge.")
print("8: Both 'BMI_calculated' and 'BMI_original' columns found.")

# 9: Debug output -- show the first rows of the merged result
print("9: Contents of merged_df after merge:")
print(merged_df.head())

# 10: Fill any missing BMI values with the ones from the calculated DataFrame.
# Series.combine_first keeps the caller's non-null values and takes the
# argument's value only where the caller is null. The extracted value
# ('BMI_original') therefore has to be the caller so that the calculated value
# acts purely as a gap filler; the previous call had the two reversed, which
# made the calculated value override every extracted one.
# Step 8 has already raised a KeyError if either column is absent, so no
# further existence check (and no silent "print and continue" path) is needed.
merged_df['BMI'] = merged_df['BMI_original'].combine_first(merged_df['BMI_calculated'])
print("10: Filled any missing BMI values with the ones from the calculated DataFrame")

# 11: Dropping the intermediate BMI columns is currently disabled, so
# 'BMI_original' and 'BMI_calculated' are written to the output next to 'BMI'.
# merged_df.drop(columns=['BMI_original', 'BMI_calculated'], inplace=True, errors='ignore')

# print("11: Dropped the intermediate BMI columns")

# 12: Write the merged DataFrame as a '|'-separated file without the index column
merged_df.to_csv(path_or_buf=output_path, sep='|', index=False)

print(f"12: Combined results saved to file {output_path}")

# 13: Report how long the whole script took (wall-clock seconds since start_time)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"13: Script completed in {elapsed_time:.2f} seconds")
