In [None]:
# Work 7: Combined data [W7.HFRS.1.Combine_data.ipynb]: Creating a new combined dataset

# Combining and Reshaping Health Diagnoses Data: A Python Jupyter Notebook for Merging and Cleaning

# This notebook loads diagnostic data from two CSV files, reshapes it, merges the two sources, removes duplicate rows, and saves everything as one dataset for analysis. It also reports how long the whole process took.

########################################################################################################
#  Sequence list
########################################################################################################

# 1: Define file paths
# 2: Select the correct columns
# 3: Start the timer
# 4: Load data using Pandas and show progress
#     4.1: Loading DGN_KAIKKI data
#     4.2: Loading KERTOMUS_DGN data
# 5: Calculate the number of unique Potilas_ID identifiers in each dataset
# 6: Reshape the data into the correct format and show progress
#     6.1: Reshaping ef_dg data
#     6.2: Reshaping ef_dg2 data
# 7: Print the number of rows in the reshaped datasets
# 8: Combine all DataFrames into one using Pandas
# 9: Print the number of rows in the combined dataset
# 10: Remove duplicates and save the results using Pandas
# 11: Print the final number of rows in the dataset
# 12: Stop the timer and show the execution time
# 13: Print the number of unique Potilas_ID identifiers in the merged dataset

########################################################################################################
########################################################################################################

import pandas as pd
import time

# 1: Define file paths
# Two input CSV files and the destination of the merged result.
ef_dg_path, ef_dg2_path = '/home/work/dataset1.csv', '/home/work/dataset2.csv'
output_path = '/home/work/all_data.csv'

print("1: File paths defined")

# 2: Select the correct columns
# Only these columns are parsed from each input; note that the second
# source names its identifier column 'Tunniste' rather than 'Potilas_ID'.
ef_dg_columns = ['Potilas_ID', 'pvm'] + ['koodi1', 'koodi2']
ef_dg2_columns = ['Tunniste', 'pvm'] + ['koodi1']

print("2: Correct columns selected")

# 3: Start the timer
# Wall-clock reference point; the elapsed time is reported in step 12.
start_time = time.time()

# 4: Load data using Pandas and show progress
# Both inputs are pipe-delimited, and only the columns chosen in step 2 are read.
print("4.1: Loading DGN_KAIKKI data...")
ef_dg_df = pd.read_csv(
    ef_dg_path,
    usecols=ef_dg_columns,
    sep='|',
)

print("4.2: Loading KERTOMUS_DGN data...")
ef_dg2_df = pd.read_csv(
    ef_dg2_path,
    usecols=ef_dg2_columns,
    sep='|',
)

# 5: Calculate the number of unique Potilas_ID identifiers in each dataset
# The identifier lives in 'Potilas_ID' in ef_dg and in 'Tunniste' in ef_dg2;
# the pooled count stacks both columns and counts distinct values once.
ef_dg_ids = ef_dg_df['Potilas_ID']
ef_dg2_ids = ef_dg2_df['Tunniste']

unique_ef_dg = ef_dg_ids.nunique()
unique_ef_dg2 = ef_dg2_ids.nunique()
combined_unique_ids = pd.concat([ef_dg_ids, ef_dg2_ids]).nunique()

print(
    f"5a: Unique Potilas_ID identifiers in ef_dg data: {unique_ef_dg}",
    f"5b: Unique Potilas_ID identifiers in ef_dg2 data: {unique_ef_dg2}",
    f"5d: Total unique Potilas_ID identifiers before merging: {combined_unique_ids}",
    sep="\n",
)

# 6: Reshape the data into the correct format and show progress
# Both sources are brought to a long layout with one diagnosis code per row
# in 'ICD_code', keyed by the same 'Potilas_ID' column so they can be stacked
# in step 8.
print("6.1: Reshaping ef_dg data...")
# melt() stacks koodi1/koodi2 into a single 'ICD_code' column; the name of the
# originating column ends up in the default 'variable' column. Rows without a
# code are discarded.
ef_dg_df_melted = ef_dg_df.melt(
    id_vars=['Potilas_ID'],
    value_vars=['koodi1', 'koodi2'],
    value_name='ICD_code'
).dropna(subset=['ICD_code'])

print("6.2: Reshaping ef_dg2 data...")
# ef_dg2 stores its identifier under 'Tunniste'. Step 5 already pools it with
# 'Potilas_ID' as the same identifier, so it is renamed here. Without the
# rename, the concat in step 8 leaves 'Potilas_ID' empty for every ef_dg2 row
# and the unique count in step 13 silently ignores all ef_dg2 patients.
ef_dg2_df_melted = (
    ef_dg2_df
    .rename(columns={'Tunniste': 'Potilas_ID', 'koodi1': 'ICD_code'})
    .dropna(subset=['ICD_code'])
)
# NOTE(review): the two reshaped frames still differ in columns -- ef_dg has
# 'variable' but no 'pvm' (melt drops columns that are neither id_vars nor
# value_vars), while ef_dg2 has 'pvm' but no 'variable'. As a result
# drop_duplicates() in step 10 cannot match rows across the two sources.
# Confirm whether 'pvm' should be kept for both or dropped for both.

# 7: Print the number of rows in the reshaped datasets
print(f"ef_dg melted data rows: {len(ef_dg_df_melted)}")
print(f"ef_dg2 melted data rows: {len(ef_dg2_df_melted)}")

# 8: Combine all DataFrames into one using Pandas
# The reshaped frames are stacked row-wise; columns are aligned by name.
print("8: Combining data...")
reshaped_frames = [ef_dg_df_melted, ef_dg2_df_melted]
all_data = pd.concat(reshaped_frames)

# 9: Print the number of rows in the combined dataset
combined_row_count = all_data.shape[0]
print(f"Combined data rows: {combined_row_count}")

# 10: Remove duplicates and save the results using Pandas
# Fully identical rows are collapsed to one; the index is not written out.
print("10: Removing duplicates and saving results...")
all_data = all_data.drop_duplicates()
all_data.to_csv(output_path, index=False)

# 11: Print the final number of rows in the dataset
final_row_count = all_data.shape[0]
print(f"Final data rows after removing duplicates: {final_row_count}")

# 12: Stop the timer and show the execution time
# Elapsed wall-clock time since the timer was started in step 3.
end_time = time.time()
elapsed_time = end_time - start_time
print(
    f"12: Merged data saved to path: {output_path}",
    f"Process took {elapsed_time:.2f} seconds.",
    sep="\n",
)

# 13: Print the number of unique Potilas_ID identifiers in the merged dataset
# nunique() counts distinct values and ignores missing ones.
merged_ids = all_data['Potilas_ID']
unique_patient_ids = merged_ids.nunique()
print(f"13: Unique Potilas_ID identifiers in the merged dataset: {unique_patient_ids}")
