In [2]:
# Import necessary libraries
import pandas as pd

# Load the datasets.
# NOTE: the "employment" frame is read from the *education* CSV; the variable name is kept
# because later cells refer to it.
medical_dataset = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_dataset = pd.read_csv('data_wrangling_education_2024_u7568823.csv')


def _clean_ssn(frame):
    """Return `frame` without missing/blank SSNs and with SSNs as stripped strings.

    Missing values are dropped *before* the cast: ``astype(str)`` turns NaN into the
    literal string 'nan', which ``dropna`` can no longer detect.
    """
    return (
        frame.dropna(subset=['ssn'])
        .assign(ssn=lambda df: df['ssn'].astype(str).str.strip())
        .loc[lambda df: df['ssn'] != '']
    )


medical_dataset = _clean_ssn(medical_dataset)
employment_dataset = _clean_ssn(employment_dataset)

# Keep a single record per SSN within each dataset (the first occurrence wins)
medical_dataset = medical_dataset.drop_duplicates(subset=['ssn'])
employment_dataset = employment_dataset.drop_duplicates(subset=['ssn'])

# Extract unique SSNs from each dataset
unique_ssn_medical = set(medical_dataset['ssn'])
unique_ssn_employment = set(employment_dataset['ssn'])

# SSNs present in both datasets
ssn_common = unique_ssn_medical & unique_ssn_employment
count_common = len(ssn_common)

# SSNs present only in the medical dataset
ssn_only_medical = unique_ssn_medical - unique_ssn_employment
count_only_medical = len(ssn_only_medical)

# SSNs present only in the employment dataset
ssn_only_employment = unique_ssn_employment - unique_ssn_medical
count_only_employment = len(ssn_only_employment)

# Print the results
print(f"Number of unique SSNs occurred in common in both datasets: {count_common}")
print(f"Number of unique SSNs only in the medical dataset: {count_only_medical}")
print(f"Number of unique SSNs only in the employment dataset: {count_only_employment}")

# Outer join keeps SSNs that appear in only one dataset; shared column names get the
# '_medical' / '_employment' suffixes.
merged_dataset = pd.merge(medical_dataset, employment_dataset, on='ssn', how='outer', suffixes=('_medical', '_employment'))

# Save the merged dataset to a new CSV file
merged_dataset.to_csv('merged_dataset2.csv', index=False)


Number of unique SSNs occurred in common in both datasets: 16005
Number of unique SSNs only in the medical dataset: 3995
Number of unique SSNs only in the employment dataset: 3185


Task 2.3: count the SSNs that appear more than once within each dataset

In [5]:
# Reload both raw CSVs so the duplicate check sees every original record
medical_dataset = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_dataset = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# keep=False flags every member of a duplicated SSN group, not only the repeats
duplicate_ssns_medical = medical_dataset.loc[
    medical_dataset.duplicated(subset='ssn', keep=False)
]
duplicate_ssns_employment = employment_dataset.loc[
    employment_dataset.duplicated(subset='ssn', keep=False)
]

# Number of distinct SSN values that occur more than once in each dataset
count_duplicates_medical = duplicate_ssns_medical['ssn'].nunique()
count_duplicates_employment = duplicate_ssns_employment['ssn'].nunique()

print(f"Number of duplicate SSNs in the medical dataset: {count_duplicates_medical}")
print(f"Number of duplicate SSNs in the employment dataset: {count_duplicates_employment}")


Number of duplicate SSNs in the medical dataset: 0
Number of duplicate SSNs in the employment dataset: 810


Task 2.4 (simple version): count per-attribute inconsistencies for SSNs present in both datasets

In [None]:
import pandas as pd

# Load the raw datasets (the second file is the *education* extract, referred to as
# "employment" throughout this notebook).
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Inner join: only SSNs present in both datasets can be inconsistent. Shared column
# names receive the '_medical' / '_employment' suffixes.
merged_dataset = pd.merge(medical_data, employment_data, on='ssn', how='inner', suffixes=('_medical', '_employment'))

# List of attributes to compare
attributes_to_compare = ['first_name', 'middle_name', 'last_name', 'gender', 'birth_date',
                         'street_address', 'suburb', 'postcode', 'state', 'phone', 'email']


def _normalise_for_comparison(column):
    """Return `column` as stripped, lower-cased strings, with missing values as ''.

    The mask is applied after the cast because ``astype(str)`` renders NaN as 'nan',
    which would otherwise be compared as if it were a real value.
    """
    return column.astype(str).str.strip().str.lower().mask(column.isna(), '')


# One counter per attribute; attributes whose suffixed columns are absent stay at 0
inconsistency_counts = {attr: 0 for attr in attributes_to_compare}

for attr in attributes_to_compare:
    attr_medical = attr + '_medical'
    attr_employment = attr + '_employment'

    # Only attributes that exist in both datasets carry the suffixed columns
    if attr_medical not in merged_dataset.columns or attr_employment not in merged_dataset.columns:
        continue

    values_medical = _normalise_for_comparison(merged_dataset[attr_medical])
    values_employment = _normalise_for_comparison(merged_dataset[attr_employment])

    # A record pair is consistent when the values agree or either side is missing
    pair_consistent = (
        (values_medical == values_employment)
        | (values_medical == '')
        | (values_employment == '')
    )

    # An SSN is consistent if at least one of its record pairs is; each inconsistent
    # SSN is counted once per attribute.
    ssn_consistent = pair_consistent.groupby(merged_dataset['ssn']).any()
    inconsistency_counts[attr] = int((~ssn_consistent).sum())

# Display the counts of inconsistencies
for attr, count in inconsistency_counts.items():
    print(f"Attribute '{attr}' has {count} inconsistencies.")


Task 2.4 (comprehensive version): resolve inconsistent attributes and save the merged dataset

In [2]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

# Load the raw datasets (the second file is the *education* extract, referred to as
# "employment" throughout this notebook).
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Derive the working frames from the data loaded in THIS cell, so the cell does not
# depend on variables left behind by earlier cells.
# Missing SSNs are dropped before the string cast: astype(str) would turn NaN into the
# literal 'nan', which dropna() cannot detect. Blank SSNs are discarded as well.
medical_dataset = (
    medical_data.dropna(subset=['ssn'])
    .assign(ssn=lambda df: df['ssn'].astype(str).str.strip())
    .loc[lambda df: df['ssn'] != '']
)
employment_dataset = (
    employment_data.dropna(subset=['ssn'])
    .assign(ssn=lambda df: df['ssn'].astype(str).str.strip())
    .loc[lambda df: df['ssn'] != '']
)

# Merge the datasets on 'ssn' using outer join to include all records
merged_dataset = pd.merge(
    medical_dataset, employment_dataset, on='ssn', how='outer', suffixes=('_medical', '_employment')
)

# List of attributes to resolve inconsistencies for
attributes_to_resolve = [
    'first_name', 'middle_name', 'last_name', 'gender', 'birth_date',
    'street_address', 'suburb', 'postcode', 'state', 'phone', 'email'
]

# Function to standardize text attributes
def standardize_text(value):
    """Return `value` as a stripped, lower-cased string; null-like values become ''."""
    return '' if pd.isnull(value) else str(value).strip().lower()

# Function to compare similarity between two strings
def is_similar(a, b, threshold=0.8):
    """Return True when the difflib similarity ratio of `a` and `b` is at least `threshold`."""
    matcher = SequenceMatcher(None, a, b)
    score = matcher.ratio()
    return score >= threshold

# Custom function to parse dates with potential '24:' hour issue
from datetime import datetime, timedelta

def parse_date_with_24_hour(date_str):
    """Parse a date/time string into a UTC timestamp, tolerating an hour field of 24.

    An hour of 24 is rewritten to 00 and one day is added to the parsed result.
    Only the hour position is rewritten (a '24:' at the start of the string or right
    after whitespace / a 'T' separator), so minutes or seconds equal to 24 -- e.g.
    '12:24:00' -- are left untouched.

    Parameters
    ----------
    date_str : str or null-like
        Text to parse.

    Returns
    -------
    pandas.Timestamp (UTC) on success; ``pd.NaT`` for null input, non-string input,
    or text that cannot be parsed.
    """
    import re

    if pd.isnull(date_str):
        return pd.NaT
    if not isinstance(date_str, str):
        # Only text is supported; other types yield NaT
        return pd.NaT
    try:
        corrected_date_str, hour_fixed = re.subn(
            r'(?:^|(?<=[\sTt]))24:', '00:', date_str, count=1
        )
        parsed_date = pd.to_datetime(corrected_date_str, utc=True)
        if hour_fixed:
            # Hour 24 of day D is hour 00 of day D + 1
            parsed_date += pd.Timedelta(days=1)
        return parsed_date
    except Exception:
        # Deliberate best effort: unparseable text is treated as missing
        return pd.NaT

# Function to resolve inconsistencies for a single attribute
def resolve_attribute(row, attr):
    """Resolve one shared attribute of a merged row into a single value.

    Reads ``<attr>_medical`` and ``<attr>_employment`` from `row` and combines them:

    * both missing -> NaN; exactly one missing -> the other raw value;
    * names: equal or similar values -> the longer one, otherwise "medical / employment";
    * gender: mapped to 'M'/'F' (other codes upper-cased); 'Unknown' on disagreement;
    * birth_date: the common calendar date, otherwise the date closest to the one
      implied by the age columns, otherwise the medical date if its year is after 1900;
    * address parts: the value from the source with the more recent timestamp
      (``consultation_timestamp`` vs ``employment_timestamp``);
    * phone / email: distinct values (case-insensitive) joined by '; ', medical first;
    * anything else: the medical value.

    Parameters
    ----------
    row : pandas.Series or mapping supporting ``.get``
        One row of the merged dataset.
    attr : str
        Unsuffixed attribute name.

    Returns
    -------
    The resolved value (str, datetime.date, original raw value, or NaN).
    """
    val_medical = row.get(attr + '_medical', np.nan)
    val_employment = row.get(attr + '_employment', np.nan)

    # Standardized forms are used for comparison only; raw values are what is returned
    val_medical_std = standardize_text(val_medical)
    val_employment_std = standardize_text(val_employment)

    # Nothing to resolve when at most one side carries a value
    if not val_medical_std and not val_employment_std:
        return np.nan
    if not val_medical_std:
        return val_employment
    if not val_employment_std:
        return val_medical

    if attr in ('first_name', 'middle_name', 'last_name'):
        if val_medical_std == val_employment_std or is_similar(val_medical_std, val_employment_std):
            # Prefer the longer spelling (medical wins ties)
            return val_medical if len(val_medical_std) >= len(val_employment_std) else val_employment
        # Clearly different names: keep both
        return f"{val_medical} / {val_employment}"

    if attr == 'gender':
        gender_mapping = {'male': 'M', 'm': 'M', 'female': 'F', 'f': 'F'}
        gender_medical = gender_mapping.get(val_medical_std, val_medical_std.upper())
        gender_employment = gender_mapping.get(val_employment_std, val_employment_std.upper())
        return gender_medical if gender_medical == gender_employment else 'Unknown'

    if attr == 'birth_date':
        date_medical = parse_date_with_24_hour(val_medical)
        date_employment = parse_date_with_24_hour(val_employment)
        if pd.isnull(date_medical) and pd.isnull(date_employment):
            return np.nan
        if pd.isnull(date_employment):
            return date_medical.date()
        if pd.isnull(date_medical):
            return date_employment.date()
        # Compare calendar dates only; a differing time-of-day is not a different birthday
        if date_medical.date() == date_employment.date():
            return date_medical.date()

        # Cross-check with the age attributes if available
        age_medical = row.get('age_at_consultation', np.nan)
        age_employment = row.get('current_age', np.nan)
        if pd.notnull(age_medical) and pd.notnull(age_employment):
            # The parsed dates are tz-aware (UTC), so the reference instant must be
            # tz-aware too; subtracting a naive timestamp raises TypeError.
            # NOTE(review): age_at_consultation is measured against "now" here, not
            # against the consultation date -- confirm this is intended.
            now = pd.Timestamp.now(tz='UTC')
            expected_birth_medical = now - pd.Timedelta(days=age_medical * 365)
            expected_birth_employment = now - pd.Timedelta(days=age_employment * 365)
            diff_medical = abs((date_medical - expected_birth_medical).days)
            diff_employment = abs((date_employment - expected_birth_employment).days)
            return date_medical.date() if diff_medical <= diff_employment else date_employment.date()
        # No ages to cross-check: take the medical date unless it is implausibly old
        return date_medical.date() if date_medical.year > 1900 else date_employment.date()

    if attr in ('street_address', 'suburb', 'postcode', 'state'):
        # Use the address from the most recent event
        date_medical = parse_date_with_24_hour(row.get('consultation_timestamp', np.nan))
        date_employment = parse_date_with_24_hour(row.get('employment_timestamp', np.nan))
        if pd.notnull(date_medical) and pd.notnull(date_employment):
            return val_medical if date_medical >= date_employment else val_employment
        if pd.notnull(date_employment) and pd.isnull(date_medical):
            return val_employment
        # Only the medical timestamp is usable, or neither is: keep the medical value
        return val_medical

    if attr in ('phone', 'email'):
        # Keyed by the standardized form so values differing only in case/whitespace
        # collapse into one; dict insertion order keeps the output deterministic, and
        # str() keeps numeric phone values from breaking the join.
        contacts = {}
        for raw_value, std_value in ((val_medical, val_medical_std), (val_employment, val_employment_std)):
            contacts.setdefault(std_value, str(raw_value).strip())
        return '; '.join(contacts.values())

    # Other attributes: both sides are present here, the medical value is preferred
    return val_medical

# Resolve every shared attribute into a single unsuffixed column, then discard the
# per-source columns it was built from.
for attr in attributes_to_resolve:
    merged_dataset[attr] = merged_dataset.apply(resolve_attribute, axis=1, args=(attr,))
    merged_dataset = merged_dataset.drop(
        columns=[attr + '_medical', attr + '_employment'], errors='ignore'
    )

# Columns of each source that are neither the join key nor a resolved attribute
medical_attrs = [col for col in medical_dataset.columns if col not in ['ssn'] + attributes_to_resolve]
employment_attrs = [col for col in employment_dataset.columns if col not in ['ssn'] + attributes_to_resolve]

# Make sure each of those columns exists under its original name. A column that both
# sources share (and that was therefore suffixed by the merge) is coalesced: the value
# already present is kept and gaps are filled from the other source, instead of one
# source silently overwriting the other.
for source_attrs, suffix in ((medical_attrs, '_medical'), (employment_attrs, '_employment')):
    for attr in source_attrs:
        suffixed = attr + suffix
        if suffixed in merged_dataset.columns:
            if attr in merged_dataset.columns:
                merged_dataset[attr] = merged_dataset[attr].combine_first(merged_dataset[suffixed])
            else:
                merged_dataset[attr] = merged_dataset[suffixed]
            merged_dataset = merged_dataset.drop(columns=[suffixed])
        elif attr not in merged_dataset.columns:
            merged_dataset[attr] = np.nan

# merged_dataset now holds the resolved attributes plus all remaining source columns
merged_dataset.to_csv('merged_dataset_cleaned.csv', index=False)

# Optionally, display a sample of the cleaned dataset
print(merged_dataset.head())


  return pd.to_datetime(date_str, utc=True)


          ssn  age_at_consultation      medicare_number marital_status  \
0  a100013007                 23.0   8553  21580  1  2     not-married   
1  a100113242                  NaN                  NaN            NaN   
2  a100146498                 24.0   5170  65912  1  3         married   
3  a100176744                  NaN                  NaN            NaN   
4  a100186005                 26.0   5906  91908  2  2     not-married   

   height  weight   bmi  blood_pressure  cholesterol_level  smoking_status  \
0   178.0   -99.0  29.0            78.0              170.0             0.0   
1     NaN     NaN   NaN             NaN                NaN             NaN   
2   180.0   133.0  41.0            67.0              184.0             1.0   
3     NaN     NaN   NaN             NaN                NaN             NaN   
4   169.0   120.0  42.0            81.0              166.0             0.0   

   ... last_name gender  birth_date  \
0  ...  hercules      F  1989-08-06   
1  ...  

Task 2.4 (comprehensive version 2): count inconsistencies per attribute

In [4]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from datetime import datetime, timedelta

# Load the raw datasets (the second file is the *education* extract, referred to as
# "employment" throughout this notebook).
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Derive the working frames from the data loaded in THIS cell, so the cell does not
# depend on variables left behind by earlier cells.
# Missing SSNs are dropped before the string cast: astype(str) would turn NaN into the
# literal 'nan', which dropna() cannot detect. Blank SSNs are discarded as well.
medical_dataset = (
    medical_data.dropna(subset=['ssn'])
    .assign(ssn=lambda df: df['ssn'].astype(str).str.strip())
    .loc[lambda df: df['ssn'] != '']
)
employment_dataset = (
    employment_data.dropna(subset=['ssn'])
    .assign(ssn=lambda df: df['ssn'].astype(str).str.strip())
    .loc[lambda df: df['ssn'] != '']
)

# Merge the datasets on 'ssn' using outer join to include all records
merged_dataset = pd.merge(
    medical_dataset, employment_dataset, on='ssn', how='outer', suffixes=('_medical', '_employment')
)

# List of attributes to compare
attributes_to_compare = [
    'first_name', 'middle_name', 'last_name', 'gender', 'birth_date',
    'street_address', 'suburb', 'postcode', 'state', 'phone', 'email'
]

# Function to standardize text attributes
def standardize_text(value):
    """Normalise `value` for comparison: '' for null-like input, else trimmed lower-case text."""
    if not pd.isnull(value):
        text = str(value)
        return text.strip().lower()
    return ''

# Custom function to parse dates with potential '24:' hour issue
def parse_date_with_24_hour(date_str):
    """Parse a date/time string into a UTC timestamp, tolerating an hour field of 24.

    An hour of 24 is rewritten to 00 and one day is added to the parsed result.
    Only the hour position is rewritten (a '24:' at the start of the string or right
    after whitespace / a 'T' separator), so minutes or seconds equal to 24 -- e.g.
    '12:24:00' -- are left untouched.

    Parameters
    ----------
    date_str : str or null-like
        Text to parse.

    Returns
    -------
    pandas.Timestamp (UTC) on success; ``pd.NaT`` for null input, non-string input,
    or text that cannot be parsed.
    """
    import re

    if pd.isnull(date_str):
        return pd.NaT
    if not isinstance(date_str, str):
        # Only text is supported; other types yield NaT
        return pd.NaT
    try:
        corrected_date_str, hour_fixed = re.subn(
            r'(?:^|(?<=[\sTt]))24:', '00:', date_str, count=1
        )
        parsed_date = pd.to_datetime(corrected_date_str, utc=True)
        if hour_fixed:
            # Hour 24 of day D is hour 00 of day D + 1
            parsed_date += pd.Timedelta(days=1)
        return parsed_date
    except Exception:
        # Deliberate best effort: unparseable text is treated as missing
        return pd.NaT

# Function to compare two values for consistency
def compare_values(val_medical, val_employment, attr):
    """Return True when the two source values of `attr` are considered consistent.

    Both values missing counts as consistent; exactly one missing counts as
    inconsistent. Names are compared with a 0.8 similarity threshold, gender after
    mapping to 'M'/'F', birth dates as parsed timestamps, and everything else by
    equality of the standardized text.
    """
    med = standardize_text(val_medical)
    emp = standardize_text(val_employment)

    # Missing on both sides is agreement; missing on one side only is not
    if not (med or emp):
        return True
    if not (med and emp):
        return False

    if attr in ['first_name', 'middle_name', 'last_name']:
        # Tolerate small typos in names
        return SequenceMatcher(None, med, emp).ratio() >= 0.8

    if attr == 'gender':
        codes = {'male': 'M', 'm': 'M', 'female': 'F', 'f': 'F'}
        return codes.get(med, med.upper()) == codes.get(emp, emp.upper())

    if attr == 'birth_date':
        born_med = parse_date_with_24_hour(val_medical)
        born_emp = parse_date_with_24_hour(val_employment)
        # Two unparseable dates are treated as agreeing
        if pd.isnull(born_med) and pd.isnull(born_emp):
            return True
        return born_med == born_emp

    # Remaining attributes must match exactly after standardization
    return med == emp

# One counter per attribute
inconsistency_counts = {attr: 0 for attr in attributes_to_compare}

# SSNs present in both datasets
common_ssns = set(medical_dataset['ssn']).intersection(set(employment_dataset['ssn']))

# Group each dataset by SSN once, instead of scanning the whole frame with a boolean
# mask for every SSN.
medical_groups = medical_dataset.groupby('ssn')
employment_groups = employment_dataset.groupby('ssn')


def _attribute_values(records, attr):
    """Return the values of column `attr` in `records`, or a single NaN if it is absent."""
    return list(records[attr]) if attr in records.columns else [np.nan]


for ssn in common_ssns:
    # All records for this SSN in each dataset
    records_medical = medical_groups.get_group(ssn)
    records_employment = employment_groups.get_group(ssn)

    for attr in attributes_to_compare:
        values_medical = _attribute_values(records_medical, attr)
        values_employment = _attribute_values(records_employment, attr)

        # The attribute is consistent for this SSN if ANY medical/employment record
        # pair agrees; any() stops at the first agreeing pair.
        consistent = any(
            compare_values(val_medical, val_employment, attr)
            for val_medical in values_medical
            for val_employment in values_employment
        )

        if not consistent:
            # Count inconsistency only once per SSN per attribute
            inconsistency_counts[attr] += 1

# Print the counts of inconsistencies
print("Inconsistency counts per attribute:")
for attr, count in inconsistency_counts.items():
    print(f"- {attr}: {count} inconsistencies")


  return pd.to_datetime(date_str, utc=True)
  return pd.to_datetime(date_str, utc=True)
  return pd.to_datetime(date_str, utc=True)


Inconsistency counts per attribute:
- first_name: 0 inconsistencies
- middle_name: 2801 inconsistencies
- last_name: 0 inconsistencies
- gender: 1631 inconsistencies
- birth_date: 0 inconsistencies
- street_address: 6597 inconsistencies
- suburb: 6490 inconsistencies
- postcode: 8358 inconsistencies
- state: 2677 inconsistencies
- phone: 8565 inconsistencies
- email: 6878 inconsistencies


Task 2.4 (integrated version): count inconsistencies, then resolve them and save the merged dataset

In [None]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from datetime import datetime, timedelta

# Load the datasets (the second file is the *education* extract, referred to as
# "employment" throughout this notebook).
# Missing SSNs are dropped before the string cast: astype(str) would turn NaN into the
# literal 'nan', which dropna() cannot detect. Blank SSNs are discarded as well.
medical_dataset = (
    pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
    .dropna(subset=['ssn'])
    .assign(ssn=lambda df: df['ssn'].astype(str).str.strip())
    .loc[lambda df: df['ssn'] != '']
)
employment_dataset = (
    pd.read_csv('data_wrangling_education_2024_u7568823.csv')
    .dropna(subset=['ssn'])
    .assign(ssn=lambda df: df['ssn'].astype(str).str.strip())
    .loc[lambda df: df['ssn'] != '']
)

# Merge the datasets on 'ssn' using outer join to include all records
merged_dataset = pd.merge(
    medical_dataset, employment_dataset, on='ssn', how='outer', suffixes=('_medical', '_employment')
)

# List of attributes to resolve inconsistencies for
attributes_to_resolve = [
    'first_name', 'middle_name', 'last_name', 'gender', 'birth_date',
    'street_address', 'suburb', 'postcode', 'state', 'phone', 'email'
]

# Function to standardize text attributes
def standardize_text(value):
    """Return '' for null-like `value`, otherwise its text form trimmed and lower-cased."""
    is_missing = pd.isnull(value)
    return '' if is_missing else str(value).strip().lower()

# Function to compare similarity between two strings
def is_similar(a, b, threshold=0.8):
    """Tell whether `a` and `b` reach the `threshold` difflib similarity ratio."""
    similarity = SequenceMatcher(None, a, b).ratio()
    return similarity >= threshold

# Custom function to parse dates with potential '24:' hour issue
def parse_date_with_24_hour(date_str):
    """Parse a date/time string into a UTC timestamp, tolerating an hour field of 24.

    An hour of 24 is rewritten to 00 and one day is added to the parsed result.
    Only the hour position is rewritten (a '24:' at the start of the string or right
    after whitespace / a 'T' separator), so minutes or seconds equal to 24 -- e.g.
    '12:24:00' -- are left untouched.

    Parameters
    ----------
    date_str : str or null-like
        Text to parse.

    Returns
    -------
    pandas.Timestamp (UTC) on success; ``pd.NaT`` for null input, non-string input,
    or text that cannot be parsed.
    """
    import re

    if pd.isnull(date_str):
        return pd.NaT
    if not isinstance(date_str, str):
        # Only text is supported; other types yield NaT
        return pd.NaT
    try:
        corrected_date_str, hour_fixed = re.subn(
            r'(?:^|(?<=[\sTt]))24:', '00:', date_str, count=1
        )
        parsed_date = pd.to_datetime(corrected_date_str, utc=True)
        if hour_fixed:
            # Hour 24 of day D is hour 00 of day D + 1
            parsed_date += pd.Timedelta(days=1)
        return parsed_date
    except Exception:
        # Deliberate best effort: unparseable text is treated as missing
        return pd.NaT

# One counter per attribute, all starting at zero
inconsistency_counts = dict.fromkeys(attributes_to_resolve, 0)

# SSNs that occur in both datasets
ssn_medical = set(medical_dataset['ssn'])
ssn_employment = set(employment_dataset['ssn'])
ssn_common = ssn_medical & ssn_employment

# Restrict the merged frame to those SSNs and look at one SSN at a time
merged_common_ssn = merged_dataset[merged_dataset['ssn'].isin(ssn_common)]
grouped = merged_common_ssn.groupby('ssn')


def _standardized_uniques(group, column):
    """Distinct standardized non-null values of `column` in `group` ([] if the column is absent)."""
    if column not in group.columns:
        return []
    return group[column].dropna().apply(standardize_text).unique()


def _values_agree(attr, val_med, val_emp):
    """True when two standardized values of `attr` match (birth dates by calendar date)."""
    if attr != 'birth_date':
        return val_med == val_emp
    date_med = parse_date_with_24_hour(val_med)
    date_emp = parse_date_with_24_hour(val_emp)
    # Dates only agree when both parse and fall on the same calendar day
    return pd.notnull(date_med) and pd.notnull(date_emp) and date_med.date() == date_emp.date()


for ssn, group in grouped:
    for attr in attributes_to_resolve:
        vals_medical = _standardized_uniques(group, attr + '_medical')
        vals_employment = _standardized_uniques(group, attr + '_employment')

        # An attribute with no values on one side is considered consistent
        if len(vals_medical) == 0 or len(vals_employment) == 0:
            continue

        # Inconsistent only when no medical value matches any employment value
        match_found = any(
            _values_agree(attr, val_med, val_emp)
            for val_med in vals_medical
            for val_emp in vals_employment
        )
        if not match_found:
            inconsistency_counts[attr] += 1

# Print the counts of inconsistencies
print("Inconsistencies Found Per Attribute:")
for attr, count in inconsistency_counts.items():
    print(f"Attribute '{attr}' has {count} inconsistencies.")

# Function to resolve inconsistencies for a single attribute
def resolve_attribute(row, attr):
    """Resolve one shared attribute of a merged row into a single value.

    Reads ``<attr>_medical`` and ``<attr>_employment`` from `row` and combines them:

    * both missing -> NaN; exactly one missing -> the other raw value;
    * names: equal or similar values -> the longer one, otherwise "medical / employment";
    * gender: mapped to 'M'/'F' (other codes upper-cased); 'Unknown' on disagreement;
    * birth_date: the common calendar date, otherwise the date closest to the one
      implied by the age columns, otherwise the medical date if its year is after 1900;
    * address parts: the value from the source with the more recent timestamp
      (``consultation_timestamp`` vs ``employment_timestamp``);
    * phone / email: distinct values (case-insensitive) joined by '; ', medical first;
    * anything else: the medical value.

    Parameters
    ----------
    row : pandas.Series or mapping supporting ``.get``
        One row of the merged dataset.
    attr : str
        Unsuffixed attribute name.

    Returns
    -------
    The resolved value (str, datetime.date, original raw value, or NaN).
    """
    val_medical = row.get(attr + '_medical', np.nan)
    val_employment = row.get(attr + '_employment', np.nan)

    # Standardized forms are used for comparison only; raw values are what is returned
    val_medical_std = standardize_text(val_medical)
    val_employment_std = standardize_text(val_employment)

    # Nothing to resolve when at most one side carries a value
    if not val_medical_std and not val_employment_std:
        return np.nan
    if not val_medical_std:
        return val_employment
    if not val_employment_std:
        return val_medical

    if attr in ('first_name', 'middle_name', 'last_name'):
        if val_medical_std == val_employment_std or is_similar(val_medical_std, val_employment_std):
            # Prefer the longer spelling (medical wins ties)
            return val_medical if len(val_medical_std) >= len(val_employment_std) else val_employment
        # Clearly different names: keep both
        return f"{val_medical} / {val_employment}"

    if attr == 'gender':
        gender_mapping = {'male': 'M', 'm': 'M', 'female': 'F', 'f': 'F'}
        gender_medical = gender_mapping.get(val_medical_std, val_medical_std.upper())
        gender_employment = gender_mapping.get(val_employment_std, val_employment_std.upper())
        return gender_medical if gender_medical == gender_employment else 'Unknown'

    if attr == 'birth_date':
        date_medical = parse_date_with_24_hour(val_medical)
        date_employment = parse_date_with_24_hour(val_employment)
        if pd.isnull(date_medical) and pd.isnull(date_employment):
            return np.nan
        if pd.isnull(date_employment):
            return date_medical.date()
        if pd.isnull(date_medical):
            return date_employment.date()
        # Compare calendar dates only; a differing time-of-day is not a different birthday
        if date_medical.date() == date_employment.date():
            return date_medical.date()

        # Cross-check with the age attributes if available
        age_medical = row.get('age_at_consultation', np.nan)
        age_employment = row.get('current_age', np.nan)
        if pd.notnull(age_medical) and pd.notnull(age_employment):
            # The parsed dates are tz-aware (UTC), so the reference instant must be
            # tz-aware too; subtracting a naive timestamp raises TypeError.
            # NOTE(review): age_at_consultation is measured against "now" here, not
            # against the consultation date -- confirm this is intended.
            now = pd.Timestamp.now(tz='UTC')
            expected_birth_medical = now - pd.Timedelta(days=age_medical * 365)
            expected_birth_employment = now - pd.Timedelta(days=age_employment * 365)
            diff_medical = abs((date_medical - expected_birth_medical).days)
            diff_employment = abs((date_employment - expected_birth_employment).days)
            return date_medical.date() if diff_medical <= diff_employment else date_employment.date()
        # No ages to cross-check: take the medical date unless it is implausibly old
        return date_medical.date() if date_medical.year > 1900 else date_employment.date()

    if attr in ('street_address', 'suburb', 'postcode', 'state'):
        # Use the address from the most recent event
        date_medical = parse_date_with_24_hour(row.get('consultation_timestamp', np.nan))
        date_employment = parse_date_with_24_hour(row.get('employment_timestamp', np.nan))
        if pd.notnull(date_medical) and pd.notnull(date_employment):
            return val_medical if date_medical >= date_employment else val_employment
        if pd.notnull(date_employment) and pd.isnull(date_medical):
            return val_employment
        # Only the medical timestamp is usable, or neither is: keep the medical value
        return val_medical

    if attr in ('phone', 'email'):
        # Keyed by the standardized form so values differing only in case/whitespace
        # collapse into one; dict insertion order keeps the output deterministic, and
        # str() keeps numeric phone values from breaking the join.
        contacts = {}
        for raw_value, std_value in ((val_medical, val_medical_std), (val_employment, val_employment_std)):
            contacts.setdefault(std_value, str(raw_value).strip())
        return '; '.join(contacts.values())

    # Other attributes: both sides are present here, the medical value is preferred
    return val_medical

# Resolve every shared attribute into a single unsuffixed column, then discard the
# per-source columns it was built from.
for attr in attributes_to_resolve:
    merged_dataset[attr] = merged_dataset.apply(resolve_attribute, axis=1, args=(attr,))
    merged_dataset = merged_dataset.drop(
        columns=[attr + '_medical', attr + '_employment'], errors='ignore'
    )

# Columns of each source that are neither the join key nor a resolved attribute
medical_attrs = [col for col in medical_dataset.columns if col not in ['ssn'] + attributes_to_resolve]
employment_attrs = [col for col in employment_dataset.columns if col not in ['ssn'] + attributes_to_resolve]

# Make sure each of those columns exists under its original name. A column that both
# sources share (and that was therefore suffixed by the merge) is coalesced: the value
# already present is kept and gaps are filled from the other source, instead of one
# source silently overwriting the other.
for source_attrs, suffix in ((medical_attrs, '_medical'), (employment_attrs, '_employment')):
    for attr in source_attrs:
        suffixed = attr + suffix
        if suffixed in merged_dataset.columns:
            if attr in merged_dataset.columns:
                merged_dataset[attr] = merged_dataset[attr].combine_first(merged_dataset[suffixed])
            else:
                merged_dataset[attr] = merged_dataset[suffixed]
            merged_dataset = merged_dataset.drop(columns=[suffixed])
        elif attr not in merged_dataset.columns:
            merged_dataset[attr] = np.nan

# merged_dataset now holds the resolved attributes plus all remaining source columns
merged_dataset.to_csv('merged_dataset_cleaned.csv', index=False)

# Optionally, display a sample of the cleaned dataset
print("\nSample of the cleaned and merged dataset:")
print(merged_dataset.head())
