In [None]:
s1=1237568
s2=1238823

In [10]:
import pandas as pd

# Sample data for the two datasets (replace these with actual datasets)
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Perform an inner join on the 'ssn' attribute to get common SSNs
merged_data = pd.merge(medical_data, employment_data, on='ssn', how='inner')
# tempRes= merged_data['ssn'].count()
# Get the number of unique SSNs in both datasets
common_ssns_count = merged_data['ssn'].nunique()

# SSNs unique to the medical dataset
unique_medical_ssns = medical_data[~medical_data['ssn'].isin(employment_data['ssn'])]['ssn'].nunique()

# SSNs unique to the employment dataset
unique_employment_ssns = employment_data[~employment_data['ssn'].isin(medical_data['ssn'])]['ssn'].nunique()

# Output the results
print(f"Number of unique SSNs common in both datasets: {common_ssns_count}")
print(f"Number of SSNs unique to the medical dataset: {unique_medical_ssns}")
print(f"Number of SSNs unique to the employment dataset: {unique_employment_ssns}")

# Save the merged dataset as a new CSV
merged_data.to_csv('merged_dataset1.csv', index=False)


Number of unique SSNs common in both datasets: 16005
Number of SSNs unique to the medical dataset: 3995
Number of SSNs unique to the employment dataset: 3185


task2.3

In [12]:
import pandas as pd

# Load the datasets (assuming they are already in CSV files)
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Find duplicate records based on 'ssn' in both datasets
medical_duplicates = medical_data[medical_data.duplicated(subset=['ssn'], keep=False)]
employment_duplicates = employment_data[employment_data.duplicated(subset=['ssn'], keep=False)]

# Count the number of duplicate records in each dataset
medical_duplicate_count = medical_duplicates['ssn'].nunique()
employment_duplicate_count = employment_duplicates['ssn'].nunique()

# Output the results
print(f"Number of unique SSNs with duplicate records in the medical dataset: {medical_duplicate_count}")
print(f"Number of unique SSNs with duplicate records in the employment dataset: {employment_duplicate_count}")


Number of unique SSNs with duplicate records in the medical dataset: 0
Number of unique SSNs with duplicate records in the employment dataset: 810


task2.4

In [2]:
import pandas as pd

# Load the datasets (assuming they are in CSV format)
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')


# Merge the datasets on the 'ssn' column using an inner join
merged_data = pd.merge(medical_data, employment_data, on='ssn', how='inner', suffixes=('_medical', '_employment'))

# Define the attributes to compare for inconsistencies (those present in both datasets)
attributes_to_compare = ['first_name', 'middle_name', 'last_name', 'birth_date', 'gender', 'street_address', 'suburb', 'postcode', 'state', 'phone', 'email']

# Initialize a dictionary to track inconsistencies for each attribute
inconsistencies = {attribute: 0 for attribute in attributes_to_compare}

# Loop through the attributes and count inconsistencies
for attribute in attributes_to_compare:
    # Compare the medical and employment columns for each attribute
    inconsistent_rows = merged_data[merged_data[f'{attribute}_medical'] != merged_data[f'{attribute}_employment']]
    
    # Count the number of SSNs with inconsistencies for this attribute
    inconsistencies[attribute] = inconsistent_rows['ssn'].nunique()

# Output the number of inconsistencies for each attribute
print("Inconsistencies per attribute:")
for attribute, count in inconsistencies.items():
    print(f"{attribute}: {count}")


Inconsistencies per attribute:
first_name: 0
middle_name: 3110
last_name: 88
birth_date: 13221
gender: 1631
street_address: 6952
suburb: 6851
postcode: 8649
state: 3199
phone: 9490
email: 7434


task 2.4 comprehensive

In [11]:
import pandas as pd

# Load the datasets (assuming they are in CSV format)
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Merge the datasets on the 'ssn' column using an inner join
merged_data = pd.merge(medical_data, employment_data, on='ssn', how='inner', suffixes=('_medical', '_employment'))

# Define the attributes to compare for inconsistencies (those present in both datasets)
attributes_to_compare = ['first_name', 'middle_name', 'last_name', 'birth_date', 'gender', 'street_address', 'suburb', 'postcode', 'state', 'phone', 'email']

# Initialize a dictionary to track inconsistencies for each attribute
inconsistencies = {attribute: 0 for attribute in attributes_to_compare}

# Loop through the attributes and count inconsistencies
for attribute in attributes_to_compare:
    # Compare the medical and employment columns for each attribute
    inconsistent_rows = merged_data[merged_data[f'{attribute}_medical'] != merged_data[f'{attribute}_employment']]
    
    # Count the number of SSNs with inconsistencies for this attribute
    inconsistencies[attribute] = inconsistent_rows['ssn'].nunique()

# Output the number of inconsistencies for each attribute
print("Inconsistencies per attribute:")
for attribute, count in inconsistencies.items():
    print(f"{attribute}: {count}")

# Now, let's resolve the inconsistencies based on specific strategies

def resolve_inconsistencies(row, attribute):
    medical_value = row[f'{attribute}_medical']
    employment_value = row[f'{attribute}_employment']
    
    # If they are the same, return either one (no inconsistency)
    if medical_value == employment_value:
        return medical_value
    
    # Prioritize recency if possible (use timestamp comparison)
    if 'consultation_timestamp' in row and 'employment_timestamp' in row:
        if row['consultation_timestamp'] > row['employment_timestamp']:
            return medical_value  # More recent medical data
        else:
            return employment_value  # More recent employment data
    
    # If recency can't be used, handle the inconsistency based on specific logic:
    if attribute in ['first_name', 'middle_name', 'last_name']:
        # Example: Normalize capitalization issues
        if medical_value.lower() == employment_value.lower():
            return medical_value.capitalize()  # Return normalized version
        else:
            # If names are significantly different, use manual intervention (flag the row for review)
            return f"Manual Review Needed for {medical_value} / {employment_value}"
    
    if attribute == 'birth_date':
        # Example: Normalize date formats if possible
        try:
            medical_date = pd.to_datetime(medical_value)
            employment_date = pd.to_datetime(employment_value)
            if medical_date == employment_date:
                return medical_date.strftime('%Y-%m-%d')
        except:
            pass
        return f"Manual Review Needed for {medical_value} / {employment_value}"
    
    # For other attributes (e.g., phone, address), just choose one or flag for review
    return f"Manual Review Needed for {medical_value} / {employment_value}"

# Apply the function to resolve inconsistencies for each relevant attribute
for attribute in attributes_to_compare:
    merged_data[f'{attribute}_resolved'] = merged_data.apply(resolve_inconsistencies, axis=1, args=(attribute,))

# Display the resolved data (you can also save it to a file)
print("Resolved data preview:")
print(merged_data[[f'{attr}_resolved' for attr in attributes_to_compare]].head())


Inconsistencies per attribute:
first_name: 0
middle_name: 3110
last_name: 88
birth_date: 13221
gender: 1631
street_address: 6952
suburb: 6851
postcode: 8649
state: 3199
phone: 9490
email: 7434
Resolved data preview:
  first_name_resolved middle_name_resolved last_name_resolved  \
0             matthew             mcfarrin            lovette   
1                john                  NaN             stultz   
2              carrie                 hall             greene   
3               diane                    j            gunther   
4              joseph                vance         fitzgerald   

  birth_date_resolved gender_resolved               street_address_resolved  \
0           5/12/1943               m    36  zouch  street  edward  street    
1            6/9/2004               m           533  north  road  longford    
2          17/10/1990               f       258  lyons  street  mater  dei    
3          25/11/1986               f     25  ney  road  hillcrest  garden   

task2.4 comprehensive 2.0, the similarity string (with extra library ) introduced

In [10]:
import pandas as pd
from difflib import SequenceMatcher

# Load the datasets (assuming they are in CSV format)
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Merge the datasets on the 'ssn' column using an inner join
merged_data = pd.merge(medical_data, employment_data, on='ssn', how='inner', suffixes=('_medical', '_employment'))

# Define the attributes to compare for inconsistencies (those present in both datasets)
attributes_to_compare = ['first_name', 'middle_name', 'last_name', 'birth_date', 'gender', 'street_address', 'suburb', 'postcode', 'state', 'phone', 'email']

# Similarity threshold
similarity_threshold = 0.8  # You can adjust this threshold based on the similarity required (0.8 = 80%)

# Function to compute string similarity using SequenceMatcher
def compute_similarity(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

def resolve_inconsistencies(row, attribute):
    medical_value = row[f'{attribute}_medical']
    employment_value = row[f'{attribute}_employment']
    
    # If they are exactly the same, return either one (no inconsistency)
    if medical_value == employment_value:
        return medical_value
    
    # Calculate similarity between the two values for name fields
    if attribute in ['first_name', 'middle_name', 'last_name']:
        similarity_score = compute_similarity(medical_value, employment_value)
        
        # If similarity is above the threshold, keep the more recent timestamp value
        if similarity_score >= similarity_threshold:
            if row['consultation_timestamp'] > row['employment_timestamp']:
                return medical_value  # More recent medical data
            else:
                return employment_value  # More recent employment data
        else:
            # If similarity is below the threshold, keep both names
            return f"{medical_value} / {employment_value}"
    
    # For other attributes (e.g., birth_date), we can handle recency or flag for manual review
    if attribute == 'birth_date':
        try:
            medical_date = pd.to_datetime(medical_value)
            employment_date = pd.to_datetime(employment_value)
            if medical_date == employment_date:
                return medical_date.strftime('%Y-%m-%d')
        except:
            pass
        return f"Manual Review Needed for {medical_value} / {employment_value}"
    
    # For other attributes like address or phone, prioritize more recent value if timestamps are available
    if 'consultation_timestamp' in row and 'employment_timestamp' in row:
        if row['consultation_timestamp'] > row['employment_timestamp']:
            return medical_value  # More recent medical data
        else:
            return employment_value  # More recent employment data
    
    # If no specific rule applies, flag for manual review
    return f"Manual Review Needed for {medical_value} / {employment_value}"

# Apply the function to resolve inconsistencies for each relevant attribute
 
for attribute in attributes_to_compare:
    merged_data[f'{attribute}_resolved'] = merged_data.apply(resolve_inconsistencies, axis=1, args=(attribute,))

# Display the resolved data (you can also save it to a file)
print("Resolved data preview:")
print(merged_data[[f'{attr}_resolved' for attr in attributes_to_compare]].head())


TypeError: 'float' object is not iterable

2.4. extra different middle name

In [12]:
import pandas as pd

# Load the datasets (assuming they are in CSV format)
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Merge the datasets on the 'ssn' column using an inner join
merged_data = pd.merge(medical_data, employment_data, on='ssn', how='inner', suffixes=('_medical', '_employment'))

# Find rows where the SSN is the same, but the middle name is different
different_middle_name = merged_data[merged_data['middle_name_medical'] != merged_data['middle_name_employment']]

# Output the filtered rows
print("Rows where SSN is the same but middle name is different:")
print(different_middle_name[['ssn', 'middle_name_medical', 'middle_name_employment']])

# Optional: Save the result to a CSV file
different_middle_name[['ssn', 'middle_name_medical', 'middle_name_employment']].to_csv('ssn_diff_middle_name.csv', index=False)


Rows where SSN is the same but middle name is different:
              ssn middle_name_medical middle_name_employment
1      e141846696              foster                    NaN
4      e145580195                 NaN                  vance
10     d165618464                 NaN               pauletta
35     i150642589               wayne                    NaN
36     g129771524                 NaN                 kappas
...           ...                 ...                    ...
16792  e169611223                 NaN                  jacob
16795  a165306738                 NaN                charles
16797  c155601802               marie                    NaN
16801  a184970069               wayne                    NaN
16804  g191759363                enez                    NaN

[3202 rows x 3 columns]


2.4. extra different email

In [13]:
import pandas as pd

# Load the datasets (assuming they are in CSV format)
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Merge the datasets on 'ssn' using an inner join
merged_data = pd.merge(medical_data, employment_data, on='ssn', how='inner', suffixes=('_medical', '_employment'))

# Filter rows where ssn is the same but email is different
different_email_records = merged_data[merged_data['email_medical'] != merged_data['email_employment']]

# Output the records with the same SSN but different emails
print("Records with the same SSN but different emails:")
print(different_email_records[['ssn', 'email_medical', 'email_employment']])

# Optionally, you can save these records to a CSV file
different_email_records[['ssn', 'email_medical', 'email_employment']].to_csv('ssn_different_email_records.csv', index=False)


Records with the same SSN but different emails:
              ssn           email_medical              email_employment
6      i131945217  ofgmgcaoqj.hotmail.com        ofgmgcaoqj@hotmail.com
7      c143442126                     NaN          polk.allen@gmail.com
8      g154869706                     NaN           transou72@gmail.com
14     b159377382  moore.richard@mail.com                           NaN
16     i172782745                     NaN       irvine.darrin@gmail.com
...           ...                     ...                           ...
16801  a184970069      snipes97.yahoo.com            snipes97@yahoo.com
16804  g191759363                     NaN  finkelstein.jessica@mail.com
16808  d136644076       robbian91.aol.com             robbian91@aol.com
16809  h185183775                     NaN               green25@aol.com
16813  a124600434     isenkcafgt@mail.com                           NaN

[7755 rows x 3 columns]


2.4. extra different email (both not null)

In [14]:
import pandas as pd

# Load the datasets (assuming they are in CSV format)
medical_data = pd.read_csv('data_wrangling_medical_2024_u7568823.csv')
employment_data = pd.read_csv('data_wrangling_education_2024_u7568823.csv')

# Merge the datasets on 'ssn' using an inner join
merged_data = pd.merge(medical_data, employment_data, on='ssn', how='inner', suffixes=('_medical', '_employment'))

# Filter rows where both emails are not null and are different
non_null_email_records = merged_data[
    (merged_data['email_medical'].notna()) &  # Medical email is not null
    (merged_data['email_employment'].notna()) &  # Employment email is not null
    (merged_data['email_medical'] != merged_data['email_employment'])  # Emails are different
]

# Output the records with the same SSN but different emails
print("Records with the same SSN but different emails (and non-null emails):")
print(non_null_email_records[['ssn', 'email_medical', 'email_employment']])

# Optionally, you can save these records to a CSV file
non_null_email_records[['ssn', 'email_medical', 'email_employment']].to_csv('ssn_different_non_null_email_records.csv', index=False)


Records with the same SSN but different emails (and non-null emails):
              ssn            email_medical         email_employment
6      i131945217   ofgmgcaoqj.hotmail.com   ofgmgcaoqj@hotmail.com
19     g197247139     gtcnoncbqr.yahoo.com     gtcnoncbqr@yahoo.com
27     d113054410     ihaijdejck.yahoo.com     ihaijdejck@yahoo.com
47     c126684873  ang.to'chen.hotmail.com  ang.to'chen@hotmail.com
55     f166206248      mfsckakihe.mail.com      mfsckakihe@mail.com
...           ...                      ...                      ...
16752  h131700505   lunsford67.mail.com.au   lunsford67@mail.com.au
16761  f143373531     thompson90.yahoo.com     thompson90@yahoo.com
16794  e155951170    august100.mail.com.au    august100@mail.com.au
16801  a184970069       snipes97.yahoo.com       snipes97@yahoo.com
16808  d136644076        robbian91.aol.com        robbian91@aol.com

[1627 rows x 3 columns]
