In [1]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Customer Dataset
# - Load a custom CSV file named `customer_data.csv`.
# - Find any missing values in specific columns like 'Email' and 'Phone'.






# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Emails in Customer Dataset
# - Identify duplicate emails which might indicate duplicate customer records.







# Part 3: Generate a Data Quality Report

# Task 3: Customer Dataset Report
# - Summarize the data quality with missing values, duplicates, and inconsistencies for `customer_data.csv`.







In [2]:
import pandas as pd
from io import StringIO

# Inline sample standing in for 'customer_data.csv'. It contains rows with a
# missing email, rows with a missing phone, and one email shared by two customers.
csv_data = """
CustomerID,Name,Email,Phone,Age,Country
1,John Doe,john.doe@example.com,123-456-7890,28,USA
2,Jane Smith,jane.smith@example.com,,32,Canada
3,Bob Johnson,bob.johnson@example.com,987-654-3210,45,USA
4,Alice Williams,alice.williams@example.com,555-555-5555,29,UK
5,Chris Green,chris.green@example.com,123-456-7890,40,USA
6,Eve Davis,jane.smith@example.com,444-444-4444,38,Canada
7,Frank Moore,,333-333-3333,50,UK
8,Gina Lee,gina.lee@example.com,,27,USA
9,Hank Kim,hank.kim@example.com,222-222-2222,35,USA
10,Ivy Chen,ivy.chen@example.com,111-111-1111,31,Canada
"""


def normalize_emails(emails: pd.Series) -> pd.Series:
    """Return `emails` with surrounding whitespace removed and letters lowercased.

    Missing values stay missing. The input Series is not modified.
    """
    return emails.str.strip().str.lower()


def find_duplicate_emails(customers: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `customers` whose email is shared with another row.

    Emails are compared after `normalize_emails`, so addresses that differ only
    in letter case or surrounding whitespace count as the same address. Every
    row of a duplicated group is returned (keep=False), and the `notna()` mask
    keeps rows with a missing email out of the result.
    """
    normalized = normalize_emails(customers['Email'])
    is_shared = normalized.duplicated(keep=False) & normalized.notna()
    return customers[is_shared]


# Load the dataset from the string (simulates reading the CSV file from disk)
df = pd.read_csv(StringIO(csv_data))

# --- Part 1: Load Dataset & Check Missing Values ---

print("Task 1: Check missing values in 'Email' and 'Phone' columns:")
missing_email = df['Email'].isnull().sum()
missing_phone = df['Phone'].isnull().sum()
print(f"Missing Emails: {missing_email}")
print(f"Missing Phones: {missing_phone}\n")

# --- Part 2: Identify Duplicate Emails ---

print("Task 2: Duplicate emails in dataset:")
# Compare normalized emails so that case/whitespace variants of one address
# are reported as duplicates too, not only byte-identical strings.
duplicate_emails = find_duplicate_emails(df)
print(duplicate_emails[['CustomerID', 'Name', 'Email']])
print()

# --- Part 3: Generate Data Quality Report ---

print("Task 3: Data Quality Report Summary")

# Missing values summary
print("\nMissing values per column:")
print(df.isnull().sum())

# Duplicate rows summary (based on all columns)
duplicates_all = df[df.duplicated()]
print(f"\nTotal duplicate rows (all columns): {len(duplicates_all)}")

# Duplicate emails summary: counts every row that shares its email with another row
num_duplicate_emails = duplicate_emails.shape[0]
print(f"Total duplicate emails found: {num_duplicate_emails}")

# Check for inconsistencies: a drop in the unique-email count after cleaning
# means some emails differed only by leading/trailing spaces or letter case.
print("\nChecking inconsistencies in 'Email' column:")
emails_original = df['Email'].dropna().unique()
print(f"Unique emails before cleaning ({len(emails_original)}): {emails_original}")

# Clean emails: strip spaces and lowercase. The column is added only here, after
# the per-column missing-value summary above, so that summary lists the original
# columns only.
df['Email_cleaned'] = normalize_emails(df['Email'])
emails_cleaned = df['Email_cleaned'].dropna().unique()
print(f"Unique emails after cleaning ({len(emails_cleaned)}): {emails_cleaned}")

print("\nData quality report completed.")


Task 1: Check missing values in 'Email' and 'Phone' columns:
Missing Emails: 1
Missing Phones: 2

Task 2: Duplicate emails in dataset:
   CustomerID        Name                   Email
1           2  Jane Smith  jane.smith@example.com
5           6   Eve Davis  jane.smith@example.com

Task 3: Data Quality Report Summary

Missing values per column:
CustomerID    0
Name          0
Email         1
Phone         2
Age           0
Country       0
dtype: int64

Total duplicate rows (all columns): 0
Total duplicate emails found: 2

Checking inconsistencies in 'Email' column:
Unique emails before cleaning (8): ['john.doe@example.com' 'jane.smith@example.com' 'bob.johnson@example.com'
 'alice.williams@example.com' 'chris.green@example.com'
 'gina.lee@example.com' 'hank.kim@example.com' 'ivy.chen@example.com']
Unique emails after cleaning (8): ['john.doe@example.com' 'jane.smith@example.com' 'bob.johnson@example.com'
 'alice.williams@example.com' 'chris.green@example.com'
 'gina.lee@example.com'