In [3]:
import pandas as pd
import os

# Data-quality walkthrough on a small, self-contained customer table:
# write a sample CSV, load it, count missing contact fields, flag rows that
# share an e-mail address, and print a short summary report.


def count_missing(frame: pd.DataFrame, column: str):
    """Count null entries in one column of ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table to inspect.
    column : str
        Name of the column to check.

    Returns
    -------
    The number of null values in ``column``, or ``None`` when ``frame`` has
    no column with that name.
    """
    if column not in frame.columns:
        return None
    return frame[column].isnull().sum()


def find_duplicate_emails(frame: pd.DataFrame, column: str = "Email") -> pd.DataFrame:
    """Return the rows of ``frame`` whose e-mail is shared with another row.

    Addresses are compared after trimming surrounding whitespace and
    lower-casing, so ``"Bob@Example.com "`` and ``"bob@example.com"`` count as
    the same address. Rows with a null e-mail are never reported. The returned
    rows keep their original (un-normalised) values and all columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Table to inspect.
    column : str, default "Email"
        Name of the column holding the e-mail address.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``frame``.
    """
    # Cast to the nullable string dtype first: a column that is entirely
    # empty is read as float NaN, on which the ``.str`` accessor would fail.
    normalized = frame[column].astype("string").str.strip().str.lower()
    # ``duplicated`` treats nulls as equal to each other, so mask them out
    # explicitly; otherwise every row without an e-mail would be flagged.
    shared = normalized.notna() & normalized.duplicated(keep=False)
    return frame[shared]


# Step 1: Create a sample customer_data.csv file for testing
sample_data = """CustomerID,Name,Email,Phone,Country
1,Alice,alice@example.com,123-456-7890,USA
2,Bob,bob@example.com,234-567-8901,Canada
3,Charlie,charlie@example.com,,USA
4,Daisy,,345-678-9012,UK
5,Eve,eve@example.com,456-789-0123,Canada
6,Frank,bob@example.com,234-567-8901,Canada
7,Grace,grace@example.com,567-890-1234,USA
8,Hank,,,
"""

# Save sample CSV to current directory. The encoding is spelled out so the
# file does not depend on the platform's default text encoding.
csv_filename = "customer_data.csv"
with open(csv_filename, "w", encoding="utf-8") as f:
    f.write(sample_data)

print(f"Sample data saved to {os.path.abspath(csv_filename)}")

# Step 2: Load the CSV file
df = pd.read_csv(csv_filename)

# Step 3: Check missing values in 'Email' and 'Phone' columns
# (None means the column itself is absent from the dataset)
missing_email = count_missing(df, 'Email')
missing_phone = count_missing(df, 'Phone')

print("\nMissing values:")
print(f"Email: {missing_email}")
print(f"Phone: {missing_phone}")

# Step 4: Identify duplicate emails (potential duplicate customers)
if 'Email' in df.columns:
    duplicate_emails = find_duplicate_emails(df, 'Email')
    print("\nDuplicate Emails (possible duplicate records):")
    if duplicate_emails.empty:
        print("No duplicate emails found.")
    else:
        # Show only the identifying columns that actually exist, so a
        # dataset without 'CustomerID' or 'Name' does not raise KeyError.
        preview_columns = [
            col for col in ['CustomerID', 'Name', 'Email']
            if col in duplicate_emails.columns
        ]
        print(duplicate_emails[preview_columns])
else:
    print("\nNo 'Email' column found in the dataset.")

# Step 5: Generate data quality summary report
total_records = len(df)
# Counts rows that are identical across *every* column (including CustomerID).
total_duplicates = df.duplicated().sum()
columns_with_missing = df.columns[df.isnull().any()].tolist()

print("\nData Quality Summary Report:")
print(f"Total records: {total_records}")
print(f"Total duplicate rows: {total_duplicates}")
print(f"Columns with missing values: {columns_with_missing}")

Sample data saved to /workspaces/AI_DATA_ANALYSIS_/src/Module 3/Hands-on - Data Quality Assessment & Profiling/customer_data.csv

Missing values:
Email: 2
Phone: 2

Duplicate Emails (possible duplicate records):
   CustomerID   Name            Email
1           2    Bob  bob@example.com
5           6  Frank  bob@example.com

Data Quality Summary Report:
Total records: 8
Total duplicate rows: 0
Columns with missing values: ['Email', 'Phone', 'Country']
