In [None]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Customer Dataset
# - Load a custom CSV file named customer_data.csv .
# - Find any missing values in specific columns like 'Email' and 'Phone'.






# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Emails in Customer Dataset
# - Identify duplicate emails which might indicate duplicate customer records.







# Part 3: Generate a Data Quality Report

# Task 3: Customer Dataset Report
# - Summarize the data quality with missing values, duplicates, and inconsistencies for customer_data.csv .







In [3]:
import pandas as pd

# -------------------------------------
# Step 1: Create Sample customer_data.csv
# -------------------------------------
data = {
    'CustomerID': [1, 2, 3, 4, 5, 6, 7],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'],
    'Email': ['alice@example.com', 'bob@example.com', 'charlie@example.com',
              'bob@example.com', None, 'frank@example.com', 'grace@example.com'],
    'Phone': ['1234567890', '2345678901', None, '2345678901', '3456789012', None, '4567890123'],
    'Gender': ['F', 'M', 'Male', 'male', 'Female', 'female', 'M']
}

# Create and save DataFrame
df_sample = pd.DataFrame(data)
df_sample.to_csv('customer_data.csv', index=False)
print("✅ Sample 'customer_data.csv' file created.\n")

# -------------------------------------
# Step 2: Load the Dataset and Analyze
# -------------------------------------
try:
    df = pd.read_csv("customer_data.csv")
    print("📄 First 5 rows of the customer dataset:")
    print(df.head())

    # --- Part 1: Missing Values ---
    print("\n🔍 Missing Values in Key Columns:")
    print("Email:", df['Email'].isnull().sum())
    print("Phone:", df['Phone'].isnull().sum())

    # --- Part 2: Duplicate Emails ---
    if 'Email' in df.columns:
        duplicate_emails = df[df.duplicated(subset='Email', keep=False)]
        print(f"\n📛 Number of duplicate email entries: {len(duplicate_emails)}")
        if not duplicate_emails.empty:
            print("Sample duplicate email records:")
            print(duplicate_emails.sort_values('Email'))
        else:
            print("No duplicate emails found.")

    # --- Part 3: Data Quality Report ---
    print("\n📋 --- Customer Dataset Quality Report ---")

    # Missing values per column
    print("\n🧩 Missing Values Per Column:")
    print(df.isnull().sum())

    # Duplicate rows
    print(f"\n🧬 Total Duplicate Rows: {df.duplicated().sum()}")

    # Gender inconsistencies
    if 'Gender' in df.columns:
        print("\n🧠 Original 'Gender' values:")
        print(df['Gender'].value_counts(dropna=False))

        # Normalize Gender column
        df['Gender_cleaned'] = df['Gender'].str.strip().str.lower().replace({
            'm': 'male', 'f': 'female'
        })

        print("\n🧹 Cleaned 'Gender' values:")
        print(df['Gender_cleaned'].value_counts(dropna=False))

except FileNotFoundError:
    print("❌ Error: 'customer_data.csv' not found.")


✅ Sample 'customer_data.csv' file created.

📄 First 5 rows of the customer dataset:
   CustomerID     Name                Email         Phone  Gender
0           1    Alice    alice@example.com  1.234568e+09       F
1           2      Bob      bob@example.com  2.345679e+09       M
2           3  Charlie  charlie@example.com           NaN    Male
3           4    David      bob@example.com  2.345679e+09    male
4           5      Eve                  NaN  3.456789e+09  Female

🔍 Missing Values in Key Columns:
Email: 1
Phone: 2

📛 Number of duplicate email entries: 2
Sample duplicate email records:
   CustomerID   Name            Email         Phone Gender
1           2    Bob  bob@example.com  2.345679e+09      M
3           4  David  bob@example.com  2.345679e+09   male

📋 --- Customer Dataset Quality Report ---

🧩 Missing Values Per Column:
CustomerID    0
Name          0
Email         1
Phone         2
Gender        0
dtype: int64

🧬 Total Duplicate Rows: 0

🧠 Original 'Gender' value