In [None]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Customer Dataset
# - Load a custom CSV file named customer_data.csv .
# - Find any missing values in specific columns like 'Email' and 'Phone'.






# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Emails in Customer Dataset
# - Identify duplicate emails which might indicate duplicate customer records.







# Part 3: Generate a Data Quality Report

# Task 3: Customer Dataset Report
# - Summarize the data quality with missing values, duplicates, and inconsistencies for customer_data.csv .







In [6]:
import pandas as pd

# -------------------------------------
# Step 1: Create Sample customer_data.csv
# -------------------------------------
def create_sample_dataset():
    data = {
        'CustomerID': [1, 2, 3, 4, 5, 6, 7],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'],
        'Email': ['alice@example.com', 'bob@example.com', 'charlie@example.com',
                  'bob@example.com', None, 'frank@example.com', 'grace@example.com'],
        'Phone': ['1234567890', '2345678901', None, '2345678901', '3456789012', None, '4567890123'],
        'Gender': ['F', 'M', 'Male', 'male', 'Female', 'female', 'M']
    }

    df_sample = pd.DataFrame(data)
    df_sample.to_csv('customer_data.csv', index=False)
    print("✅ Sample 'customer_data.csv' file created.\n")

# -------------------------------------
# Gender Normalization Utility
# -------------------------------------
def clean_gender(value):
    if pd.isna(value):
        return 'unknown'
    val = str(value).strip().lower()
    if 'm' in val and 'f' not in val:
        return 'male'
    elif 'f' in val:
        return 'female'
    else:
        return 'other'

# -------------------------------------
# Main Analysis Function
# -------------------------------------
def analyze_customer_data(file_path='customer_data.csv'):
    try:
        df = pd.read_csv(file_path)

        print("📄 First 5 rows of the customer dataset:")
        print(df.head())

        print("\n🔍 Checking Missing Values in Key Columns:")
        print(df[['Email', 'Phone']].isnull().sum())

        # Identify duplicate emails
        if 'Email' in df.columns:
            duplicate_emails = df[df.duplicated(subset='Email', keep=False)]
            print(f"\n📛 Duplicate Emails Found: {len(duplicate_emails)}")
            if not duplicate_emails.empty:
                print(duplicate_emails[['CustomerID', 'Name', 'Email']])

        # Clean gender column
        if 'Gender' in df.columns:
            print("\n🧠 Original 'Gender' Value Counts:")
            print(df['Gender'].value_counts(dropna=False))

            df['Gender_cleaned'] = df['Gender'].apply(clean_gender)

            print("\n🧹 Cleaned 'Gender' Value Counts:")
            print(df['Gender_cleaned'].value_counts(dropna=False))

        # ---------------------
        # Data Quality Report
        # ---------------------
        print("\n📋 --- Data Quality Report ---")

        print("\n🧩 Missing Values Per Column:")
        print(df.isnull().sum())

        print("\n📈 Percentage of Missing Values:")
        print((df.isnull().mean() * 100).round(2))

        print("\n🔁 Duplicate Row Count:", df.duplicated().sum())

        print("\n📊 Unique Value Counts:")
        print(df.nunique())

        print("\n🧾 Data Types & Memory Usage:")
        df.info()

        print("\n📉 Descriptive Statistics (Numeric & Categorical):")
        print(df.describe(include='all').transpose())

    except FileNotFoundError:
        print("❌ Error: File not found. Make sure 'customer_data.csv' exists.")
    except pd.errors.EmptyDataError:
        print("❌ Error: The file is empty.")
    except pd.errors.ParserError:
        print("❌ Error: File format issue – could not parse.")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")

# -------------------------------------
# Run the pipeline
# -------------------------------------
create_sample_dataset()
analyze_customer_data()


✅ Sample 'customer_data.csv' file created.

📄 First 5 rows of the customer dataset:
   CustomerID     Name                Email         Phone  Gender
0           1    Alice    alice@example.com  1.234568e+09       F
1           2      Bob      bob@example.com  2.345679e+09       M
2           3  Charlie  charlie@example.com           NaN    Male
3           4    David      bob@example.com  2.345679e+09    male
4           5      Eve                  NaN  3.456789e+09  Female

🔍 Checking Missing Values in Key Columns:
Email    1
Phone    2
dtype: int64

📛 Duplicate Emails Found: 2
   CustomerID   Name            Email
1           2    Bob  bob@example.com
3           4  David  bob@example.com

🧠 Original 'Gender' Value Counts:
Gender
M         2
F         1
Male      1
male      1
Female    1
female    1
Name: count, dtype: int64

🧹 Cleaned 'Gender' Value Counts:
Gender_cleaned
male      4
female    3
Name: count, dtype: int64

📋 --- Data Quality Report ---

🧩 Missing Values Per Column: