In [None]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Customer Dataset
# - Load a custom CSV file named customer_data.csv.
# - Find any missing values in specific columns like 'Email' and 'Phone'.






# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Emails in Customer Dataset
# - Identify duplicate emails which might indicate duplicate customer records.







# Part 3: Generate a Data Quality Report

# Task 3: Customer Dataset Report
# - Summarize the data quality of customer_data.csv: missing values, duplicates, and inconsistencies.







In [1]:
import pandas as pd

# Read the customer records from the CSV file into a DataFrame.
customer_df = pd.read_csv(
    '/workspaces/AI_DATA_ANALYSIS_/src/Module 3/Hands-on - Data Quality Assessment & Profiling/customer_data.csv'
)

# Build a boolean mask of null entries for each contact column, then count the True values.
email_is_missing = customer_df['Email'].isna()
phone_is_missing = customer_df['Phone'].isna()
missing_email = email_is_missing.sum()
missing_phone = phone_is_missing.sum()

# Report the two counts, one line per column.
print(f"Missing values in 'Email' column: {missing_email}")
print(f"Missing values in 'Phone' column: {missing_phone}")


Missing values in 'Email' column: 1
Missing values in 'Phone' column: 1


In [2]:
# Find customers that share an email address, which may indicate duplicate records.
#
# Rows with a missing email are excluded: duplicated() treats NaN values as equal
# to each other, so several customers with no email would otherwise be listed as
# duplicates of one another, while nunique() below ignores NaN -- the printed
# count and the printed listing would then disagree. A missing email is a
# missing-value problem, not a duplicate.
has_email = customer_df['Email'].notna()

# keep=False marks every member of a duplicate group True, not just the later repeats.
shares_email = customer_df.duplicated(subset=['Email'], keep=False)

duplicate_emails = customer_df[has_email & shares_email]

# nunique() counts distinct duplicated addresses, not the number of rows involved.
print(f"Number of duplicate emails: {duplicate_emails['Email'].nunique()}")
print("Duplicate email records:")
print(duplicate_emails[['Email']])


Number of duplicate emails: 1
Duplicate email records:
               Email
0  alice@example.com
6  alice@example.com


In [3]:
# Build and print a data quality report for customer_df covering missing values,
# fully duplicated rows, and unexpected values in the 'Gender' column.

# Missing values per column
missing_values = customer_df.isnull().sum()

# Duplicate rows count (only rows identical in every column are counted)
num_duplicate_rows = customer_df.duplicated().sum()

# Inconsistency check: 'Gender' is expected to contain only 'Male' or 'Female'
# (compared case-insensitively).
if 'Gender' in customer_df.columns:
    gender_normalized = customer_df['Gender'].str.lower()
    allowed_genders = {'male', 'female'}
    # A missing value fails isin() and would be negated into "inconsistent".
    # Missing entries are already counted under missing values above, so they
    # are masked out here to avoid reporting the same cell twice.
    gender_present = customer_df['Gender'].notna()
    is_inconsistent = gender_present & ~gender_normalized.isin(allowed_genders)
    inconsistent_genders = customer_df[is_inconsistent]
else:
    inconsistent_genders = pd.DataFrame()  # empty if no Gender column

# Print Data Quality Report
print("=== Customer Dataset Data Quality Report ===\n")

print("Missing Values per Column:")
print(missing_values)

print(f"\nNumber of Duplicate Rows: {num_duplicate_rows}")

# The empty-frame branch covers both "column absent" and "no bad values found".
if not inconsistent_genders.empty:
    print("\nInconsistent 'Gender' entries:")
    print(inconsistent_genders[['Gender']])
else:
    print("\nNo inconsistencies detected in 'Gender' column or column not present.")



=== Customer Dataset Data Quality Report ===

Missing Values per Column:
CustomerID    0
Name          0
Email         1
Phone         1
Gender        0
dtype: int64

Number of Duplicate Rows: 0

Inconsistent 'Gender' entries:
  Gender
3      M
4      F
