In [None]:
# Common Data Errors Examples

# 1. Missing Data:
# Task 1: Review a dataset where some customer emails are missing. Identify how
# many records are incomplete.
# Task 2: Examine a sales dataset with missing transaction dates and determine the
# percentage of missing data.
# Task 3: Identify missing department information in an employee registry.






# 2. Duplicate Data:
# Task 1: Analyze a customer dataset with duplicate entries and count the number of
# duplicates.
# Task 2: Review supplier data and identify any repeated supplier names.
# Task 3: Examine a product inventory list for duplicates in product IDs.






# 3. Inconsistent Formatting:
# Task 1: Spot inconsistencies in date formats (e.g., DD/MM/YYYY vs. MM/DD/YYYY)
# in a dataset.
# Task 2: Identify phone numbers with varying formats in a contact list.
# Task 3: Review address data for discrepancies in state abbreviations (e.g., CA vs.
# Calif.).





# 4. Data Drift:
# Task 1: Compare monthly revenues over six months to identify data drift.
# Task 2: Analyze user engagement metrics from a web application over different
# quarters.
# Task 3: Review a stock price dataset to detect any anomalies over a year.





In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("/workspaces/AI_DATA_ANALYSIS_/src/Module 3/common_data_errors_example.csv")

# ----------------------
# 1. Missing Data
# ----------------------
# Task 1: Missing customer emails
missing_emails = df['email'].isnull().sum()

# Task 2: Missing transaction dates (percentage)
missing_transaction_dates_pct = df['transaction_date'].isnull().mean() * 100

# Task 3: Missing department information
missing_departments = df['department'].isnull().sum()

# ----------------------
# 2. Duplicate Data
# ----------------------
# Task 1: Duplicate customer entries
duplicate_records = df.duplicated().sum()

# Task 2: Repeated supplier names
duplicate_supplier_names = df[df.duplicated(subset='supplier_name')].shape[0]

# Task 3: Duplicate product IDs
duplicate_product_ids = df[df.duplicated(subset='product_id')].shape[0]

# ----------------------
# 3. Inconsistent Formatting
# ----------------------
# Task 1: Inconsistent date formats
# Convert date column and count parsing errors
df['date_parsed'] = pd.to_datetime(df['date_column'], errors='coerce')
inconsistent_dates = df['date_parsed'].isnull().sum()

# Task 2: Varying phone number formats
df['phone_cleaned'] = df['phone'].str.replace(r'\D', '', regex=True)
unique_phone_formats = df['phone_cleaned'].nunique()

# Task 3: State abbreviation discrepancies
unique_states = df['state'].nunique()
state_variants = df['state'].value_counts().to_dict()

# ----------------------
# 4. Data Drift
# ----------------------
# Task 1: Monthly revenue totals
monthly_revenue = df.groupby('month')['revenue'].sum().to_dict()

# Task 2: Quarterly engagement metrics
quarterly_engagement = df.groupby('quarter')['engagement_score'].mean().to_dict()

# Task 3: Stock price anomalies (mean and std by month)
monthly_price_stats = df.groupby('month')['price'].agg(['mean', 'std']).to_dict()

# Print results
print("Missing Emails:", missing_emails)
print("Missing Transaction Dates (%):", missing_transaction_dates_pct)
print("Missing Departments:", missing_departments)

print("Duplicate Records:", duplicate_records)
print("Duplicate Supplier Names:", duplicate_supplier_names)
print("Duplicate Product IDs:", duplicate_product_ids)

print("Inconsistent Date Formats:", inconsistent_dates)
print("Unique Phone Formats:", unique_phone_formats)
print("Unique States:", unique_states)
print("State Variants:", state_variants)

print("Monthly Revenue:", monthly_revenue)
print("Quarterly Engagement:", quarterly_engagement)
print("Monthly Price Stats:", monthly_price_stats)


Missing Emails: 10
Missing Transaction Dates (%): 14.000000000000002
Missing Departments: 26
Duplicate Records: 0
Duplicate Supplier Names: 95
Duplicate Product IDs: 80
Inconsistent Date Formats: 66
Unique Phone Formats: 100
Unique States: 3
State Variants: {'Calif.': 37, 'CA': 34, 'California': 29}
Monthly Revenue: {'Apr': 58201, 'Feb': 51835, 'Jan': 34629, 'Jun': 61921, 'Mar': 57252, 'May': 49394}
Quarterly Engagement: {'Q1': 0.5906238733129106, 'Q2': 0.4797944867229602, 'Q3': 0.4179876016766743, 'Q4': 0.4746715260075692}
Monthly Price Stats: {'mean': {'Apr': 276.0799924055242, 'Feb': 332.50496106094505, 'Jan': 251.7825858154367, 'Jun': 298.99203787189316, 'Mar': 268.36298987576777, 'May': 301.2546236579025}, 'std': {'Apr': 170.19120897674134, 'Feb': 116.02378466567545, 'Jan': 122.65175350848085, 'Jun': 133.37682752462663, 'Mar': 135.07266555135038, 'May': 92.58567045159683}}
