In [1]:
# Common Data Errors Examples

# 1. Missing Data:
# Task 1: Review a dataset where some customer emails are missing. Identify how
# many records are incomplete.
# Task 2: Examine a sales dataset with missing transaction dates and determine the
# percentage of missing data.
# Task 3: Identify missing department information in an employee registry.
import pandas as pd
data = {
    'customer_id': [1, 2, 3, 4, 5],
    'customer_name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'email': ['alice@example.com', None, 'charlie@example.com', None, 'eve@example.com']
}
df_customers = pd.DataFrame(data)
missing_emails = df_customers['email'].isnull().sum()
print(f"Number of missing email addresses: {missing_emails}")
employee_data = {
    'employee_id': [1, 2, 3, 4, 5],
    'employee_name': ['John', 'Jane', 'Jim', 'Jack', 'Jill'],
    'department': ['HR', None, 'Sales', None, 'IT']
}
df_employees = pd.DataFrame(employee_data)
missing_departments = df_employees['department'].isnull().sum()
print(f"Number of missing department entries: {missing_departments}")
sales_data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2023-04-01', None, '2023-04-03', None, '2023-04-05'],
    'amount': [250, 300, 150, 450, 600]
}
df_sales = pd.DataFrame(sales_data)

missing_dates_percentage = df_sales['transaction_date'].isnull().mean() * 100
print(f"Percentage of missing transaction dates: {missing_dates_percentage:.2f}%")


#
import pandas as pd

# Task 1: Analyze a customer dataset with duplicate entries and count the number of duplicates
customer_data = {
    'customer_id': [1, 2, 2, 4, 5],
    'customer_name': ['Alice', 'Bob', 'Bob', 'David', 'Eve'],
    'email': ['alice@example.com', 'bob@example.com', 'bob@example.com', 'david@example.com', 'eve@example.com']
}

df_customers = pd.DataFrame(customer_data)
duplicates_customers = df_customers.duplicated().sum()
print(f"Number of duplicate customer records: {duplicates_customers}")

# Task 2: Review supplier data and identify any repeated supplier names
supplier_data = {
    'supplier_id': [1, 2, 3, 4, 5],
    'supplier_name': ['ABC Corp', 'XYZ Ltd', 'ABC Corp', 'LMN Inc', 'XYZ Ltd']
}

df_suppliers = pd.DataFrame(supplier_data)
duplicates_suppliers = df_suppliers['supplier_name'].duplicated().sum()
print(f"Number of repeated supplier names: {duplicates_suppliers}")

# Task 3: Examine a product inventory list for duplicates in product IDs
product_data = {
    'product_id': [101, 102, 103, 102, 105],
    'product_name': ['Laptop', 'Mouse', 'Keyboard', 'Mouse', 'Monitor']
}

df_products = pd.DataFrame(product_data)
duplicates_products = df_products['product_id'].duplicated().sum()
print(f"Number of duplicate product IDs: {duplicates_products}")



#
import pandas as pd

# Task 1: Spot inconsistencies in date formats (e.g., DD/MM/YYYY vs. MM/DD/YYYY)
date_data = {
    'order_id': [1, 2, 3, 4, 5],
    'order_date': ['12/01/2023', '01/12/2023', '03/05/2023', '2023-04-10', '05/11/2023']
}

df_dates = pd.DataFrame(date_data)
df_dates['order_date'] = pd.to_datetime(df_dates['order_date'], errors='coerce')
inconsistent_dates = df_dates[df_dates['order_date'].isna()]
print(f"Inconsistent date formats: {inconsistent_dates}")

# Task 2: Identify phone numbers with varying formats in a contact list
phone_data = {
    'contact_name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'phone_number': ['123-456-7890', '(123) 456-7890', '1234567890', '123.456.7890', '456-7890']
}

df_phones = pd.DataFrame(phone_data)
df_phones['phone_number'] = df_phones['phone_number'].str.replace(r'\D', '', regex=True)
inconsistent_phones = df_phones[df_phones['phone_number'].str.len() != 10]
print(f"Inconsistent phone number formats: {inconsistent_phones}")

# Task 3: Review address data for discrepancies in state abbreviations (e.g., CA vs. Calif.)
address_data = {
    'customer_name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'address': ['123 Main St, CA', '456 Oak St, California', '789 Pine St, CA', '101 Maple St, Calif.', '202 Birch St, CA']
}

df_addresses = pd.DataFrame(address_data)
state_discrepancies = df_addresses[~df_addresses['address'].str.contains(r'\b(CA|California|Calif.)\b')]
print(f"State abbreviation discrepancies: {state_discrepancies}")




#
import pandas as pd
import numpy as np

# Task 1: Compare monthly revenues over six months to identify data drift
revenue_data = {
    'month': ['January', 'February', 'March', 'April', 'May', 'June'],
    'revenue': [5000, 5100, 5200, 4900, 5300, 6000]
}

df_revenue = pd.DataFrame(revenue_data)
revenue_diff = df_revenue['revenue'].pct_change().dropna()
data_drift_revenue = revenue_diff.abs().mean()
print(f"Average monthly revenue drift: {data_drift_revenue:.2f}")

# Task 2: Analyze user engagement metrics from a web application over different quarters
user_engagement_data = {
    'quarter': ['Q1', 'Q2', 'Q3', 'Q4'],
    'active_users': [1200, 1300, 1250, 1400],
    'page_views': [4500, 4600, 4700, 4900]
}

df_engagement = pd.DataFrame(user_engagement_data)
user_engagement_diff = df_engagement[['active_users', 'page_views']].pct_change().dropna()
data_drift_engagement = user_engagement_diff.abs().mean()
print(f"Average user engagement drift: {data_drift_engagement.mean():.2f}")

# Task 3: Review a stock price dataset to detect any anomalies over a year
stock_data = {
    'date': pd.date_range(start='2023-01-01', periods=12, freq='M'),
    'stock_price': [100, 105, 110, 115, 120, 130, 125, 135, 140, 150, 145, 160]
}

df_stock = pd.DataFrame(stock_data)
stock_diff = df_stock['stock_price'].pct_change().dropna()
anomalies_stock = stock_diff[stock_diff.abs() > 0.1]  # Threshold for anomalies (10%)
print(f"Stock price anomalies: {anomalies_stock}")





Number of missing email addresses: 2
Number of missing department entries: 2
Percentage of missing transaction dates: 40.00%
Number of duplicate customer records: 1
Number of repeated supplier names: 2
Number of duplicate product IDs: 1
Inconsistent date formats:    order_id order_date
3         4        NaT
Inconsistent phone number formats:   contact_name phone_number
4          Eve      4567890
State abbreviation discrepancies:   customer_name               address
3         David  101 Maple St, Calif.
Average monthly revenue drift: 0.06
Average user engagement drift: 0.05
Stock price anomalies: 11    0.103448
Name: stock_price, dtype: float64


  state_discrepancies = df_addresses[~df_addresses['address'].str.contains(r'\b(CA|California|Calif.)\b')]
  'date': pd.date_range(start='2023-01-01', periods=12, freq='M'),
