In [9]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.






# 20. Testing for Expectation:
# - Create expectations such as “column values must fall within a certain range.”






# 21. Generating Data Docs:
# - Automatically generate data quality documentation.








In [10]:
# Step 1: Import Great Expectations
import great_expectations as ge

# Step 2: Load your dataset
# Replace 'path/to/your/data.csv' with the actual path to your CSV file
data = ge.read_csv("path/to/your/data.csv")

# Step 3: Create a new expectation suite
suite_name = "my_expectation_suite"
data.create_expectation_suite(suite_name)

# Step 4: Define expectations
# Example expectations
data.expect_column_values_to_be_between("column_name", min_value=0, max_value=100)
data.expect_column_values_to_be_in_set("column_name", ["value1", "value2", "value3"])

# Step 5: Validate the dataset against the expectation suite
results = data.validate(expectation_suite=suite_name)

# Step 6: List unmet expectations
unmet_expectations = [result for result in results['results'] if not result['success']]
print("Unmet Expectations:", unmet_expectations)

# Step 7: Generate data quality documentation
# This step is typically done in the command line, but you can also do it programmatically
# great_expectations docs build
# great_expectations docs serve


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/your/data.csv'

In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.








# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.










In [None]:
import pandas as pd

# Sample customer data
df = pd.DataFrame({
    "customer_id": [1, 2, 3, 4, 5, 5],
    "name": ["Alice", "Bob", "Charlie", "David", "Eve", "Eve"],
    "email": ["alice@example.com", "bob@example.com", "charlie@example.com", "david@example.com", None, None],
    "age": [25, 17, 45, 62, 35, 35],
    "phone": ["1234567890", "0987654321", "123456789", "1112223333", "abcdefghij", "abcdefghij"]
})
print("🔍 Null value summary:")
print(df.isnull().sum())

print("\n📊 Data types:")
print(df.dtypes)

print("\n📈 Descriptive statistics:")
print(df.describe())
duplicates = df[df.duplicated(subset=["customer_id", "email"], keep=False)]
print("❌ Duplicates found:")
print(duplicates)
invalid_age = df[(df["age"] < 18) | (df["age"] > 65)]
print("\n❌ Invalid age values:")
print(invalid_age)
import re

def is_valid_phone(val):
    return bool(re.fullmatch(r"\d{10}", str(val)))

df["phone_valid"] = df["phone"].apply(is_valid_phone)
invalid_phone = df[~df["phone_valid"]]
print("\n❌ Invalid phone numbers:")
print(invalid_phone[["phone"]])
def dq_score(col):
    total = len(col)
    nulls = col.isnull().sum()
    unique_ratio = col.nunique() / total
    score = 100 - (nulls / total) * 50 - (1 - unique_ratio) * 50
    return round(score, 2)

print("\n✅ DQ Scores by Column:")
for column in df.columns[:-1]:
    print(f"{column}: {dq_score(df[column])}/100")
