In [12]:
import pandas as pd
import great_expectations as ge
from great_expectations.validator.validator import Validator
from great_expectations.expectations.expectation import ExpectationConfiguration
from great_expectations.core.expectation_suite import ExpectationSuite

# Step 1: Create sample data
data = {
    "EmployeeID": [101, 102, 103, 104, 105],
    "Age": [25, 30, 28, -1, 45],              # -1 is invalid
    "Email": ["a@x.com", "b@x.com", "invalid", "d@x.com", "e@x.com"],
    "Salary": [50000, 60000, 55000, 70000, None]  # Missing salary
}
df = pd.DataFrame(data)

# Step 2: Create a Validator directly
suite = ExpectationSuite(expectation_suite_name="employee_suite")
validator = Validator(execution_engine=ge.execution_engine.PandasExecutionEngine(), batches=[df], expectation_suite=suite)

# Step 3: Add expectations
validator.expect_column_to_exist("EmployeeID")
validator.expect_column_values_to_be_between("Age", min_value=0, max_value=100)
validator.expect_column_values_to_not_be_null("Salary")
validator.expect_column_values_to_match_regex("Email", r"^[\w\.-]+@[\w\.-]+\.\w+$")

# Step 4: Validate and display result
results = validator.validate()
print(results)

# Optional: Show unmet expectations
for res in results["results"]:
    if not res["success"]:
        print("\n❌ Failed Expectation:")
        print(res["expectation_config"]["expectation_type"])
        print(res["result"])

TypeError: ExpectationSuite.__init__() got an unexpected keyword argument 'expectation_suite_name'

In [5]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.








# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.




import pandas as pd
import numpy as np
import re

# Sample dataset (simulating uploaded data in DQ Labs)
data = {
    'EmployeeID': [101, 102, 103, 103, 105],
    'Name': ['Alice', 'Bob', 'Charlie', 'Charlie', 'Eve'],
    'Age': [25, 30, -1, 28, None],
    'Email': ['alice@example.com', 'bob@example.com', 'invalid_email', 'charlie@example.com', 'eve@example'],
    'Phone': ['(123) 456-7890', '1234567890', '(987) 654-3210', None, '(123)-456-7890']
}

df = pd.DataFrame(data)

print("Original Data:\n", df)

# -------------------------------
# 1. Data Profiling Summary
# -------------------------------
print("\n=== Data Profiling Summary ===")
print(df.describe(include='all'))
print("\nMissing Values:\n", df.isnull().sum())

# -------------------------------
# 2. Rule-based Validation
# -------------------------------
print("\n=== Rule-based Quality Checks ===")

# Rule 1: EmployeeID must be unique
duplicate_ids = df['EmployeeID'].duplicated(keep=False)
print("\nRule: Unique EmployeeID")
print(df[duplicate_ids])

# Rule 2: Age must be between 18 and 65
invalid_age = ~df['Age'].between(18, 65, inclusive='both')
print("\nRule: Age between 18 and 65")
print(df[invalid_age])

# Rule 3: Email should follow proper format
email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
invalid_email = ~df['Email'].astype(str).str.match(email_pattern)
print("\nRule: Valid Email Format")
print(df[invalid_email])

# Rule 4: Phone number format (US Style: (XXX) XXX-XXXX)
phone_pattern = r'^\(\d{3}\)\s\d{3}-\d{4}$'
invalid_phone = ~df['Phone'].astype(str).str.match(phone_pattern)
print("\nRule: Valid Phone Number Format (e.g., (123) 456-7890)")
print(df[invalid_phone])

# -------------------------------
# 3. Suggest Cleaned Dataset
# -------------------------------
print("\n=== Cleaned Dataset Suggestion ===")

df_cleaned = df.drop_duplicates(subset=['EmployeeID'])  # Remove ID duplicates
df_cleaned['Age'] = df_cleaned['Age'].apply(lambda x: np.nan if x < 18 or x > 65 else x)  # Fix Age
df_cleaned['Email_Valid'] = df_cleaned['Email'].str.match(email_pattern)
df_cleaned['Phone_Valid'] = df_cleaned['Phone'].str.match(phone_pattern)

print(df_cleaned)





Original Data:
    EmployeeID     Name   Age                Email           Phone
0         101    Alice  25.0    alice@example.com  (123) 456-7890
1         102      Bob  30.0      bob@example.com      1234567890
2         103  Charlie  -1.0        invalid_email  (987) 654-3210
3         103  Charlie  28.0  charlie@example.com            None
4         105      Eve   NaN          eve@example  (123)-456-7890

=== Data Profiling Summary ===
        EmployeeID     Name        Age              Email           Phone
count      5.00000        5   4.000000                  5               4
unique         NaN        4        NaN                  5               4
top            NaN  Charlie        NaN  alice@example.com  (123) 456-7890
freq           NaN        2        NaN                  1               1
mean     102.80000      NaN  20.500000                NaN             NaN
std        1.48324      NaN  14.479871                NaN             NaN
min      101.00000      NaN  -1.000000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Age'] = df_cleaned['Age'].apply(lambda x: np.nan if x < 18 or x > 65 else x)  # Fix Age
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Email_Valid'] = df_cleaned['Email'].str.match(email_pattern)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Phone_Valid'] = df_c