**Task 1**: Checking Null Values for Completeness

**Description**: Verify if there are any null values in a dataset, which indicate incomplete data.

In [None]:
import pandas as pd

# Completeness check: count the nulls in every column, then express each
# column's completeness as the fraction of rows that hold a value.
data = dict(
    id=[1, 2, 3, 4],
    name=['Alice', 'Bob', None, 'David'],
    email=['alice@example.com', None, 'charlie@example.com', 'david@example.com'],
)
df = pd.DataFrame(data)

total_rows = df.shape[0]
null_counts = df.isna().sum()
# completeness = 1 - (share of rows that are null), per column
completeness = 1 - null_counts.div(total_rows)

print("Null value counts per column:", null_counts, sep="\n")
print("\nCompleteness per column:", completeness.round(2), sep="\n")

**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [None]:
import pandas as pd

# Type-validity check: for every column, measure the share of values whose
# Python type matches the type the column is expected to hold.
data = dict(
    id=[1, 2, 3, 4],
    name=['Alice', 'Bob', 'Charlie', 'David'],
    age=[25, 'thirty', 45, 22],
    email=['a@example.com', 'b@example.com', 'c@example.com', 'd@example.com'],
)
df = pd.DataFrame(data)
expected_types = dict(id=int, name=str, age=int, email=str)


def check_type_validity(column, expected_type):
    """Flag which values of ``df[column]`` are instances of ``expected_type``.

    Reads the module-level ``df``. Returns a boolean Series aligned with
    ``df[column]``.
    """
    def has_expected_type(value):
        return isinstance(value, expected_type)

    return df[column].apply(has_expected_type)


# The mean of a boolean Series is the fraction of True entries.
validity_results = {
    col: check_type_validity(col, expected_types[col]).mean()
    for col in expected_types
}
print(pd.Series(validity_results).round(2))

**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [None]:
import pandas as pd

# Uniqueness check: the score is the share of rows whose email has not
# already appeared in an earlier row.
data = dict(
    id=[1, 2, 3, 4],
    email=['a@example.com', 'b@example.com', 'b@example.com', 'd@example.com'],
)
df = pd.DataFrame(data)

# duplicated() marks every repeat after the first occurrence.
repeated_email = df['email'].duplicated()
is_unique = 1 - repeated_email.mean()
print(f"Email Uniqueness Score: {round(is_unique, 2)}")

Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [None]:
# Email format check: score the share of addresses that match a simple
# local-part@domain.tld pattern.
import pandas as pd
import re

data = dict(
    email=['a@example.com', 'invalid-email', 'user@domain.co', 'noatsymbol.com'],
)
df = pd.DataFrame(data)

email_regex = r'^[\w\.-]+@[\w\.-]+\.\w+$'
# str() lets non-string entries be tested instead of raising.
df['valid_email'] = [
    bool(re.match(email_regex, str(address))) for address in df['email']
]
valid_score = df['valid_email'].mean()
print(f"Email Format Accuracy Score: {round(valid_score, 2)}")

Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [None]:
# Logical age check: an age is valid when it lies in the inclusive
# range 0-120.
import pandas as pd

data = dict(
    name=['Alice', 'Bob', 'Charlie', 'David'],
    age=[25, 130, 45, -5],
)
df = pd.DataFrame(data)

# between() is inclusive on both ends by default.
valid_age = df['age'].between(0, 120)
valid_age_score = valid_age.mean()
print(f"Age Validity Score: {round(valid_age_score, 2)}")

Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [None]:
# Missing-data handling: first identify how much of each column is missing,
# then impute (mean for the numeric column, a placeholder for email).
import pandas as pd

data = {
    'id': [1, 2, 3, 4],
    'age': [25, None, 45, None],
    'email': ['a@example.com', 'b@example.com', 'c@example.com', None]
}
df = pd.DataFrame(data)

# Measure missingness BEFORE imputing. Once the gaps are filled the same
# calculation is always zero, so it cannot identify anything.
missing_before_imputation = df.isnull().mean()
print(f"Missing Data Fraction Before Imputation:\n{missing_before_imputation.round(2)}")

# Impute: mean of the observed ages; fixed placeholder for missing emails.
df['age'] = df['age'].fillna(df['age'].mean())
df['email'] = df['email'].fillna('unknown@example.com')

# Post-imputation check: every value here should be 0 if imputation covered
# all gaps. Values are fractions in [0, 1], not percentages.
missing_data_percentage = df.isnull().mean()
print(f"Missing Data Percentages:\n{missing_data_percentage.round(2)}")
print("\nImputed Data:\n", df)

Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [None]:
# Duplicate detection: list every row that exactly repeats an earlier row.
import pandas as pd

data = dict(
    id=[1, 2, 3, 3, 4],
    name=['Alice', 'Bob', 'Charlie', 'Charlie', 'David'],
    email=['a@example.com', 'b@example.com', 'c@example.com', 'c@example.com', 'd@example.com'],
)
df = pd.DataFrame(data)

# The first occurrence is kept; only later repeats are flagged.
repeat_mask = df.duplicated()
duplicates = df.loc[repeat_mask]
print("Detected Duplicates:", duplicates, sep="\n")

Task 8: Validate Correctness of Numerical Values

Description: Ensure numerical columns are within a specified range.

In [None]:
# Numeric range check: a price is valid when it lies in the inclusive
# range 100-1000.
import pandas as pd

data = dict(
    id=[1, 2, 3, 4],
    price=[150, 300, 5000, 50],
)
df = pd.DataFrame(data)

# between() is inclusive on both ends by default.
price_validity = df['price'].between(100, 1000)
valid_price_score = price_validity.mean()
print(f"Price Validity Score: {round(valid_price_score, 2)}")

Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [None]:
# Completeness rule report: list the rows in which at least one mandatory
# field is null.
import pandas as pd

data = dict(
    id=[1, 2, 3, 4],
    name=['Alice', None, 'Charlie', 'David'],
    email=['a@example.com', 'b@example.com', None, 'd@example.com'],
)
df = pd.DataFrame(data)

mandatory_fields = ['name', 'email']
# A row violates the rule if ANY mandatory column is null in that row.
has_missing_mandatory = df[mandatory_fields].isna().any(axis='columns')
violations = df.loc[has_missing_mandatory]
print("Rows with Completeness Rule Violations:\n", violations)

Task 10: Advanced Regex for Data Validity Check

Description: Validate fields against regex-based rules, such as checking that email addresses and phone numbers are well-formed, and report a validity score per field.

In [None]:
# Multi-rule validity check for email addresses and phone numbers; reports
# the share of valid values per field.
import pandas as pd
import re

data = {
    'email': ['a@example.com', 'invalid-email', 'user@domain.co', 'user@domain'],
    'phone': ['+1-800-555-1234', '800-555-1234', '12345', '+44 20 7946 0958']
}
df = pd.DataFrame(data)

email_regex = r'^[\w\.-]+@[\w\.-]+\.\w+$'
# Rule 1 for phones: optional leading '+', then only digits, dashes,
# parentheses and whitespace.
phone_regex = r'^\+?[\d\-\(\)\s]+$'
# Rule 2 for phones: a plausible number of digits. The character whitelist
# alone accepts strings such as '12345' or '---'. 15 is the E.164 maximum;
# 7 is a heuristic lower bound for a subscriber number - adjust if short
# codes must be accepted.
MIN_PHONE_DIGITS = 7
MAX_PHONE_DIGITS = 15


def is_valid_email(value):
    """Return True when ``str(value)`` matches ``email_regex``."""
    return bool(re.match(email_regex, str(value)))


def is_valid_phone(value):
    """Return True when ``str(value)`` passes both phone rules.

    The value must match ``phone_regex`` and contain between
    ``MIN_PHONE_DIGITS`` and ``MAX_PHONE_DIGITS`` digits (inclusive).
    """
    text = str(value)
    if not re.match(phone_regex, text):
        return False
    digit_count = len(re.findall(r'\d', text))
    return MIN_PHONE_DIGITS <= digit_count <= MAX_PHONE_DIGITS


df['valid_email'] = df['email'].apply(is_valid_email)
df['valid_phone'] = df['phone'].apply(is_valid_phone)
valid_email_score = df['valid_email'].mean()
valid_phone_score = df['valid_phone'].mean()
print(f"Email Validity Score: {round(valid_email_score, 2)}")
print(f"Phone Validity Score: {round(valid_phone_score, 2)}")