### Task 1: Automated Data Profiling

**Steps**:
1. Using Pandas-Profiling
    - Generate a profile report for an existing CSV file.
    - Customize the profile report to include correlations.
    - Profile a specific subset of columns.
2. Using Great Expectations
    - Create a basic expectation suite for your data.
    - Validate data against an expectation suite.
    - Add multiple expectations to a suite.

In [4]:
# Write your code from here
import pandas as pd
from pandas_profiling import ProfileReport

# 1. Read the source CSV into a DataFrame
df = pd.read_csv("your_data.csv")  # Replace with your CSV path

# 2. Build the full profile report with every listed correlation measure
#    switched on
correlation_settings = {
    measure: {"calculate": True}
    for measure in ("pearson", "spearman", "kendall", "phi_k", "cramers")
}
profile = ProfileReport(
    df,
    title="Data Profiling Report",
    correlations=correlation_settings,
)

# 3. Write the full report to disk as HTML
profile.to_file("full_profile_report.html")

# 4. Profile only a chosen subset of columns and save that report as well
subset_columns = ['Age', 'Income', 'Gender']  # Change as needed
subset_frame = df[subset_columns]
subset_profile = ProfileReport(subset_frame, title="Subset Profile Report")
subset_profile.to_file("subset_profile_report.html")
import great_expectations as ge

# 1. Wrap the DataFrame in a Great Expectations dataset. Each expect_* call
#    made on this object below is recorded on the dataset's own suite.
ge_df = ge.from_pandas(df)

# 2. Register an (initially empty) named expectation suite in the data context
suite_name = "basic_suite"
context = ge.get_context()
suite = context.create_expectation_suite(suite_name, overwrite_existing=True)

# 3. Add multiple expectations
ge_df.expect_column_values_to_not_be_null("Age")
ge_df.expect_column_values_to_be_between("Age", min_value=0, max_value=120)
ge_df.expect_column_values_to_not_be_null("Income")
ge_df.expect_column_values_to_be_between("Income", min_value=0)
ge_df.expect_column_values_to_be_in_set("Gender", ["Male", "Female", "Other"])

# 4. Validate the dataset against the expectations recorded in step 3.
#    A string passed as `expectation_suite` is interpreted by the legacy
#    Dataset.validate() as a path to a suite JSON file, so handing it the
#    suite *name* fails to load. Calling validate() with no argument checks
#    the data against the expectations accumulated on ge_df itself.
validation_results = ge_df.validate()

print(validation_results)



PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.11/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydantic.dev/2.11/u/import-error

### Task 2: Real-time Monitoring of Data Quality

**Steps**:
1. Setting up Alerts for Quality Drops
    - Use the logging library to set up a basic alert on failed expectations.
    - Implement alerts using email notifications.
    - Use a dashboard like Grafana for visual alerts.
        - Note: Example assumes integration with a monitoring system
        - Alert setup would involve creating a data source and alert rule in Grafana

In [None]:
# Write your code from here
import great_expectations as ge
import logging
import smtplib
from email.message import EmailMessage

# Route INFO-and-above records to data_quality.log, prefixing each message
# with its timestamp and severity level
log_settings = {
    "filename": 'data_quality.log',
    "level": logging.INFO,
    "format": '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**log_settings)

def send_email_alert(subject, body, to_email):
    """Send a plain-text alert email over SMTP with STARTTLS.

    Connection settings are read from environment variables so that real
    credentials do not have to be written into the source. The original
    placeholder values are kept as fallbacks:

        SMTP_SERVER    (default "smtp.gmail.com")
        SMTP_PORT      (default 587)
        SMTP_SENDER    (default "your_email@gmail.com")
        SMTP_PASSWORD  (default "your_password")

    Delivery is best-effort: a failure while connecting, authenticating or
    sending is logged at ERROR level and swallowed, so a broken mail setup
    does not interrupt the caller.

    Args:
        subject: Subject line of the email.
        body: Plain-text body of the email.
        to_email: Recipient address.

    Raises:
        ValueError: If SMTP_PORT is set to something that is not an integer.
    """
    import os  # local import: the file's import block does not provide os

    smtp_server = os.environ.get("SMTP_SERVER", "smtp.gmail.com")
    smtp_port = int(os.environ.get("SMTP_PORT", "587"))
    sender_email = os.environ.get("SMTP_SENDER", "your_email@gmail.com")
    sender_password = os.environ.get("SMTP_PASSWORD", "your_password")

    msg = EmailMessage()
    msg.set_content(body)
    msg['Subject'] = subject
    msg['From'] = sender_email
    msg['To'] = to_email

    try:
        # The timeout keeps an unreachable server from blocking the caller
        # indefinitely.
        with smtplib.SMTP(smtp_server, smtp_port, timeout=30) as server:
            server.starttls()
            server.login(sender_email, sender_password)
            server.send_message(msg)
    except Exception as e:
        # Deliberately broad: alerting must never crash the monitoring run.
        logging.error("Failed to send alert email: %s", e)
    else:
        logging.info("Alert email sent to %s", to_email)

def check_data_quality_and_alert(df, to_email="recipient@example.com"):
    """Validate the "Age" column of ``df`` and alert when validation fails.

    Two expectations are applied: "Age" must not be null, and "Age" must lie
    between 0 and 120. A passing check is logged at INFO level. A failing
    check is logged at WARNING level and the failed expectation results are
    emailed through ``send_email_alert``.

    Args:
        df: pandas DataFrame containing an "Age" column.
        to_email: Address that receives the alert email on failure.
    """
    ge_df = ge.from_pandas(df)
    suite_name = "real_time_suite"
    context = ge.get_context()
    # Register (or reset) the named suite in the data context. The
    # expectations themselves are recorded on ge_df by the calls below.
    context.create_expectation_suite(suite_name, overwrite_existing=True)

    # Add expectations
    ge_df.expect_column_values_to_not_be_null("Age")
    ge_df.expect_column_values_to_be_between("Age", min_value=0, max_value=120)

    # Validate against the expectations recorded on ge_df. A string passed as
    # `expectation_suite` is interpreted by the legacy Dataset.validate() as a
    # path to a suite JSON file, so the suite name must not be passed here.
    results = ge_df.validate()

    if results['success']:
        logging.info("Data quality check passed.")
        return

    logging.warning("Data quality check failed!")
    # Compose the alert from only the expectations that did not pass
    failed_expectations = [r for r in results['results'] if not r['success']]
    alert_message = f"Data Quality Alert! Failed Expectations:\n{failed_expectations}"
    send_email_alert("Data Quality Alert", alert_message, to_email)

# Example usage with a sample DataFrame
import pandas as pd

# One-column sample whose Age values include a missing entry and an
# out-of-range entry, so the quality check is expected to fail
data = dict(Age=[25, 30, None, 150])
df = pd.DataFrame(data)

check_data_quality_and_alert(df)


### Task 3: Using AI for Data Quality Monitoring
**Steps**:
1. Basic AI Models for Monitoring
    - Train a simple anomaly detection model using Isolation Forest.
    - Use simple, custom function-based AI logic for outlier detection.
    - Create a monitoring function that uses a pre-trained machine learning model.

In [None]:
# Write your code from here
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Step 1: Build a small training frame that deliberately contains outliers
ages = [25, 30, 35, 40, 45, 150]                    # 150 is an outlier
incomes = [50000, 60000, 75000, 80000, 100000, 10]  # 10 is an outlier
data = {"Age": ages, "Income": incomes}
df = pd.DataFrame(data)

# Step 2: Fit the Isolation Forest; fit() returns the fitted estimator
iso_forest = IsolationForest(contamination=0.1, random_state=42).fit(df)

# Score the training rows themselves and store the flag next to the data
training_flags = iso_forest.predict(df)
df['anomaly'] = training_flags
print("Training Data with Anomaly Flags (-1 = Anomaly, 1 = Normal):")
print(df)
print("\n")

# Step 3: Define a reusable anomaly detection function
def simple_ai_outlier_detection(dataframe, contamination=0.1):
    """Fit an Isolation Forest on ``dataframe`` and return its outlier rows.

    The input frame is left untouched: scoring happens on a copy. An existing
    "anomaly" column (for example from an earlier scoring pass) is excluded
    from the features so that old flags are never used as model input.

    Args:
        dataframe: pandas DataFrame of feature columns.
        contamination: Value passed to ``IsolationForest(contamination=...)``.

    Returns:
        A DataFrame holding the rows flagged -1, with an added "anomaly"
        column. It is empty when there is nothing to score (no rows or no
        feature columns) or when no row is flagged.
    """
    scored = dataframe.copy()
    features = scored.drop(columns=["anomaly"], errors="ignore")

    if features.empty:
        # Nothing to train on: report no outliers instead of letting the
        # model fail on an empty input.
        scored["anomaly"] = 1
        return scored[scored["anomaly"] == -1]

    model = IsolationForest(contamination=contamination, random_state=42)
    model.fit(features)
    scored["anomaly"] = model.predict(features)
    return scored[scored["anomaly"] == -1]

# Step 4: Define a monitoring function to detect anomalies in new data
def monitor_data_quality(new_data, model):
    """Score ``new_data`` with a fitted model and report the flagged rows.

    ``model.predict(new_data)`` supplies one label per row; rows labelled -1
    are treated as anomalies. The labels are attached to a copy, so the
    caller's frame is not modified. A short summary is printed either way.

    Args:
        new_data: pandas DataFrame to score.
        model: Object exposing ``predict(frame)``.

    Returns:
        DataFrame of the rows labelled -1, including the "anomaly" column.
    """
    scored = new_data.assign(anomaly=model.predict(new_data))
    flagged = scored[scored['anomaly'] == -1]

    if flagged.empty:
        print("No anomalies detected in new data.")
        return flagged

    print("Anomalies detected in new data:")
    print(flagged)
    return flagged

# Step 5: A fresh incoming batch to run through the monitor
new_data = pd.DataFrame(
    {
        "Age": [28, 33, 200],          # 200 is likely anomaly
        "Income": [52000, 61000, 15],  # 15 is likely anomaly
    }
)

# Step 6: Score the batch with the already-fitted model
detected_anomalies = monitor_data_quality(new_data=new_data, model=iso_forest)
