### Task 1: Automated Data Profiling

**Steps**:
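1. Using Pandas-Profiling (the project is now published as `ydata-profiling`; the code below imports the legacy `pandas_profiling` module, so the legacy `pandas-profiling` package must be installed for it to run)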
    - Generate a profile report for an existing CSV file.
    - Customize the profile report to include correlations.
    - Profile a specific subset of columns.
2. Using Great Expectations
    - Create a basic expectation suite for your data.
    - Validate data against an expectation suite.
    - Add multiple expectations to a suite.

In [4]:
import pandas as pd
import pandas_profiling
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration
import streamlit as st

def _profile_report_path(file_path, tag):
    """
    Builds the HTML report path that sits next to a data file.

    Only the final extension of ``file_path`` is replaced, so directory names
    containing ".csv", upper-case extensions and non-CSV extensions all yield
    a distinct ``<base>_<tag>.html`` name.

    Args:
        file_path (str or os.PathLike): The path to the data file.
        tag (str): Suffix identifying the report variant (e.g. "profile").

    Returns:
        str: The path of the HTML report.
    """
    import os  # local import: `os` is not imported at the top of this cell

    base, _extension = os.path.splitext(os.fspath(file_path))
    return f"{base}_{tag}.html"


def profile_data_with_pandas_profiling(file_path):
    """
    Profiles data from a CSV file using Pandas-Profiling.

    Generates three HTML reports next to the input file (default settings,
    explorative with Pearson correlations, and an 'age'/'income' subset when
    both columns exist), and embeds the default report in the Streamlit page.
    Failures are reported through ``st.error`` instead of being raised.

    Args:
        file_path (str): The path to the CSV file.
    """
    try:
        # 1. Load the CSV file into a Pandas DataFrame
        df = pd.read_csv(file_path)
        st.write(f"Loaded data from: {file_path}")

        # 2. Generate a profile report with default settings
        profile = pandas_profiling.ProfileReport(df, title="Pandas Profiling Report")

        # Save the report
        profile_file_name = _profile_report_path(file_path, "profile")
        profile.to_file(profile_file_name)
        st.success(f"Pandas-Profiling report generated and saved to: {profile_file_name}")

        # Display the report in Streamlit
        st.header("Pandas Profiling Report")
        st.components.v1.html(profile.to_html(), width=800, height=600, scrolling=True)

        # 3. Customize the profile report to include correlations
        profile_with_corr = pandas_profiling.ProfileReport(
            df,
            title="Pandas Profiling Report with Correlations",
            explorative=True,  # Enable more detailed exploration
            correlations={"pearson": {"calculate": True}},  # Calculate Pearson correlation
        )

        # Save the report with correlations
        profile_with_corr_filename = _profile_report_path(file_path, "profile_with_corr")
        profile_with_corr.to_file(profile_with_corr_filename)
        st.success(f"Pandas-Profiling report with correlations generated and saved to: {profile_with_corr_filename}")

        # Profile a subset of columns, only when both columns are present
        if 'age' in df.columns and 'income' in df.columns:
            subset_profile = pandas_profiling.ProfileReport(
                df[['age', 'income']],  # Profile only 'age' and 'income'
                title="Pandas Profiling Report - Subset of Columns",
            )
            subset_profile_filename = _profile_report_path(file_path, "profile_subset")
            subset_profile.to_file(subset_profile_filename)
            st.success(f"Pandas-Profiling report for subset of columns generated and saved to: {subset_profile_filename}")
        else:
            st.warning("Columns 'age' and 'income' not found in the DataFrame.  Skipping subset profiling.")

    except FileNotFoundError:
        st.error(f"Error: File not found at {file_path}")
    except Exception as e:
        # UI boundary: surface any profiling failure in the page instead of crashing the app.
        st.error(f"An error occurred: {e}")



def profile_data_with_great_expectations(file_path):
    """
    Validates data from a CSV file against a Great Expectations suite.

    Loads the CSV, loads or creates the "my_data_suite" expectation suite,
    adds expectations for the table's column set and for the 'age' and
    'income' columns when present, validates the DataFrame and renders the
    result in Streamlit. Failures are reported through ``st.error`` instead
    of being raised.

    Args:
        file_path (str): The path to the CSV file.
    """
    # Local import: this exception class is not among the names imported at the top of the cell.
    from great_expectations.exceptions import DataContextError

    try:
        # Load the CSV file into a Pandas DataFrame
        df = pd.read_csv(file_path)
        st.write(f"Loaded data from: {file_path}")

        # Create a Great Expectations context
        context = ge.get_context()

        # Load the suite, creating it on first use. A missing suite is signalled
        # with DataContextError, not with a None return value.
        expectation_suite_name = "my_data_suite"  # You can choose any name
        try:
            suite = context.get_expectation_suite(expectation_suite_name)
        except DataContextError:
            suite = context.create_expectation_suite(expectation_suite_name)

        # Collect the expectations that apply to this DataFrame
        expectations = [
            ge.core.ExpectationConfiguration(
                # Table-level check that exactly these columns are present.
                expectation_type="expect_table_columns_to_match_set",
                kwargs={"column_set": df.columns.tolist(), "exact_match": True},
            )
        ]
        if 'age' in df.columns:
            expectations.append(
                ge.core.ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_between",
                    kwargs={
                        "column": "age",
                        "min_value": 0,
                        "max_value": 150,
                    },
                )
            )
        if 'income' in df.columns:
            expectations.append(
                ge.core.ExpectationConfiguration(
                    expectation_type="expect_column_values_to_not_be_null",
                    kwargs={"column": "income"},
                )
            )
        for expectation in expectations:
            suite.add_expectation(expectation)

        context.save_expectation_suite(suite, expectation_suite_name=expectation_suite_name)  # save the suite

        # Build a batch request for the in-memory DataFrame.
        # NOTE(review): these fluent-datasource calls follow the Great Expectations
        # 0.17/0.18 documentation - confirm against the installed version.
        datasource = context.sources.add_or_update_pandas(name="pandas")
        data_asset = datasource.add_dataframe_asset(name="my_data")
        batch_request = data_asset.build_batch_request(dataframe=df)

        # Validate data against the expectation suite
        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=expectation_suite_name,
        )
        validation_results = validator.validate()

        # Show the validation results; st.json needs plain JSON-compatible data,
        # so convert the result object first.
        st.header("Great Expectations Validation Results")
        st.json(validation_results.to_json_dict())

        # Check if the validation was successful
        if validation_results.success:
            st.success("Data validation with Great Expectations was successful!")
        else:
            st.error("Data validation with Great Expectations failed.")

    except FileNotFoundError:
        st.error(f"Error: File not found at {file_path}")
    except Exception as e:
        # UI boundary: surface any validation-setup failure in the page.
        st.error(f"An error occurred: {e}")



if __name__ == "__main__":
    # Ask the user for the CSV location, pre-filled with a default file name.
    default_file_path = "data.csv"  # Replace with your default CSV file name
    file_path = st.text_input("Enter the path to your CSV file:", value=default_file_path)

    # Run each profiler under its own page heading, Pandas-Profiling first.
    for section_title, run_profiler in (
        ("Data Profiling with Pandas-Profiling", profile_data_with_pandas_profiling),
        ("Data Profiling with Great Expectations", profile_data_with_great_expectations),
    ):
        st.header(section_title)
        run_profiler(file_path)


ModuleNotFoundError: No module named 'pandas_profiling'

### Task 2: Real-time Monitoring of Data Quality

**Steps**:
1. Setting up Alerts for Quality Drops
    - Use the logging library to set up a basic alert on failed expectations.
    - Implement alerts using email notifications.
    - Use a dashboard such as Grafana for visual alerts.
        - Note: the example assumes integration with an external monitoring system.
        - Setting up the alert involves creating a data source and an alert rule in Grafana.

In [None]:
import pandas as pd
import pandas_profiling
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration
import streamlit as st
import logging
import smtplib
from email.mime.text import MIMEText
import datetime
import time
import threading
import random

# Configure root logging: INFO and above, timestamped, written to the console.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by the functions below.
logger = logging.getLogger(__name__)


def _profile_report_path(file_path, tag):
    """
    Builds the HTML report path that sits next to a data file.

    Only the final extension of ``file_path`` is replaced, so directory names
    containing ".csv", upper-case extensions and non-CSV extensions all yield
    a distinct ``<base>_<tag>.html`` name.

    Args:
        file_path (str or os.PathLike): The path to the data file.
        tag (str): Suffix identifying the report variant (e.g. "profile").

    Returns:
        str: The path of the HTML report.
    """
    import os  # local import: `os` is not imported at the top of this cell

    base, _extension = os.path.splitext(os.fspath(file_path))
    return f"{base}_{tag}.html"


def profile_data_with_pandas_profiling(file_path):
    """
    Profiles data from a CSV file using Pandas-Profiling.

    Generates three HTML reports next to the input file (default settings,
    explorative with Pearson correlations, and an 'age'/'income' subset when
    both columns exist), and embeds the default report in the Streamlit page.
    Failures are reported through ``st.error`` instead of being raised.

    Args:
        file_path (str): The path to the CSV file.
    """
    try:
        # 1. Load the CSV file into a Pandas DataFrame
        df = pd.read_csv(file_path)
        st.write(f"Loaded data from: {file_path}")

        # 2. Generate a profile report with default settings
        profile = pandas_profiling.ProfileReport(df, title="Pandas Profiling Report")

        # Save the report
        profile_file_name = _profile_report_path(file_path, "profile")
        profile.to_file(profile_file_name)
        st.success(f"Pandas-Profiling report generated and saved to: {profile_file_name}")

        # Display the report in Streamlit
        st.header("Pandas Profiling Report")
        st.components.v1.html(profile.to_html(), width=800, height=600, scrolling=True)

        # 3. Customize the profile report to include correlations
        profile_with_corr = pandas_profiling.ProfileReport(
            df,
            title="Pandas Profiling Report with Correlations",
            explorative=True,  # Enable more detailed exploration
            correlations={"pearson": {"calculate": True}},  # Calculate Pearson correlation
        )

        # Save the report with correlations
        profile_with_corr_filename = _profile_report_path(file_path, "profile_with_corr")
        profile_with_corr.to_file(profile_with_corr_filename)
        st.success(f"Pandas-Profiling report with correlations generated and saved to: {profile_with_corr_filename}")

        # Profile a subset of columns, only when both columns are present
        if 'age' in df.columns and 'income' in df.columns:
            subset_profile = pandas_profiling.ProfileReport(
                df[['age', 'income']],  # Profile only 'age' and 'income'
                title="Pandas Profiling Report - Subset of Columns",
            )
            subset_profile_filename = _profile_report_path(file_path, "profile_subset")
            subset_profile.to_file(subset_profile_filename)
            st.success(f"Pandas-Profiling report for subset of columns generated and saved to: {subset_profile_filename}")
        else:
            st.warning("Columns 'age' and 'income' not found in the DataFrame.  Skipping subset profiling.")

    except FileNotFoundError:
        st.error(f"Error: File not found at {file_path}")
    except Exception as e:
        # UI boundary: surface any profiling failure in the page instead of crashing the app.
        st.error(f"An error occurred: {e}")



def profile_data_with_great_expectations(file_path):
    """
    Validates data from a CSV file against a Great Expectations suite.

    Loads the CSV, loads or creates the "my_data_suite" expectation suite,
    adds expectations for the table's column set and for the 'age' and
    'income' columns when present, validates the DataFrame and renders the
    result in Streamlit. Failures are reported through ``st.error`` instead
    of being raised.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        bool: True when validation ran and succeeded; False when validation
        failed or any error occurred while loading or validating.
    """
    # Local import: this exception class is not among the names imported at the top of the cell.
    from great_expectations.exceptions import DataContextError

    try:
        # Load the CSV file into a Pandas DataFrame
        df = pd.read_csv(file_path)
        st.write(f"Loaded data from: {file_path}")

        # Create a Great Expectations context
        context = ge.get_context()

        # Load the suite, creating it on first use. A missing suite is signalled
        # with DataContextError, not with a None return value.
        expectation_suite_name = "my_data_suite"  # You can choose any name
        try:
            suite = context.get_expectation_suite(expectation_suite_name)
        except DataContextError:
            suite = context.create_expectation_suite(expectation_suite_name)

        # Collect the expectations that apply to this DataFrame
        expectations = [
            ge.core.ExpectationConfiguration(
                # Table-level check that exactly these columns are present.
                expectation_type="expect_table_columns_to_match_set",
                kwargs={"column_set": df.columns.tolist(), "exact_match": True},
            )
        ]
        if 'age' in df.columns:
            expectations.append(
                ge.core.ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_between",
                    kwargs={
                        "column": "age",
                        "min_value": 0,
                        "max_value": 150,
                    },
                )
            )
        if 'income' in df.columns:
            expectations.append(
                ge.core.ExpectationConfiguration(
                    expectation_type="expect_column_values_to_not_be_null",
                    kwargs={"column": "income"},
                )
            )
        for expectation in expectations:
            suite.add_expectation(expectation)

        context.save_expectation_suite(suite, expectation_suite_name=expectation_suite_name)  # save the suite

        # Build a batch request for the in-memory DataFrame.
        # NOTE(review): these fluent-datasource calls follow the Great Expectations
        # 0.17/0.18 documentation - confirm against the installed version.
        datasource = context.sources.add_or_update_pandas(name="pandas")
        data_asset = datasource.add_dataframe_asset(name="my_data")
        batch_request = data_asset.build_batch_request(dataframe=df)

        # Validate data against the expectation suite
        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=expectation_suite_name,
        )
        validation_results = validator.validate()

        # Show the validation results; st.json needs plain JSON-compatible data,
        # so convert the result object first.
        st.header("Great Expectations Validation Results")
        st.json(validation_results.to_json_dict())

        # Report the overall outcome and hand it back to the caller
        if validation_results.success:
            st.success("Data validation with Great Expectations was successful!")
            return True
        st.error("Data validation with Great Expectations failed!")
        return False

    except FileNotFoundError:
        st.error(f"Error: File not found at {file_path}")
        return False
    except Exception as e:
        # UI boundary: surface any validation-setup failure in the page.
        st.error(f"An error occurred: {e}")
        return False



def send_email_alert(subject, body):
    """
    Sends a plain-text email alert over SMTP with STARTTLS.

    Delivery is best-effort: any failure is logged and swallowed so that a
    mail problem never interrupts the caller.

    Args:
        subject (str): The subject of the email.
        body (str): The body of the email.
    """
    sender_email = "your_email@example.com"  # Replace with your email address
    receiver_email = "recipient_email@example.com"  # Replace with the recipient's email address
    smtp_server = "smtp.example.com"  # Replace with your SMTP server address
    smtp_port = 587  # Replace with your SMTP server port (e.g., 587 for TLS)
    smtp_username = "your_email@example.com"  # Replace with your email username
    smtp_password = "your_email_password"  # Replace with your email password or an app password
    smtp_timeout_seconds = 30  # Bound connect/read time so a hung server cannot block the caller

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = sender_email
    msg["To"] = receiver_email

    try:
        # The context manager closes the connection even when starttls/login/sendmail raises.
        with smtplib.SMTP(smtp_server, smtp_port, timeout=smtp_timeout_seconds) as server:
            server.starttls()
            server.login(smtp_username, smtp_password)
            server.sendmail(sender_email, [receiver_email], msg.as_string())
        logger.info("Email alert sent successfully.")
    except Exception as e:
        logger.error(f"Error sending email alert: {e}")



def monitor_data_quality(file_path):
    """
    Runs an endless, simulated data-quality monitoring loop over a CSV file.

    Every pass re-reads the file, perturbs the 'age' and 'income' columns with
    20% probability, writes the frame back, validates the file with
    profile_data_with_great_expectations and, when validation does not
    succeed, logs an error, sends an email alert and logs a placeholder
    Grafana message. The loop waits 60 seconds between passes and never returns.

    Args:
        file_path (str): The path to the CSV file.
    """
    def _shift_age(age):
        # Move an age by a random whole number of years in [-5, 5].
        return age + random.randint(-5, 5)

    def _scale_income(income):
        # Scale a present income by up to +/-10%; missing incomes stay missing.
        if pd.notnull(income):
            return income * (1 + random.uniform(-0.1, 0.1))
        return None

    while True:
        # Stand-in for a real feed (stream, database, API): reload the file,
        # sometimes perturb it, and write it back.
        try:
            frame = pd.read_csv(file_path)
            perturb = random.random() < 0.2  # 20% chance of introducing a change
            if perturb:
                if 'age' in frame.columns:
                    frame['age'] = frame['age'].apply(_shift_age)
                if 'income' in frame.columns:
                    frame['income'] = frame['income'].apply(_scale_income)

            frame.to_csv(file_path, index=False)
            logger.info(f"Simulated data update at {datetime.datetime.now()}")
        except Exception as exc:
            logger.error(f"Error reading or updating data: {exc}")

        # Validate whatever is currently on disk
        passed = profile_data_with_great_expectations(file_path)

        if not passed:
            # Alert channel 1: the log
            logger.error("Data quality check failed!")

            # Alert channel 2: email
            send_email_alert(
                "Data Quality Alert",
                f"Data quality check failed at {datetime.datetime.now()}.  Please investigate the data in {file_path}.",
            )

            # Alert channel 3: Grafana (conceptual placeholder only). A real
            # integration would push a metric such as data_quality_check_failed
            # (1 on failure, 0 on success) and let Grafana alert on it.
            logger.info(
                "Sending metrics to Grafana (simulated).  "
                "In a real setup, configure Grafana to alert on these metrics."
            )

        time.sleep(60)  # Check every 60 seconds (adjust as needed)



if __name__ == "__main__":
    import os  # local import: `os` is not imported at the top of this cell

    # Get the file path from the user using st.text_input
    default_file_path = "data.csv"  # Replace with your default CSV file name
    file_path = st.text_input("Enter the path to your CSV file:", default_file_path)

    # Create a sample CSV file if it doesn't exist
    if not os.path.exists(file_path):
        df = pd.DataFrame({
            'age': [20, 30, 40, 50, 60],
            'income': [50000, 60000, 70000, 80000, 90000]
        })
        df.to_csv(file_path, index=False)
        st.info(f"Created a sample CSV file at {file_path}.  You can replace this with your own data.")

    # Profile data with Pandas-Profiling
    st.header("Data Profiling with Pandas-Profiling")
    profile_data_with_pandas_profiling(file_path)

    # Profile data with Great Expectations
    st.header("Data Profiling with Great Expectations")
    profile_data_with_great_expectations(file_path)

    # Start real-time monitoring in a separate thread, at most once per file.
    # This block runs again on every script/cell re-run; without the guard each
    # run would add another monitor that rewrites the CSV and sends its own alerts.
    monitor_thread_name = f"data-quality-monitor:{file_path}"
    monitor_already_running = any(
        thread.name == monitor_thread_name and thread.is_alive()
        for thread in threading.enumerate()
    )
    if not monitor_already_running:
        monitoring_thread = threading.Thread(
            target=monitor_data_quality,
            args=(file_path,),
            name=monitor_thread_name,
            daemon=True,  # Allow the main thread to exit
        )
        monitoring_thread.start()

    st.subheader("Real-time Data Quality Monitoring")
    st.write("Data quality is being monitored in the background.  "
             "Check the console for log messages and email alerts (if any issues are detected).")

    # Keep the main thread alive so the daemon monitor keeps running.
    # NOTE(review): this loop never returns, so the script/cell never finishes - confirm
    # that is intended for the way this file is launched.
    while True:
        time.sleep(600)  # Check every 10 minutes
        st.info("The main thread is still alive. Monitoring is running in the background...")


ModuleNotFoundError: No module named 'pandas_profiling'

### Task 3: Using AI for Data Quality Monitoring
**Steps**:
1. Basic AI Models for Monitoring
    - Train a simple anomaly detection model using Isolation Forest.
    - Use simple, custom function-based logic for outlier detection.
    - Create a monitoring function that utilizes a pre-trained machine learning model.

In [None]:
import pandas as pd
import pandas_profiling
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration
import streamlit as st
import logging
import smtplib
from email.mime.text import MIMEText
import datetime
import time
import threading
import random
from sklearn.ensemble import IsolationForest
import numpy as np

# Configure root logging: INFO and above, timestamped, written to the console.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by the functions below.
logger = logging.getLogger(__name__)


def _profile_report_path(file_path, tag):
    """
    Builds the HTML report path that sits next to a data file.

    Only the final extension of ``file_path`` is replaced, so directory names
    containing ".csv", upper-case extensions and non-CSV extensions all yield
    a distinct ``<base>_<tag>.html`` name.

    Args:
        file_path (str or os.PathLike): The path to the data file.
        tag (str): Suffix identifying the report variant (e.g. "profile").

    Returns:
        str: The path of the HTML report.
    """
    import os  # local import: `os` is not imported at the top of this cell

    base, _extension = os.path.splitext(os.fspath(file_path))
    return f"{base}_{tag}.html"


def profile_data_with_pandas_profiling(file_path):
    """
    Profiles data from a CSV file using Pandas-Profiling.

    Generates three HTML reports next to the input file (default settings,
    explorative with Pearson correlations, and an 'age'/'income' subset when
    both columns exist), and embeds the default report in the Streamlit page.
    Failures are reported through ``st.error`` instead of being raised.

    Args:
        file_path (str): The path to the CSV file.
    """
    try:
        # 1. Load the CSV file into a Pandas DataFrame
        df = pd.read_csv(file_path)
        st.write(f"Loaded data from: {file_path}")

        # 2. Generate a profile report with default settings
        profile = pandas_profiling.ProfileReport(df, title="Pandas Profiling Report")

        # Save the report
        profile_file_name = _profile_report_path(file_path, "profile")
        profile.to_file(profile_file_name)
        st.success(f"Pandas-Profiling report generated and saved to: {profile_file_name}")

        # Display the report in Streamlit
        st.header("Pandas Profiling Report")
        st.components.v1.html(profile.to_html(), width=800, height=600, scrolling=True)

        # 3. Customize the profile report to include correlations
        profile_with_corr = pandas_profiling.ProfileReport(
            df,
            title="Pandas Profiling Report with Correlations",
            explorative=True,  # Enable more detailed exploration
            correlations={"pearson": {"calculate": True}},  # Calculate Pearson correlation
        )

        # Save the report with correlations
        profile_with_corr_filename = _profile_report_path(file_path, "profile_with_corr")
        profile_with_corr.to_file(profile_with_corr_filename)
        st.success(f"Pandas-Profiling report with correlations generated and saved to: {profile_with_corr_filename}")

        # Profile a subset of columns, only when both columns are present
        if 'age' in df.columns and 'income' in df.columns:
            subset_profile = pandas_profiling.ProfileReport(
                df[['age', 'income']],  # Profile only 'age' and 'income'
                title="Pandas Profiling Report - Subset of Columns",
            )
            subset_profile_filename = _profile_report_path(file_path, "profile_subset")
            subset_profile.to_file(subset_profile_filename)
            st.success(f"Pandas-Profiling report for subset of columns generated and saved to: {subset_profile_filename}")
        else:
            st.warning("Columns 'age' and 'income' not found in the DataFrame.  Skipping subset profiling.")

    except FileNotFoundError:
        st.error(f"Error: File not found at {file_path}")
    except Exception as e:
        # UI boundary: surface any profiling failure in the page instead of crashing the app.
        st.error(f"An error occurred: {e}")



def profile_data_with_great_expectations(file_path):
    """
    Validates data from a CSV file against a Great Expectations suite.

    Loads the CSV, loads or creates the "my_data_suite" expectation suite,
    adds expectations for the table's column set and for the 'age' and
    'income' columns when present, validates the DataFrame and renders the
    result in Streamlit. Failures are reported through ``st.error`` instead
    of being raised.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        bool: True when validation ran and succeeded; False when validation
        failed or any error occurred while loading or validating.
    """
    # Local import: this exception class is not among the names imported at the top of the cell.
    from great_expectations.exceptions import DataContextError

    try:
        # Load the CSV file into a Pandas DataFrame
        df = pd.read_csv(file_path)
        st.write(f"Loaded data from: {file_path}")

        # Create a Great Expectations context
        context = ge.get_context()

        # Load the suite, creating it on first use. A missing suite is signalled
        # with DataContextError, not with a None return value.
        expectation_suite_name = "my_data_suite"  # You can choose any name
        try:
            suite = context.get_expectation_suite(expectation_suite_name)
        except DataContextError:
            suite = context.create_expectation_suite(expectation_suite_name)

        # Collect the expectations that apply to this DataFrame
        expectations = [
            ge.core.ExpectationConfiguration(
                # Table-level check that exactly these columns are present.
                expectation_type="expect_table_columns_to_match_set",
                kwargs={"column_set": df.columns.tolist(), "exact_match": True},
            )
        ]
        if 'age' in df.columns:
            expectations.append(
                ge.core.ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_between",
                    kwargs={
                        "column": "age",
                        "min_value": 0,
                        "max_value": 150,
                    },
                )
            )
        if 'income' in df.columns:
            expectations.append(
                ge.core.ExpectationConfiguration(
                    expectation_type="expect_column_values_to_not_be_null",
                    kwargs={"column": "income"},
                )
            )
        for expectation in expectations:
            suite.add_expectation(expectation)

        context.save_expectation_suite(suite, expectation_suite_name=expectation_suite_name)  # save the suite

        # Build a batch request for the in-memory DataFrame.
        # NOTE(review): these fluent-datasource calls follow the Great Expectations
        # 0.17/0.18 documentation - confirm against the installed version.
        datasource = context.sources.add_or_update_pandas(name="pandas")
        data_asset = datasource.add_dataframe_asset(name="my_data")
        batch_request = data_asset.build_batch_request(dataframe=df)

        # Validate data against the expectation suite
        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=expectation_suite_name,
        )
        validation_results = validator.validate()

        # Show the validation results; st.json needs plain JSON-compatible data,
        # so convert the result object first.
        st.header("Great Expectations Validation Results")
        st.json(validation_results.to_json_dict())

        # Report the overall outcome and hand it back to the caller
        if validation_results.success:
            st.success("Data validation with Great Expectations was successful!")
            return True
        st.error("Data validation with Great Expectations failed!")
        return False

    except FileNotFoundError:
        st.error(f"Error: File not found at {file_path}")
        return False
    except Exception as e:
        # UI boundary: surface any validation-setup failure in the page.
        st.error(f"An error occurred: {e}")
        return False



def send_email_alert(subject, body):
    """
    Sends a plain-text email alert over SMTP with STARTTLS.

    Delivery is best-effort: any failure is logged and swallowed so that a
    mail problem never interrupts the caller.

    Args:
        subject (str): The subject of the email.
        body (str): The body of the email.
    """
    sender_email = "your_email@example.com"  # Replace with your email address
    receiver_email = "recipient_email@example.com"  # Replace with the recipient's email address
    smtp_server = "smtp.example.com"  # Replace with your SMTP server address
    smtp_port = 587  # Replace with your SMTP server port (e.g., 587 for TLS)
    smtp_username = "your_email@example.com"  # Replace with your email username
    smtp_password = "your_email_password"  # Replace with your email password or an app password
    smtp_timeout_seconds = 30  # Bound connect/read time so a hung server cannot block the caller

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = sender_email
    msg["To"] = receiver_email

    try:
        # The context manager closes the connection even when starttls/login/sendmail raises.
        with smtplib.SMTP(smtp_server, smtp_port, timeout=smtp_timeout_seconds) as server:
            server.starttls()
            server.login(smtp_username, smtp_password)
            server.sendmail(sender_email, [receiver_email], msg.as_string())
        logger.info("Email alert sent successfully.")
    except Exception as e:
        logger.error(f"Error sending email alert: {e}")



def detect_anomalies(df):
    """
    Detects anomalies in the given DataFrame using Isolation Forest.

    Only numeric columns are used as model features; missing values in those
    columns are replaced by the column mean. Non-numeric columns and numeric
    columns that contain no values at all are ignored for detection but are
    kept in the returned rows.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The rows of ``df`` flagged as anomalies, or an empty
        DataFrame (same columns as ``df``) if no anomalies are found or there
        is nothing numeric to analyse.
    """
    # Isolation Forest needs numeric input; an all-NaN column has no mean to impute with.
    features = df.select_dtypes(include="number").dropna(axis=1, how="all")
    if features.empty:
        # No rows or no usable columns: nothing can be flagged.
        return df.iloc[0:0]

    # Impute on the derived frame so the caller's DataFrame is never modified.
    features = features.fillna(features.mean())

    # Initialize and fit the Isolation Forest model
    model = IsolationForest(contamination='auto', random_state=42)  # Set random_state for reproducibility
    model.fit(features)

    # Predict anomalies (returns 1 for inliers, -1 for outliers)
    anomaly_labels = model.predict(features)

    # Return the original (un-imputed) rows that were flagged
    return df[anomaly_labels == -1]



def _run_monitoring_cycle(file_path):
    """
    Runs one simulate / detect / validate / alert pass over the CSV file.

    Args:
        file_path (str): The path to the CSV file.
    """
    # Simulate data updates
    try:
        df = pd.read_csv(file_path)
        # Introduce random changes (20% of the passes)
        if random.random() < 0.2:
            if 'age' in df.columns:
                df['age'] = df['age'].apply(lambda x: x + random.randint(-5, 5))
            if 'income' in df.columns:
                df['income'] = df['income'].apply(lambda x: x * (1 + random.uniform(-0.1, 0.1)) if pd.notnull(x) else None)
        df.to_csv(file_path, index=False)
        logger.info(f"Simulated data update at {datetime.datetime.now()}")
    except Exception as e:
        logger.error(f"Error reading or updating data: {e}")
        return  # Nothing to check this pass; the caller still waits before retrying

    # 1. Isolation Forest anomaly detection. A model failure must not stop the
    #    remaining checks, so it is logged and treated as "no anomalies".
    try:
        anomaly_data = detect_anomalies(df)
    except Exception as e:
        logger.error(f"Isolation Forest anomaly detection failed: {e}")
        anomaly_data = df.iloc[0:0]

    # 2. Simple rule-based outlier check: age outside 0-120. Skipped when the
    #    column is absent, matching the guards used when perturbing the data.
    if 'age' in df.columns:
        anomalous_age_data = df[(df['age'] < 0) | (df['age'] > 120)]
    else:
        anomalous_age_data = df.iloc[0:0]
    if not anomalous_age_data.empty:
        logger.warning(f"Outliers detected in 'age' column: {anomalous_age_data.to_string()}")

    # Validate the file with Great Expectations
    validation_status = profile_data_with_great_expectations(file_path)

    # 3. Alert when validation fails or either detector found something
    if not validation_status or not anomaly_data.empty or not anomalous_age_data.empty:
        subject = "Data Quality and Anomaly Alert"
        body = f"Data quality check failed at {datetime.datetime.now()}.  Please investigate the data in {file_path}."
        if not validation_status:
            body += "\n\nGreat Expectations validation failed."
        if not anomaly_data.empty:
            body += "\n\nAnomalies detected by Isolation Forest:\n"
            body += anomaly_data.to_string()
        if not anomalous_age_data.empty:
            body += "\n\nAnomalies detected in age column:\n"
            body += anomalous_age_data.to_string()
        send_email_alert(subject, body)
        logger.error("Data quality check failed!")
    else:
        logger.info("Data quality check successful.")


def monitor_data_quality(file_path):
    """
    Monitors data quality in real-time (simulated) by repeatedly profiling
    data with Great Expectations and using AI (Isolation Forest) to detect anomalies.
    Sends alerts if quality drops or anomalies are detected.

    The loop never returns. It waits 60 seconds after every pass, including
    passes that fail, so a persistent error cannot turn into a busy loop.

    Args:
        file_path (str): The path to the CSV file.
    """
    while True:
        try:
            _run_monitoring_cycle(file_path)
        except Exception:
            # Thread boundary: an unexpected error would otherwise end monitoring silently.
            logger.exception("Unexpected error in monitoring cycle; retrying after the wait.")

        time.sleep(60)

if __name__ == "__main__":
    import os  # local import: `os` is not imported at the top of this cell

    # Get the file path from the user using st.text_input
    default_file_path = "data.csv"  # Replace with your default CSV file name
    file_path = st.text_input("Enter the path to your CSV file:", default_file_path)

    # Create a sample CSV file if it doesn't exist
    if not os.path.exists(file_path):
        df = pd.DataFrame({
            'age': [20, 30, 40, 50, 60],
            'income': [50000, 60000, 70000, 80000, 90000]
        })
        df.to_csv(file_path, index=False)
        st.info(f"Created a sample CSV file at {file_path}.  You can replace this with your own data.")

    # Profile data with Pandas-Profiling
    st.header("Data Profiling with Pandas-Profiling")
    profile_data_with_pandas_profiling(file_path)

    # Profile data with Great Expectations
    st.header("Data Profiling with Great Expectations")
    profile_data_with_great_expectations(file_path)

    # Start real-time monitoring in a separate thread, at most once per file.
    # This block runs again on every script/cell re-run; without the guard each
    # run would add another monitor that rewrites the CSV and sends its own alerts.
    monitor_thread_name = f"data-quality-monitor:{file_path}"
    monitor_already_running = any(
        thread.name == monitor_thread_name and thread.is_alive()
        for thread in threading.enumerate()
    )
    if not monitor_already_running:
        monitoring_thread = threading.Thread(
            target=monitor_data_quality,
            args=(file_path,),
            name=monitor_thread_name,
            daemon=True,  # Allow the main thread to exit
        )
        monitoring_thread.start()

    st.subheader("Real-time Data Quality Monitoring")
    st.write("Data quality is being monitored in the background.  "
             "Check the console for log messages and email alerts (if any issues are detected).")

    # Keep the main thread alive so the daemon monitor keeps running.
    # NOTE(review): this loop never returns, so the script/cell never finishes - confirm
    # that is intended for the way this file is launched.
    while True:
        time.sleep(600)
        st.info("The main thread is still alive. Monitoring is running in the background...")


ModuleNotFoundError: No module named 'pandas_profiling'