## Using AI for Anomalies Detection in Data Quality
**Description**: Implement an AI-based approach to detect anomalies in data quality.

**Steps**:
1. Use an Anomaly Detection Algorithm:
    - Use sklearn's Isolation Forest for anomaly detection.

**Example data:**

data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

2. Integrate with Great Expectations:
    - Generate alerts if anomalies are detected:

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
import great_expectations as ge

# Sample data with a missing value (None)
data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

# Convert to DataFrame for convenience
df = pd.DataFrame(data, columns=["Age", "Salary"])

# Step 1: Handle missing data (simple imputation: fill missing with median)
df["Salary"] = pd.to_numeric(df["Salary"], errors='coerce')
salary_median = df["Salary"].median()
df["Salary"].fillna(salary_median, inplace=True)

print("Data after imputation:")
print(df)

# Step 2: Anomaly detection using Isolation Forest
model = IsolationForest(contamination=0.2, random_state=42)
model.fit(df)

# Predict anomalies (-1 means anomaly, 1 means normal)
df['anomaly'] = model.predict(df)

print("\nAnomaly predictions (-1=anomaly, 1=normal):")
print(df[['Age', 'Salary', 'anomaly']])

# Step 3: Use Great Expectations PandasDataset for expectations
def create_ge_dataset(df):
    ge_df = ge.from_pandas(df)

    ge_df.expect_column_to_exist("Age")
    ge_df.expect_column_values_to_not_be_null("Age")
    ge_df.expect_column_to_exist("Salary")
    ge_df.expect_column_values_to_not_be_null("Salary")

    return ge_df

# Validate data quality expectations
def validate_data(ge_df):
    results = ge_df.validate()
    return results["success"]

# Main alert function
def alert_if_anomaly_detected(df):
    if (df['anomaly'] == -1).any():
        print("🚨 ALERT: Anomalies detected in data quality!")
        anomalies = df[df['anomaly'] == -1]
        print(anomalies)
    else:
        print("Data quality is normal — no anomalies detected.")

def main():
    ge_df = create_ge_dataset(df)
    is_data_quality_good = validate_data(ge_df)
    if not is_data_quality_good:
        print("Basic data quality checks failed!")
    else:
        print("Basic data quality checks passed.")

    alert_if_anomaly_detected(df)

if __name__ == "__main__":
    main()

Data after imputation:
  Age    Salary
0  25   50000.0
1  30   60000.0
2  35   75000.0
3  40   67500.0
4  45  100000.0

Anomaly predictions (-1=anomaly, 1=normal):
  Age    Salary  anomaly
0  25   50000.0        1
1  30   60000.0        1
2  35   75000.0        1
3  40   67500.0        1
4  45  100000.0       -1


AttributeError: module 'great_expectations' has no attribute 'from_pandas'