## AI & Machine Learning for Data Quality
**Description**: AI and machine learning can automate and enhance data quality checks by learning patterns and identifying anomalies more effectively than static rules.

**Task 1**: Training a model to predict and flag unusual trend patterns in sales data that
deviate from historical norms.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

def generate_sample_data(n_samples=730, anomaly_percentage=0.05):
    """
    Generate synthetic daily sales data with injected anomalies.

    The series starts on 2023-01-01 and combines a weekday effect, a yearly
    sine cycle, a linear upward trend and Gaussian noise. Black Friday and
    Christmas uplifts are applied for every calendar year the series covers,
    after which random anomalies (drops, spikes, multi-day level shifts) are
    injected. Lag, rolling-window, velocity and acceleration features are
    then derived, and rows whose features are incomplete are dropped.

    n_samples: Number of days to generate (2 years by default)
    anomaly_percentage: Fraction of days (0-1) chosen as anomaly start points

    Returns a tuple (df, anomaly_indices):
        df: DataFrame with 'date', 'sales' and the derived feature columns,
            re-indexed from 0 after incomplete leading rows are removed.
        anomaly_indices: Row positions *within the returned df* where an
            anomaly was injected. Anomalies that fell in the dropped leading
            rows are omitted. For a level shift only the first day is listed.
    """
    # Set seed for reproducibility (this seeds NumPy's global generator)
    np.random.seed(42)

    # Create date range
    start_date = datetime(2023, 1, 1)
    dates = [start_date + timedelta(days=i) for i in range(n_samples)]

    # Base sales pattern with weekly seasonality and general upward trend
    weekday_effect = np.array([0.8, 1.0, 1.1, 1.2, 1.3, 1.5, 0.9])  # Mon-Sun
    weekday_indices = np.array([d.weekday() for d in dates], dtype=int)

    # Create seasonal components
    day_of_year = np.array([d.timetuple().tm_yday for d in dates])
    yearly_cycle = 0.3 * np.sin(2 * np.pi * day_of_year / 365) + 0.2  # Yearly cycle

    # Trend component (gradual growth)
    trend = np.linspace(0, 0.5, n_samples)

    # Base sales with seasonality and trend
    base_sales = 100 + 20 * weekday_effect[weekday_indices] + 15 * yearly_cycle + 30 * trend

    # Add random noise
    noise = np.random.normal(0, 5, n_samples)
    sales = base_sales + noise

    # Calendar years covered by the series, so holiday effects follow n_samples
    # instead of being tied to a fixed list of years
    years = sorted({d.year for d in dates})

    # Add some special events (peaks)
    # Black Friday effects
    for year in years:
        # Approximate Black Friday as Nov 25
        bf_idx = (datetime(year, 11, 25) - start_date).days
        if 0 <= bf_idx < n_samples:
            sales[bf_idx] *= 2.5  # Major sales spike
            # Days before and after also affected
            # NOTE(review): the "before" multiplier grows with distance from
            # Black Friday (1.3x one day before, 1.5x three days before);
            # kept as in the original - confirm this is intended.
            for i in range(1, 4):
                if bf_idx - i >= 0:
                    sales[bf_idx - i] *= (1.2 + 0.1 * i)  # Ramp up
                if bf_idx + i < n_samples:
                    sales[bf_idx + i] *= (1.3 - 0.1 * i)  # Ramp down

    # Add holiday effects (Christmas season, Dec 10 up to but excluding Dec 26)
    for year in years:
        christmas_start = (datetime(year, 12, 10) - start_date).days
        christmas_end = (datetime(year, 12, 26) - start_date).days
        if 0 <= christmas_start < n_samples:
            # Slice end is exclusive, so clamp to n_samples (not n_samples-1)
            # to avoid skipping the final day when the window is truncated
            end_idx = min(christmas_end, n_samples)
            sales[christmas_start:end_idx] *= 1.8

    # Introduce anomalies
    n_anomalies = int(n_samples * anomaly_percentage)
    anomaly_indices = np.random.choice(n_samples, size=n_anomalies, replace=False)

    # Different types of anomalies
    for idx in anomaly_indices:
        anomaly_type = np.random.choice(['drop', 'spike', 'level_shift'])

        if anomaly_type == 'drop':
            sales[idx] *= np.random.uniform(0.1, 0.5)  # Sharp drop
        elif anomaly_type == 'spike':
            sales[idx] *= np.random.uniform(1.8, 3.0)  # Sharp spike
        elif anomaly_type == 'level_shift':
            # Level shift affecting multiple days; clip at the end of the
            # series so an index chosen as anomalous is always altered
            shift_length = np.random.randint(3, 10)
            shift_end = min(idx + shift_length, n_samples)
            sales[idx:shift_end] *= np.random.uniform(1.5, 2.0)

    # Create DataFrame
    df = pd.DataFrame({
        'date': dates,
        'sales': sales,
    })

    # Add features for detection
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['year'] = df['date'].dt.year

    # Create lag features
    for lag in [1, 7, 14]:
        df[f'sales_lag_{lag}'] = df['sales'].shift(lag)

    # Create rolling stats
    for window in [7, 14, 30]:
        df[f'sales_rolling_mean_{window}'] = df['sales'].rolling(window=window).mean()
        df[f'sales_rolling_std_{window}'] = df['sales'].rolling(window=window).std()

    # Create sales velocity and acceleration
    df['sales_velocity'] = df['sales'] - df['sales_lag_1']
    df['sales_acceleration'] = df['sales_velocity'] - df['sales_velocity'].shift(1)

    # Drop rows with NaN (leading rows where lag/rolling features are not yet
    # defined; the 30-day rolling window is the longest look-back)
    complete_rows = df.notna().all(axis=1).to_numpy()
    # Position each original row will have after the incomplete rows are removed
    new_position = np.cumsum(complete_rows) - 1
    df = df[complete_rows].reset_index(drop=True)

    # Re-express the anomaly indices relative to the trimmed frame so they
    # point at the correct rows of the returned DataFrame
    surviving = anomaly_indices[complete_rows[anomaly_indices]]
    anomaly_indices = new_position[surviving]

    return df, anomaly_indices

def detect_anomalies(df, contamination=0.05):
    """
    Detect anomalies in sales data using Isolation Forest

    df: DataFrame of features; every column except 'date' and the two output
        columns written by this function is used as a model feature.
    contamination: Passed to IsolationForest as the expected share of outliers

    Returns the same DataFrame object with two columns added (or overwritten):
        'anomaly': 1 for rows flagged as anomalous, 0 otherwise
        'anomaly_score': model decision function (lower = more anomalous)
    """
    # Select features for anomaly detection. The output columns are excluded
    # so that calling this function again on an already-scored frame does not
    # feed the previous predictions back in as features.
    non_feature_columns = ('date', 'anomaly', 'anomaly_score')
    feature_columns = [col for col in df.columns if col not in non_feature_columns]

    # Fill any remaining NaN values with column means
    df_features = df[feature_columns].fillna(df[feature_columns].mean())

    # Standardize the features
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_features)

    # Train isolation forest model
    model = IsolationForest(
        contamination=contamination,
        random_state=42,
        n_estimators=100
    )

    # Fit and predict; IsolationForest labels outliers as -1, which is
    # converted to a binary flag (1 = anomaly, 0 = normal)
    predictions = model.fit_predict(df_scaled)
    df['anomaly'] = np.where(predictions == -1, 1, 0)

    # Score (lower score = more anomalous)
    df['anomaly_score'] = model.decision_function(df_scaled)

    return df

def get_feature_importance(df):
    """
    Rank features by how strongly they explain the anomaly flag.

    A Random Forest classifier is fitted with the 'anomaly' column as the
    target and every column other than 'date', 'anomaly' and 'anomaly_score'
    as input. Returns a DataFrame with 'Feature' and 'Importance' columns,
    sorted from most to least important.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Columns that are identifiers or model outputs, not predictors
    non_features = ['date', 'anomaly', 'anomaly_score']
    predictors = [name for name in df.columns if name not in non_features]

    # Fit the surrogate classifier on the detector's labels
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(df[predictors], df['anomaly'])

    # Pair each predictor with its importance and rank descending
    ranking = pd.DataFrame({
        'Feature': predictors,
        'Importance': forest.feature_importances_,
    })
    return ranking.sort_values('Importance', ascending=False)

def visualize_anomalies(df):
    """
    Plot the sales series and mark the rows flagged as anomalies.

    Expects 'date', 'sales' and 'anomaly' columns. Draws on a new pyplot
    figure and returns the pyplot module so the caller can show or save it.
    """
    plt.figure(figsize=(15, 8))

    # Sales line
    plt.plot(df['date'], df['sales'], label='Sales', color='blue', alpha=0.7)

    # Overlay the flagged points in red
    flagged = df[df['anomaly'] == 1]
    marker_label = f'Anomalies ({len(flagged)} detected)'
    plt.scatter(flagged['date'], flagged['sales'], color='red',
                label=marker_label, s=50)

    # Titles, axis labels and cosmetics
    plt.title('Sales Data with Detected Anomalies', fontsize=16)
    for set_axis_label, text in ((plt.xlabel, 'Date'), (plt.ylabel, 'Sales')):
        set_axis_label(text, fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    return plt

def analyze_anomalies(df):
    """
    Summarise the rows flagged as anomalies.

    Expects 'anomaly', 'sales', 'day_of_week' and 'month' columns. Returns a
    tuple (stats, anomalies) where stats is a dict of sales statistics plus
    the weekday and month distributions of the anomalies, and anomalies is
    the subset of df whose 'anomaly' value is 1.
    """
    def labelled_counts(values, labels, first_code):
        # Count occurrences per code, order by code, then swap codes for names
        counts = values.value_counts().sort_index()
        counts.index = [labels[code - first_code] for code in counts.index]
        return counts

    weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    flagged = df[df['anomaly'] == 1]
    unflagged = df[df['anomaly'] == 0]

    # Weekday codes start at 0 (Monday); month codes start at 1 (January)
    by_weekday = labelled_counts(flagged['day_of_week'], weekday_labels, 0)
    by_month = labelled_counts(flagged['month'], month_labels, 1)

    flagged_sales = flagged['sales']
    unflagged_sales = unflagged['sales']

    # Basic statistics for anomalies vs normal rows
    stats = {
        'avg_sales_anomalies': flagged_sales.mean(),
        'avg_sales_normal': unflagged_sales.mean(),
        'std_sales_anomalies': flagged_sales.std(),
        'std_sales_normal': unflagged_sales.std(),
        'max_sales_anomalies': flagged_sales.max(),
        'min_sales_anomalies': flagged_sales.min(),
        'dow_distribution': by_weekday,
        'month_distribution': by_month,
    }

    return stats, flagged

def main():
    """
    Run the full pipeline: generate data, detect anomalies, report, plot.

    Prints the data shape and head, the anomaly counts and averages, the
    weekday and month distributions, the ten lowest-scoring anomalies and the
    top ten feature importances, then shows the anomaly plot.
    """
    print("Generating sample sales data...")
    # The injected-anomaly indices are not used in this report
    sales_df, _ = generate_sample_data(n_samples=730, anomaly_percentage=0.05)

    print(f"Data shape: {sales_df.shape}")
    print("\nFirst few rows of the data:")
    print(sales_df.head())

    print("\nDetecting anomalies using Isolation Forest...")
    scored_df = detect_anomalies(sales_df, contamination=0.05)

    # Summary statistics and the flagged subset
    summary, flagged = analyze_anomalies(scored_df)

    print(f"\nDetected {len(flagged)} anomalies out of {len(scored_df)} data points")
    print(f"Average sales on normal days: {summary['avg_sales_normal']:.2f}")
    print(f"Average sales on anomalous days: {summary['avg_sales_anomalies']:.2f}")

    distribution_sections = (
        ("\nDay of week distribution for anomalies:", 'dow_distribution'),
        ("\nMonth distribution for anomalies:", 'month_distribution'),
    )
    for heading, key in distribution_sections:
        print(heading)
        print(summary[key])

    print("\nTop anomalies by anomaly score:")
    # Lowest scores first, i.e. the most anomalous rows
    worst = flagged.sort_values('anomaly_score').head(10)
    report_columns = ['date', 'sales', 'anomaly_score']
    print(worst[report_columns].to_string(index=False))

    # Feature importance for the anomaly detection
    ranking = get_feature_importance(scored_df)
    print("\nFeature importance for anomaly detection:")
    print(ranking.head(10).to_string(index=False))

    # Visualize the results
    plot_api = visualize_anomalies(scored_df)
    plot_api.show()


if __name__ == "__main__":
    main()

**Task 2**: Using clustering algorithms to detect near-duplicate records, i.e. entries that
are similar but not exactly identical.

In [None]:
# write your code from here

**Task 3**: Implementing classification models to validate data based on learned
characteristics from labeled datasets.

In [None]:
# write your code from here
