In [2]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import json
import warnings
import os

# Machine Learning libraries
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split

# Time Series libraries
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet

# Deep Learning (if needed)
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Input

# Visualization
import plotly.express as px
import plotly.graph_objects as go

# Set global parameters
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8-*' in
# matplotlib 3.6 and the old names were removed later, so use whichever
# spelling this installation actually ships instead of hard-coding one.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_style('whitegrid')
# NOTE: silences every warning for the whole session (including deprecations)
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_columns', None)  # never truncate columns in DataFrame reprs
pd.set_option('display.width', 1000)

# Check TensorFlow version
print(f"TensorFlow version: {tf.__version__}")


ImportError: DLL load failed while importing _pywrap_tfe: The specified procedure could not be found.

In [None]:
# Define paths
# All data files live under DATA_DIR (relative to the notebook's working directory).
DATA_DIR = '../../data/'
SYNTHETIC_DATA_PATH, NSE_DATA_PATH, MPESA_PATTERNS_PATH = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ('synthetic_transactions.csv', 'nse_historical.csv', 'mpesa_patterns.csv')
)

# Function to check if data exists, otherwise generate synthetic data
def get_transaction_data(path=SYNTHETIC_DATA_PATH, n_samples=10000, generate_if_missing=True):
    """Return the transaction table, reading it from disk when it is already there.

    Args:
        path: CSV file to read; also handed to the generator as its save path.
        n_samples: Number of rows to generate when the file is missing.
        generate_if_missing: When False, a missing file raises instead of
            triggering generation.

    Returns:
        DataFrame read from ``path`` or produced by
        ``generate_synthetic_transactions``.

    Raises:
        FileNotFoundError: ``path`` does not exist and generation is disabled.
    """
    # An existing file always wins over regeneration
    if os.path.exists(path):
        print(f"Loading data from {path}")
        return pd.read_csv(path)

    if not generate_if_missing:
        raise FileNotFoundError(f"Data file not found at {path}")

    print(f"Generating synthetic data...")
    return generate_synthetic_transactions(n_samples, path)

# Generate synthetic transaction data with known anomalies
def generate_synthetic_transactions(n_samples=10000, save_path=None):
    """Generate synthetic transaction data with embedded anomalies.

    ``int(n_samples * 0.05)`` distinct rows are turned into anomalies and
    flagged with ``is_anomaly = 1``. They are split into three groups:

    1. unusually large amounts (the sampled amount multiplied by 10-20),
    2. transaction bursts: rows are grouped into small bursts that all share
       the customer and the date of the first row of the burst,
    3. transactions from an unusual location (``'International'``).

    Args:
        n_samples: Number of transactions to generate.
        save_path: Optional CSV path. Missing parent directories are created;
            a bare filename (no directory part) is written as-is.

    Returns:
        DataFrame with columns transaction_id, date (``'%Y-%m-%d'`` string),
        customer_id, transaction_type, amount, location and is_anomaly (0/1).
    """
    # Fixed seed keeps the output reproducible (this resets NumPy's global RNG)
    np.random.seed(42)

    # Create date range (one candidate per calendar day, end date excluded)
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2025, 2, 28)
    dates = [start_date + timedelta(days=x) for x in range((end_date - start_date).days)]

    # Transaction types
    transaction_types = ['deposit', 'withdrawal', 'transfer', 'payment', 'loan_repayment']

    # Generate normal transactions
    data = {
        'transaction_id': [f'TXN{i:06d}' for i in range(n_samples)],
        # Explicit conversion: do not rely on pandas inferring datetimes from an object array
        'date': pd.to_datetime(np.random.choice(dates, n_samples)),
        'customer_id': np.random.randint(1000, 9999, n_samples),
        'transaction_type': np.random.choice(transaction_types, n_samples,
                                            p=[0.3, 0.25, 0.2, 0.15, 0.1]),
        'amount': np.random.lognormal(mean=8, sigma=1, size=n_samples),  # KES amounts
        'location': np.random.choice(['Nairobi', 'Mombasa', 'Kisumu', 'Nakuru', 'Eldoret'], n_samples),
        'is_anomaly': np.zeros(n_samples, dtype=int)  # 0 = normal transaction
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Format amount to be more realistic (Kenyan Shillings)
    df['amount'] = df['amount'].round(2)

    # Generate anomalies (5% of data); indices are distinct so the groups never overlap
    anomaly_count = int(n_samples * 0.05)
    anomaly_indices = np.random.choice(n_samples, anomaly_count, replace=False)
    large_txn_indices = anomaly_indices[:anomaly_count // 3]
    freq_anomaly_indices = anomaly_indices[anomaly_count // 3:2 * anomaly_count // 3]
    location_anomaly_indices = anomaly_indices[2 * anomaly_count // 3:]

    # Anomaly type 1: Unusually large transactions
    inflated = df.loc[large_txn_indices, 'amount'] * np.random.uniform(10, 20, len(large_txn_indices))
    df.loc[large_txn_indices, 'amount'] = inflated.round(2)
    df.loc[large_txn_indices, 'is_anomaly'] = 1

    # Anomaly type 2: Unusual transaction frequency.
    # Each burst re-assigns its rows to the customer and date of the burst's
    # first row, so that customer shows several transactions on a single day.
    # The total row count stays at n_samples.
    burst_size = 5
    for start in range(0, len(freq_anomaly_indices), burst_size):
        burst = freq_anomaly_indices[start:start + burst_size]
        anchor = burst[0]
        df.loc[burst, 'customer_id'] = df.at[anchor, 'customer_id']
        df.loc[burst, 'date'] = df.at[anchor, 'date']
    df.loc[freq_anomaly_indices, 'is_anomaly'] = 1

    # Anomaly type 3: Transactions from unusual locations
    df.loc[location_anomaly_indices, 'location'] = 'International'
    df.loc[location_anomaly_indices, 'is_anomaly'] = 1

    # Convert date to string for CSV storage
    df['date'] = df['date'].dt.strftime('%Y-%m-%d')

    # Save to file if path provided
    if save_path:
        parent_dir = os.path.dirname(save_path)
        if parent_dir:  # os.makedirs('') raises, so skip it for bare filenames
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(save_path, index=False)
        print(f"Synthetic data saved to {save_path}")

    return df

# Load or generate transaction data
# With the defaults this reads SYNTHETIC_DATA_PATH when the file exists and
# otherwise generates 10,000 synthetic rows and saves them to that path.
transactions_df = get_transaction_data()

# Display the first few rows
transactions_df.head()

In [None]:
def preprocess_transaction_data(df):
    """Preprocess transaction data for anomaly detection.

    Works on a copy of ``df``. Steps:
      * parse ``date`` into datetimes and derive day_of_week, day_of_month,
        month and year,
      * add ``customer_txn_count`` (number of rows per customer_id),
      * one-hot encode ``transaction_type`` and ``location`` as 0/1 integers,
      * drop ``transaction_id`` when present.

    Args:
        df: Raw transactions with at least date, customer_id, transaction_type
            and location columns.

    Returns:
        A new DataFrame; the merge gives it a fresh 0..n-1 index.
    """
    # Make a copy to avoid modifying the original
    df_processed = df.copy()

    # Convert date to datetime
    df_processed['date'] = pd.to_datetime(df_processed['date'])

    # Extract date features
    df_processed['day_of_week'] = df_processed['date'].dt.dayofweek
    df_processed['day_of_month'] = df_processed['date'].dt.day
    df_processed['month'] = df_processed['date'].dt.month
    df_processed['year'] = df_processed['date'].dt.year

    # Create transaction frequency features
    customer_txn_counts = df_processed.groupby('customer_id').size().reset_index(name='customer_txn_count')
    df_processed = pd.merge(df_processed, customer_txn_counts, on='customer_id', how='left')

    # One-hot encode categorical features.
    # dtype=int matters: pandas >= 2.0 produces bool dummies by default, and the
    # select_dtypes(include=[np.number]) calls used later in this notebook would
    # silently exclude them from correlations and model features.
    df_processed = pd.get_dummies(df_processed, columns=['transaction_type', 'location'],
                                  drop_first=False, dtype=int)

    # For privacy and to focus on patterns, remove transaction_id
    df_processed = df_processed.drop(columns='transaction_id', errors='ignore')

    return df_processed

# Preprocess the data
# (transactions_df itself is left untouched; the function works on a copy)
processed_df = preprocess_transaction_data(transactions_df)

# Display the processed data
processed_df.head()

In [None]:
def engineer_features(df):
    """Add per-customer amount statistics and recency features.

    New columns, in order: customer_mean_amount, customer_std_amount,
    customer_min_amount, customer_max_amount, amount_zscore and
    days_since_first_txn. The input frame is not modified.

    Args:
        df: Frame with customer_id, amount and a datetime ``date`` column.

    Returns:
        A new DataFrame with the extra columns appended.
    """
    enriched = df.copy()

    # Per-customer summary of the transaction amounts
    per_customer = (
        enriched.groupby('customer_id')['amount']
        .agg(customer_mean_amount='mean', customer_std_amount='std',
             customer_min_amount='min', customer_max_amount='max')
        .reset_index()
    )

    # std is NaN for single-transaction customers; the epsilon keeps the
    # z-score division below away from zero
    per_customer['customer_std_amount'] = per_customer['customer_std_amount'].fillna(0) + 1e-6

    enriched = enriched.merge(per_customer, on='customer_id', how='left')

    # How far each amount sits from that customer's own mean, in std units
    deviation = enriched['amount'] - enriched['customer_mean_amount']
    enriched['amount_zscore'] = deviation / enriched['customer_std_amount']

    # Days elapsed since the customer's earliest transaction
    first_seen = (
        enriched.groupby('customer_id')['date'].min()
        .rename('first_transaction_date')
        .reset_index()
    )
    enriched = enriched.merge(first_seen, on='customer_id', how='left')
    elapsed = enriched['date'] - enriched['first_transaction_date']
    enriched['days_since_first_txn'] = elapsed.dt.days

    # The helper column was only needed for the subtraction above
    return enriched.drop(columns='first_transaction_date')

def normalize_features(df, features_to_normalize=None):
    """Standardize numerical features with a StandardScaler.

    Args:
        df: Input frame; it is copied, not modified.
        features_to_normalize: Column names to scale. Defaults to the amount
            and per-customer statistic columns listed below. Names that are
            not present in ``df`` are ignored.

    Returns:
        Tuple ``(df_normalized, scaler)``. ``scaler`` is the fitted
        StandardScaler, or ``None`` when none of the requested columns exist
        (in which case the returned frame is an unchanged copy).
    """
    df_normalized = df.copy()

    if features_to_normalize is None:
        # Default features to normalize
        features_to_normalize = ['amount', 'customer_txn_count', 'customer_mean_amount',
                                 'customer_std_amount', 'customer_min_amount', 'customer_max_amount',
                                 'amount_zscore', 'days_since_first_txn']

    # Filter to include only columns that exist in the dataframe
    features_to_normalize = [f for f in features_to_normalize if f in df_normalized.columns]

    # Bound up front so the return below is valid even when nothing is scaled
    scaler = None
    if features_to_normalize:
        scaler = StandardScaler()
        df_normalized[features_to_normalize] = scaler.fit_transform(df_normalized[features_to_normalize])

    return df_normalized, scaler

# Engineer features
featured_df = engineer_features(processed_df)

# Normalize features
# featured_df keeps the original-scale values; normalized_df is a standardized
# copy and `scaler` is the StandardScaler that was fitted on it.
normalized_df, scaler = normalize_features(featured_df)

# Display the enriched dataset
normalized_df.head()

In [None]:
def _categorical_column(df, column):
    """Return ``column`` from df, rebuilding it from one-hot columns if needed."""
    if column in df.columns:
        return df[column]

    prefix = f'{column}_'
    dummy_cols = [c for c in df.columns if str(c).startswith(prefix)]
    if not dummy_cols:
        raise KeyError(f"Neither '{column}' nor one-hot columns '{prefix}*' found in dataframe")

    # The dummy holding the 1 names the category; strip the prefix to get the label
    labels = df[dummy_cols].astype(int).idxmax(axis=1).str[len(prefix):]
    return labels.rename(column)


def plot_transaction_distributions(df):
    """Plot distributions of key transaction features, split by anomaly label.

    Draws a 2x2 grid: amount histogram, transaction-type counts, day-of-week
    counts and location counts, each broken down by ``is_anomaly``.

    Accepts either the raw categorical columns (``transaction_type``,
    ``location``) or their one-hot encoded form (``transaction_type_*``,
    ``location_*``), so it works on the preprocessed/featured frames too.
    ``day_of_week`` is derived from ``date`` when the column is absent.

    Args:
        df: Transactions with amount, is_anomaly and the columns above.

    Returns:
        The matplotlib Figure holding the four panels.

    Raises:
        KeyError: A categorical feature is present in neither form.
    """
    labels = df['is_anomaly']
    txn_type = _categorical_column(df, 'transaction_type')
    location = _categorical_column(df, 'location')
    if 'day_of_week' in df.columns:
        day_of_week = df['day_of_week']
    else:
        day_of_week = pd.to_datetime(df['date']).dt.dayofweek.rename('day_of_week')

    # Set up the figure
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Transaction amount distribution
    sns.histplot(data=df, x='amount', hue='is_anomaly', bins=50, ax=axes[0, 0])
    axes[0, 0].set_title('Transaction Amount Distribution', fontsize=14)
    axes[0, 0].set_xlabel('Amount (KES)', fontsize=12)
    axes[0, 0].set_ylabel('Count', fontsize=12)

    # Transaction type distribution
    txn_type_counts = df.groupby([txn_type, labels]).size().unstack().fillna(0)
    txn_type_counts.plot(kind='bar', stacked=True, ax=axes[0, 1])
    axes[0, 1].set_title('Transaction Types', fontsize=14)
    axes[0, 1].set_xlabel('Transaction Type', fontsize=12)
    axes[0, 1].set_ylabel('Count', fontsize=12)

    # Transaction by day of week
    dow_counts = df.groupby([day_of_week, labels]).size().unstack().fillna(0)
    dow_counts.plot(kind='bar', stacked=True, ax=axes[1, 0])
    axes[1, 0].set_title('Transactions by Day of Week', fontsize=14)
    axes[1, 0].set_xlabel('Day of Week (0=Monday, 6=Sunday)', fontsize=12)
    axes[1, 0].set_ylabel('Count', fontsize=12)

    # Location distribution
    loc_counts = df.groupby([location, labels]).size().unstack().fillna(0)
    loc_counts.plot(kind='bar', stacked=True, ax=axes[1, 1])
    axes[1, 1].set_title('Transactions by Location', fontsize=14)
    axes[1, 1].set_xlabel('Location', fontsize=12)
    axes[1, 1].set_ylabel('Count', fontsize=12)

    plt.tight_layout()
    return fig

def plot_anomaly_zscore_distribution(df):
    """Overlay the amount z-score densities of normal and anomalous rows.

    Args:
        df: Frame with ``amount_zscore`` and a 0/1 ``is_anomaly`` column.

    Returns:
        The matplotlib Figure containing the two filled KDE curves.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # One filled density curve per label value, normal rows first
    curves = ((0, 'Normal Transactions'), (1, 'Anomalous Transactions'))
    for flag, legend_label in curves:
        subset = df[df['is_anomaly'] == flag]
        sns.kdeplot(data=subset, x='amount_zscore', label=legend_label, fill=True, ax=ax)

    ax.set_title('Z-score Distribution: Normal vs Anomalous Transactions', fontsize=14)
    ax.set_xlabel('Transaction Amount Z-score', fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)

    return fig

# Create and display distribution plots
# NOTE(review): featured_df descends from preprocess_transaction_data, which
# one-hot encodes 'transaction_type' and 'location' - verify the plotting
# function can cope with the encoded columns.
dist_fig = plot_transaction_distributions(featured_df)
plt.show()

# Plot Z-score distributions
zscore_fig = plot_anomaly_zscore_distribution(featured_df)
plt.show()

In [None]:
def plot_correlation_matrix(df):
    """Draw a lower-triangle heatmap of pairwise feature correlations.

    Only numeric columns are used and the ``is_anomaly`` label, if present,
    is left out of the matrix.

    Args:
        df: Frame containing the features.

    Returns:
        The matplotlib Figure holding the heatmap.
    """
    # Numeric features only, without the label column
    feature_df = df.select_dtypes(include=[np.number]).drop(columns='is_anomaly', errors='ignore')
    corr = feature_df.corr()

    fig, ax = plt.subplots(figsize=(14, 12))

    # Hide the diagonal and upper triangle: they only repeat the lower half
    hidden_cells = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, mask=hidden_cells, annot=True, fmt='.2f', cmap='coolwarm',
                linewidths=0.5, ax=ax)
    ax.set_title('Feature Correlation Matrix', fontsize=16)
    fig.tight_layout()

    return fig

def plot_feature_vs_anomaly(df):
    """Plot the features most positively correlated with the anomaly label.

    Args:
        df: Frame with numeric feature columns and a numeric ``is_anomaly``
            column.

    Returns:
        Tuple ``(figure, anomaly_corr)`` where ``anomaly_corr`` is a Series of
        each numeric feature's correlation with ``is_anomaly``, sorted in
        descending order. The label itself is not part of the Series.

    Raises:
        KeyError: ``df`` has no numeric ``is_anomaly`` column.
    """
    # Calculate correlation with anomaly label
    numeric_df = df.select_dtypes(include=[np.number])
    if 'is_anomaly' not in numeric_df.columns:
        raise KeyError("plot_feature_vs_anomaly requires a numeric 'is_anomaly' column")

    # Drop the label's correlation with itself (always 1.0); otherwise it takes
    # the top slot of the ranking and tells the reader nothing.
    anomaly_corr = (
        numeric_df.corr()['is_anomaly']
        .drop('is_anomaly')
        .sort_values(ascending=False)
    )

    # Plot top correlating features
    top_features = anomaly_corr.head(10)
    plt.figure(figsize=(12, 8))
    sns.barplot(x=top_features.index, y=top_features.values)
    plt.title('Top Features Correlated with Anomaly Label', fontsize=14)
    plt.xlabel('Features', fontsize=12)
    plt.ylabel('Correlation Coefficient', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    return plt.gcf(), anomaly_corr

# Plot correlation matrix
corr_fig = plot_correlation_matrix(featured_df)
plt.show()

# Plot feature vs anomaly correlation
# (the second return value is the sorted correlation Series printed below)
anomaly_corr_fig, anomaly_corr = plot_feature_vs_anomaly(featured_df)
plt.show()

# Display top correlations with anomaly label
print("Top features correlated with anomalies:")
print(anomaly_corr.head(10))


In [None]:
def detect_anomalies_zscore(df, zscore_col='amount_zscore', threshold=3.0):
    """Flag rows whose absolute z-score is strictly above ``threshold``.

    Prints the number of flagged rows and, when an ``is_anomaly`` ground-truth
    column exists, accuracy / precision / recall / F1 and the confusion matrix.

    Args:
        df: Frame containing ``zscore_col``; it is copied, not modified.
        zscore_col: Name of the column holding the z-scores.
        threshold: Cut-off applied to ``|z|``.

    Returns:
        Copy of ``df`` with an extra 0/1 ``zscore_anomaly`` column.
    """
    flagged = df.copy()

    # Two-sided test: large negative deviations count as well
    beyond_threshold = flagged[zscore_col].abs() > threshold
    flagged['zscore_anomaly'] = beyond_threshold.astype(int)

    n_flagged = flagged['zscore_anomaly'].sum()
    n_rows = len(flagged)
    print(f"Z-score method detected {n_flagged} anomalies ({n_flagged/n_rows:.2%} of data)")

    # Without labels there is nothing to score against
    if 'is_anomaly' not in flagged.columns:
        return flagged

    y_true = flagged['is_anomaly']
    y_pred = flagged['zscore_anomaly']
    accuracy = (y_pred == y_true).mean()
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    return flagged

# Apply Z-score anomaly detection
zscore_results = detect_anomalies_zscore(featured_df)

# Visualize Z-score anomalies
plt.figure(figsize=(12, 6))
plt.scatter(zscore_results['amount'], zscore_results['amount_zscore'], 
           c=zscore_results['zscore_anomaly'], cmap='coolwarm', alpha=0.7)
# The +/-3 guide lines mirror the function's default threshold=3.0; keep them
# in sync if a different threshold is passed above.
plt.axhline(y=3, color='r', linestyle='--', alpha=0.5, label='Threshold (+3)')
plt.axhline(y=-3, color='r', linestyle='--', alpha=0.5, label='Threshold (-3)')
plt.colorbar(label='Anomaly')
plt.title('Z-score Anomaly Detection', fontsize=14)
plt.xlabel('Transaction Amount (KES)', fontsize=12)
plt.ylabel('Z-score', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
def detect_anomalies_isolation_forest(df, features=None, contamination=0.05):
    """Flag outliers with an Isolation Forest fitted on ``df`` itself.

    Prints the number of flagged rows and, when an ``is_anomaly`` ground-truth
    column exists, accuracy / precision / recall / F1 and the confusion matrix.

    Args:
        df: Input frame; it is copied, not modified.
        features: Columns to feed the model. Defaults to every numeric column
            except is_anomaly, zscore_anomaly and customer_id.
        contamination: Passed straight to ``IsolationForest``.

    Returns:
        Tuple ``(result_df, model)``. ``result_df`` gains
        ``isolation_forest_score`` (the model's ``decision_function`` output)
        and a 0/1 ``isolation_forest_anomaly`` column (1 where the model
        predicts -1).
    """
    scored = df.copy()

    if features is None:
        # Labels, earlier detector flags and the customer key are not features
        excluded = {'is_anomaly', 'zscore_anomaly', 'customer_id'}
        numeric_names = scored.select_dtypes(include=[np.number]).columns.tolist()
        features = [name for name in numeric_names if name not in excluded]

    # Fixed random_state keeps the forest reproducible between runs
    forest = IsolationForest(contamination=contamination, random_state=42)
    forest.fit(scored[features])

    scored['isolation_forest_score'] = forest.decision_function(scored[features])
    predicted_outlier = forest.predict(scored[features]) == -1
    scored['isolation_forest_anomaly'] = predicted_outlier.astype(int)

    n_flagged = scored['isolation_forest_anomaly'].sum()
    n_rows = len(scored)
    print(f"Isolation Forest detected {n_flagged} anomalies ({n_flagged/n_rows:.2%} of data)")

    # Without labels there is nothing to score against
    if 'is_anomaly' not in scored.columns:
        return scored, forest

    y_true = scored['is_anomaly']
    y_pred = scored['isolation_forest_anomaly']
    accuracy = (y_pred == y_true).mean()
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    return scored, forest

# Apply Isolation Forest anomaly detection
# (fitted on featured_df, i.e. the original-scale features)
iforest_results, iforest_model = detect_anomalies_isolation_forest(featured_df)

# Visualize Isolation Forest anomalies
# Each point is one transaction: its amount against that customer's mean amount
plt.figure(figsize=(12, 6))
plt.scatter(iforest_results['amount'], iforest_results['customer_mean_amount'], 
           c=iforest_results['isolation_forest_anomaly'], cmap='coolwarm', alpha=0.7)
plt.colorbar(label='Anomaly')
plt.title('Isolation Forest Anomaly Detection', fontsize=14)
plt.xlabel('Transaction Amount (KES)', fontsize=12)
plt.ylabel('Customer Average Transaction Amount (KES)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
def detect_anomalies_lof(df, features=None, n_neighbors=20, contamination=0.05):
    """Flag outliers with a Local Outlier Factor model fitted on ``df`` itself.

    Prints the number of flagged rows and, when an ``is_anomaly`` ground-truth
    column exists, accuracy / precision / recall / F1 and the confusion matrix.

    Args:
        df: Input frame; it is copied, not modified.
        features: Columns to feed the model. Defaults to every numeric column
            except the label, the z-score / isolation-forest outputs and
            customer_id.
        n_neighbors: Passed straight to ``LocalOutlierFactor``.
        contamination: Passed straight to ``LocalOutlierFactor``.

    Returns:
        Copy of ``df`` with a 0/1 ``lof_anomaly`` column (1 where the model
        predicts -1) and ``lof_score``, the sign-flipped
        ``negative_outlier_factor_``.
    """
    scored = df.copy()

    if features is None:
        # Labels, earlier detector outputs and the customer key are not features
        excluded = {'is_anomaly', 'zscore_anomaly', 'isolation_forest_anomaly',
                    'isolation_forest_score', 'customer_id'}
        numeric_names = scored.select_dtypes(include=[np.number]).columns.tolist()
        features = [name for name in numeric_names if name not in excluded]

    detector = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)

    # fit_predict both fits the model and labels the same rows
    predicted = detector.fit_predict(scored[features])
    scored['lof_anomaly'] = (predicted == -1).astype(int)

    # Flip the sign of the fitted attribute so the stored score is positive-oriented
    scored['lof_score'] = -detector.negative_outlier_factor_

    n_flagged = scored['lof_anomaly'].sum()
    n_rows = len(scored)
    print(f"LOF detected {n_flagged} anomalies ({n_flagged/n_rows:.2%} of data)")

    # Without labels there is nothing to score against
    if 'is_anomaly' not in scored.columns:
        return scored

    y_true = scored['is_anomaly']
    y_pred = scored['lof_anomaly']
    accuracy = (y_pred == y_true).mean()
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    return scored

# Apply LOF anomaly detection
# NOTE(review): this passes featured_df (original-scale features) although a
# standardized normalized_df was built earlier - confirm which input is
# intended for this neighbour-based detector.
lof_results = detect_anomalies_lof(featured_df)

# Visualize LOF anomalies
plt.figure(figsize=(12, 6))
plt.scatter(lof_results['amount'], lof_results['lof_score'], 
           c=lof_results['lof_anomaly'], cmap='coolwarm', alpha=0.7)
plt.colorbar(label='Anomaly')
plt.title('Local Outlier Factor (LOF) Anomaly Detection', fontsize=14)
plt.xlabel('Transaction Amount (KES)', fontsize=12)
plt.ylabel('LOF Score', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
def create_autoencoder(input_dim, encoding_dim=10, activation='relu'):
    """Build a small symmetric dense autoencoder.

    Layer widths: input_dim -> 2*encoding_dim -> encoding_dim ->
    2*encoding_dim -> input_dim. Hidden layers use ``activation``; the output
    layer is linear.

    Args:
        input_dim: Number of input features.
        encoding_dim: Width of the bottleneck layer.
        activation: Activation of the hidden layers.

    Returns:
        Tuple ``(autoencoder, encoder)``. The autoencoder is compiled with the
        Adam optimizer and mean-squared-error loss; the encoder shares the
        same input and stops at the bottleneck.
    """
    inputs = Input(shape=(input_dim,))
    wide_units = encoding_dim * 2

    # Encoder: squeeze down to the bottleneck in two steps
    bottleneck = inputs
    for units in (wide_units, encoding_dim):
        bottleneck = Dense(units, activation=activation)(bottleneck)

    # Decoder: mirror the encoder, ending in a linear reconstruction
    expanded = Dense(wide_units, activation=activation)(bottleneck)
    reconstruction = Dense(input_dim, activation='linear')(expanded)

    autoencoder = Model(inputs, reconstruction)
    encoder = Model(inputs, bottleneck)

    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    return autoencoder, encoder

def detect_anomalies_autoencoder(df, features=None, encoding_dim=10, epochs=50, batch_size=32, threshold_percentile=95):
    """Flag rows with a high autoencoder reconstruction error.

    The autoencoder is trained on the rows labelled normal when an
    ``is_anomaly`` column exists, otherwise on every row. All rows are then
    reconstructed, and rows whose mean squared error is above the
    ``threshold_percentile`` percentile of those errors are flagged. A
    training/validation loss plot is shown, and counts plus (when labels
    exist) accuracy / precision / recall / F1 and the confusion matrix are
    printed.

    Args:
        df: Input frame; it is copied, not modified.
        features: Columns to feed the model. Defaults to every numeric column
            except the label, earlier detector outputs and customer_id.
        encoding_dim: Bottleneck width handed to ``create_autoencoder``.
        epochs: Training epochs.
        batch_size: Training batch size.
        threshold_percentile: Percentile of the reconstruction error used as
            the anomaly cut-off.

    Returns:
        Tuple ``(result_df, autoencoder, encoder)``; ``result_df`` gains
        ``autoencoder_error`` and a 0/1 ``autoencoder_anomaly`` column.
    """
    scored = df.copy()
    has_labels = 'is_anomaly' in scored.columns

    if features is None:
        # Labels, earlier detector outputs and the customer key are not features
        excluded = {'is_anomaly', 'zscore_anomaly',
                    'isolation_forest_anomaly', 'isolation_forest_score',
                    'lof_anomaly', 'lof_score', 'customer_id'}
        numeric_names = scored.select_dtypes(include=[np.number]).columns.tolist()
        features = [name for name in numeric_names if name not in excluded]

    all_rows = scored[features].values

    # Learn what "normal" looks like: train on clean rows when labels allow it
    if has_labels:
        train_data = scored[scored['is_anomaly'] == 0][features].values
    else:
        train_data = all_rows

    autoencoder, encoder = create_autoencoder(input_dim=len(features), encoding_dim=encoding_dim)

    # Input doubles as target; 10% of the training rows are held out for validation
    history = autoencoder.fit(
        train_data, train_data,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_split=0.1,
        verbose=0
    )

    # Training curves
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(history.history['loss'], label='Training Loss')
    ax.plot(history.history['val_loss'], label='Validation Loss')
    ax.set_title('Autoencoder Training History', fontsize=14)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

    # Per-row mean squared reconstruction error over every row of the frame
    reconstructions = autoencoder.predict(all_rows)
    mse = np.mean(np.square(all_rows - reconstructions), axis=1)
    scored['autoencoder_error'] = mse

    # Rows above the chosen percentile of the error are called anomalies
    threshold = np.percentile(mse, threshold_percentile)
    scored['autoencoder_anomaly'] = (scored['autoencoder_error'] > threshold).astype(int)

    n_flagged = scored['autoencoder_anomaly'].sum()
    n_rows = len(scored)
    print(f"Autoencoder detected {n_flagged} anomalies ({n_flagged/n_rows:.2%} of data)")
    print(f"Reconstruction error threshold: {threshold:.6f}")

    # Without labels there is nothing to score against
    if not has_labels:
        return scored, autoencoder, encoder

    y_true = scored['is_anomaly']
    y_pred = scored['autoencoder_anomaly']
    accuracy = (y_pred == y_true).mean()
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    return scored, autoencoder, encoder

# Apply Autoencoder anomaly detection
# (fitted on featured_df, i.e. the original-scale features)
autoencoder_results, autoencoder_model, encoder_model = detect_anomalies_autoencoder(featured_df)

# Visualize Autoencoder anomalies
plt.figure(figsize=(12, 6))
plt.scatter(autoencoder_results['amount'], autoencoder_results['autoencoder_error'], 
           c=autoencoder_results['autoencoder_anomaly'], cmap='coolwarm', alpha=0.7)
# The 0.95 quantile mirrors the function's default threshold_percentile=95;
# keep the two in sync if a different percentile is passed above.
plt.axhline(y=autoencoder_results['autoencoder_error'].quantile(0.95), color='r', 
           linestyle='--', alpha=0.5, label='Threshold (95th percentile)')
plt.colorbar(label='Anomaly')
plt.title('Autoencoder Anomaly Detection', fontsize=14)
plt.xlabel('Transaction Amount (KES)', fontsize=12)
plt.ylabel('Reconstruction Error', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
def generate_synthetic_nse_data(n_days=365, save_path=None):
    """Generate synthetic Nairobi Securities Exchange (NSE) data with anomalies.

    For each of seven tickers a random-walk price series with drift and a
    log-normal volume series are simulated over the business days of the
    ``n_days`` calendar days ending 2025-03-01. Two kinds of anomalies are
    injected on disjoint days and flagged with ``is_anomaly = 1``:

    * price jumps/drops of 10-20% (5-10 days per ticker),
    * volume spikes of 3-10x (3-7 days per ticker).

    Anomaly counts are capped by the number of available days. ``high`` and
    ``low`` always bracket both ``open`` and ``close``.

    Args:
        n_days: Length of the simulated window in calendar days.
        save_path: Optional CSV path. Missing parent directories are created;
            a bare filename (no directory part) is written as-is.

    Returns:
        DataFrame with columns date, ticker, open, high, low, close, volume
        and is_anomaly (0/1 integers), all tickers stacked.

    Raises:
        ValueError: The window contains no business day.
    """
    # Fixed seed keeps the output reproducible (this resets NumPy's global RNG)
    np.random.seed(42)

    # Create date range
    end_date = datetime(2025, 3, 1)
    start_date = end_date - timedelta(days=n_days)
    dates = pd.date_range(start=start_date, end=end_date, freq='B')  # Business days only
    n_obs = len(dates)
    if n_obs == 0:
        raise ValueError(f"n_days={n_days} yields no business days to simulate")

    # Kenyan stock tickers - major companies on NSE
    tickers = ['SCOM', 'EQTY', 'KCB', 'COOP', 'SBIC', 'BAT', 'EABL']

    # Generate data for multiple stocks
    all_stock_data = []

    for ticker in tickers:
        # Base parameters
        initial_price = np.random.uniform(50, 500)  # Initial stock price (KES)
        daily_volatility = np.random.uniform(0.01, 0.03)  # Daily volatility
        drift = np.random.uniform(0.0001, 0.0005)  # Small upward drift

        # Generate prices using random walk with drift:
        # price_t = initial_price * prod_{s<=t}(1 + return_s)
        returns = np.random.normal(drift, daily_volatility, n_obs - 1)
        prices = initial_price * np.concatenate(([1.0], np.cumprod(1.0 + returns)))

        # Generate volume
        base_volume = np.random.uniform(100000, 2000000)  # Base volume
        volumes = np.random.lognormal(mean=np.log(base_volume), sigma=0.5, size=n_obs)

        # Introduce anomalies (integer flag, consistent with the transaction data)
        is_anomaly = np.zeros(n_obs, dtype=int)

        # Anomaly type 1: Price jumps (5-10 instances, capped by available days)
        num_price_anomalies = min(np.random.randint(5, 11), n_obs)
        price_anomaly_indices = np.random.choice(n_obs, num_price_anomalies, replace=False)

        for idx in price_anomaly_indices:
            # Create a one-day price jump/drop (±10-20%)
            direction = np.random.choice([-1, 1])
            magnitude = np.random.uniform(0.1, 0.2)
            prices[idx] *= 1 + direction * magnitude
            is_anomaly[idx] = 1

        # Anomaly type 2: Volume spikes (3-7 instances) on days without a price anomaly
        candidate_days = np.setdiff1d(np.arange(n_obs), price_anomaly_indices)
        num_volume_anomalies = min(np.random.randint(3, 8), len(candidate_days))
        if num_volume_anomalies:
            volume_anomaly_indices = np.random.choice(candidate_days, num_volume_anomalies, replace=False)
            # Create volume spikes (3-10x normal)
            volumes[volume_anomaly_indices] *= np.random.uniform(3, 10, size=num_volume_anomalies)
            is_anomaly[volume_anomaly_indices] = 1

        # Derive close first so high/low can bracket BOTH open and close;
        # deriving them from open alone lets close land outside [low, high].
        close = prices * (1 + np.random.uniform(-0.01, 0.01, n_obs))
        high = np.maximum(prices, close) * (1 + np.random.uniform(0, 0.02, n_obs))
        low = np.minimum(prices, close) * (1 - np.random.uniform(0, 0.02, n_obs))

        # Create dataframe for this stock
        stock_data = pd.DataFrame({
            'date': dates,
            'ticker': ticker,
            'open': prices,
            'high': high,
            'low': low,
            'close': close,
            'volume': volumes,
            'is_anomaly': is_anomaly
        })

        all_stock_data.append(stock_data)

    # Combine all stocks into one dataframe
    nse_data = pd.concat(all_stock_data, ignore_index=True)

    # Save to file if path provided
    if save_path:
        parent_dir = os.path.dirname(save_path)
        if parent_dir:  # os.makedirs('') raises, so skip it for bare filenames
            os.makedirs(parent_dir, exist_ok=True)
        nse_data.to_csv(save_path, index=False)
        print(f"Synthetic NSE data saved to {save_path}")

    return nse_data

# Generate or load NSE data
# An existing CSV at NSE_DATA_PATH is reused; otherwise the data is generated
# and written to that path so later runs load the same file.
if os.path.exists(NSE_DATA_PATH):
    nse_data = pd.read_csv(NSE_DATA_PATH)
    print(f"Loaded NSE data from {NSE_DATA_PATH}")
else:
    nse_data = generate_synthetic_nse_data(save_path=NSE_DATA_PATH)
    print(f"Generated synthetic NSE data")

# Convert date to datetime
# (needed after read_csv, which is called without date parsing here)
nse_data['date'] = pd.to_datetime(nse_data['date'])

# Display a sample of the data
nse_data.head()

In [None]:
def plot_stock_data(df, ticker='SCOM'):
    """Plot one ticker's close price and volume with anomalous days marked.

    Args:
        df: Stock data with date, ticker, close, volume and is_anomaly columns.
        ticker: Symbol to plot.

    Returns:
        The matplotlib Figure with the price panel above the volume panel
        (shared x-axis).
    """
    # Rows for this ticker in chronological order, plus the flagged subset
    stock_df = df[df['ticker'] == ticker].sort_values('date')
    flagged = stock_df[stock_df['is_anomaly'] == 1]

    fig, (price_ax, volume_ax) = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

    # Top panel: close price as a line, anomalies as red markers
    price_ax.plot(stock_df['date'], stock_df['close'], label='Close Price', color='blue')
    price_ax.scatter(flagged['date'], flagged['close'], color='red', s=50, label='Anomaly')
    price_ax.set_title(f'{ticker} Stock Price (KES)', fontsize=16)
    price_ax.set_ylabel('Price (KES)', fontsize=14)
    price_ax.legend()
    price_ax.grid(True, alpha=0.3)

    # Bottom panel: volume bars, anomalies as red markers
    volume_ax.bar(stock_df['date'], stock_df['volume'], color='blue', alpha=0.5)
    volume_ax.scatter(flagged['date'], flagged['volume'], color='red', s=50, label='Anomaly')
    volume_ax.set_title(f'{ticker} Trading Volume', fontsize=16)
    volume_ax.set_xlabel('Date', fontsize=14)
    volume_ax.set_ylabel('Volume', fontsize=14)
    volume_ax.legend()
    volume_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

# Plot example stock data
# ('SCOM' and 'EQTY' are two of the tickers produced by the synthetic generator)
safcom_fig = plot_stock_data(nse_data, 'SCOM')
plt.show()

equity_fig = plot_stock_data(nse_data, 'EQTY')
plt.show()

In [None]:
def detect_anomalies_arima(stock_df, price_col='close', confidence=0.95):
    """Detect anomalies in stock prices using an ARIMA model.

    A small grid of ARIMA(p, d, q) orders (p, q in 0..2, d in 0..1) is fitted
    and the one with the lowest AIC is kept. An observation is flagged when it
    lies outside ``fitted value +/- z * sigma``, where ``sigma`` is the
    standard deviation of the in-sample residuals and ``z`` the two-sided
    normal quantile for ``confidence``.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Rows for a single instrument with a 'date' column and ``price_col``.
        The input is not modified.
    price_col : str
        Column holding the price series.
    confidence : float
        Two-sided confidence level used for the band.

    Returns
    -------
    (pandas.DataFrame, fitted model)
        A date-sorted copy of the input with 'arima_prediction',
        'arima_residual', 'arima_lower_bound', 'arima_upper_bound' and
        'arima_anomaly' columns, plus the selected fitted model.
    """
    # Kept as a function-level import, as in the original cell
    from scipy.stats import norm

    # Work on a sorted copy so the caller's frame is left untouched
    df_result = stock_df.sort_values('date').copy()
    ts_data = df_result[price_col]

    # Fit ARIMA model - find optimal order using AIC
    best_aic = float('inf')
    best_order = None
    best_model = None

    # Try different ARIMA parameters (simplified version for demonstration)
    for p in range(0, 3):
        for d in range(0, 2):
            for q in range(0, 3):
                try:
                    model_fit = ARIMA(ts_data, order=(p, d, q)).fit()
                except Exception:
                    # A failed fit only disqualifies this order. Catching
                    # Exception (not a bare except) lets Ctrl-C through.
                    continue

                if model_fit.aic < best_aic:
                    best_aic = model_fit.aic
                    best_order = (p, d, q)
                    best_model = model_fit

    if best_model is None:
        print("Could not fit ARIMA model. Using default parameters.")
        best_order = (1, 1, 1)
        best_model = ARIMA(ts_data, order=best_order).fit()
        best_aic = best_model.aic  # report the real AIC instead of inf

    print(f"Best ARIMA order: {best_order}, AIC: {best_aic:.2f}")

    # Get predictions and residuals
    predictions = best_model.fittedvalues
    resid = best_model.resid

    # With d > 0 the first d fitted values are not real one-step forecasts:
    # there is no earlier observation to difference against, so the residual
    # there is roughly the full price level. Leave those rows out of sigma
    # and never flag them.
    burn_in = int(best_order[1])
    resid_values = np.asarray(resid, dtype=float)
    if len(resid_values) > burn_in:
        resid_values = resid_values[burn_in:]
    sigma = np.std(resid_values)

    # Calculate confidence intervals
    z_score = norm.ppf(1 - (1 - confidence) / 2)
    lower_bound = predictions - z_score * sigma
    upper_bound = predictions + z_score * sigma

    # Detect anomalies
    df_result['arima_prediction'] = predictions
    df_result['arima_residual'] = resid
    df_result['arima_lower_bound'] = lower_bound
    df_result['arima_upper_bound'] = upper_bound
    outside_band = ((df_result[price_col] < df_result['arima_lower_bound']) |
                    (df_result[price_col] > df_result['arima_upper_bound']))
    flags = outside_band.astype(int)
    if burn_in:
        flags.iloc[:burn_in] = 0
    df_result['arima_anomaly'] = flags

    # Count anomalies detected
    anomaly_count = df_result['arima_anomaly'].sum()
    total_count = len(df_result)

    print(f"ARIMA model detected {anomaly_count} anomalies ({anomaly_count/total_count:.2%} of data)")

    # If ground truth is available, calculate accuracy
    if 'is_anomaly' in df_result.columns:
        accuracy = (df_result['arima_anomaly'] == df_result['is_anomaly']).mean()
        precision, recall, f1, _ = precision_recall_fscore_support(
            df_result['is_anomaly'],
            df_result['arima_anomaly'],
            average='binary'
        )

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        # Confusion matrix
        cm = confusion_matrix(df_result['is_anomaly'], df_result['arima_anomaly'])
        print("Confusion Matrix:")
        print(cm)

    return df_result, best_model

# Run ARIMA anomaly detection on SCOM stock
scom_data = nse_data.loc[nse_data['ticker'] == 'SCOM'].sort_values('date')
scom_arima_results, scom_arima_model = detect_anomalies_arima(scom_data)

# Visualize ARIMA anomaly detection results: actual vs fitted price, the
# confidence band, model-flagged points and the labelled ground truth.
plt.figure(figsize=(14, 7))
plt.plot(
    scom_arima_results['date'],
    scom_arima_results['close'],
    label='Actual Price',
    color='blue',
)
plt.plot(
    scom_arima_results['date'],
    scom_arima_results['arima_prediction'],
    label='ARIMA Prediction',
    color='green',
    alpha=0.7,
)
plt.fill_between(
    scom_arima_results['date'],
    scom_arima_results['arima_lower_bound'],
    scom_arima_results['arima_upper_bound'],
    color='green',
    alpha=0.1,
    label='Confidence Interval',
)
plt.scatter(
    scom_arima_results.loc[scom_arima_results['arima_anomaly'] == 1, 'date'],
    scom_arima_results.loc[scom_arima_results['arima_anomaly'] == 1, 'close'],
    color='red',
    s=50,
    label='ARIMA Detected Anomaly',
)
plt.scatter(
    scom_arima_results.loc[scom_arima_results['is_anomaly'] == 1, 'date'],
    scom_arima_results.loc[scom_arima_results['is_anomaly'] == 1, 'close'],
    marker='X',
    color='purple',
    s=100,
    label='True Anomaly',
)
plt.title('ARIMA Anomaly Detection - SCOM Stock Price', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Price (KES)', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
def compare_methods(df, methods_cols, ground_truth='is_anomaly'):
    """Compare different anomaly detection methods against a ground-truth column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one binary flag column per method and the
        ``ground_truth`` column.
    methods_cols : iterable of str
        Flag column names to evaluate. Names not present in ``df`` are skipped.
    ground_truth : str
        Name of the binary label column.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated method with the columns 'Method', 'Accuracy',
        'Precision', 'Recall', 'F1 Score', 'Anomalies Detected' and
        'Anomaly %'. The columns are present even when no method could be
        evaluated, so downstream code can rely on them.
    """
    result_columns = [
        'Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
        'Anomalies Detected', 'Anomaly %',
    ]
    results = []
    total_count = len(df)

    for method in methods_cols:
        if method not in df.columns:
            continue

        # Calculate metrics
        accuracy = (df[method] == df[ground_truth]).mean()
        precision, recall, f1, _ = precision_recall_fscore_support(
            df[ground_truth],
            df[method],
            average='binary'
        )

        # Count detected anomalies
        anomaly_count = df[method].sum()

        results.append({
            'Method': method.replace('_anomaly', ''),
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'Anomalies Detected': anomaly_count,
            'Anomaly %': anomaly_count / total_count
        })

    # Passing the column list keeps the schema stable for an empty result
    return pd.DataFrame(results, columns=result_columns)

def plot_method_comparison(results_df):
    """Draw a grouped bar chart of the quality metrics for each method.

    ``results_df`` needs a 'Method' column plus 'Accuracy', 'Precision',
    'Recall' and 'F1 Score'. Each method gets one group of four bars. The
    current pyplot figure is returned.
    """
    score_columns = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
    bar_width = 0.2

    plt.figure(figsize=(12, 8))

    # One tick per method; the four bars sit at -1.5, -0.5, +0.5 and +1.5
    # bar widths around it.
    tick_positions = np.arange(len(results_df['Method']))
    for slot in range(len(score_columns)):
        column = score_columns[slot]
        plt.bar(
            tick_positions + (slot - 1.5) * bar_width,
            results_df[column],
            bar_width,
            label=column,
        )

    # Axis labels, title and cosmetics
    plt.xlabel('Method', fontsize=14)
    plt.ylabel('Score', fontsize=14)
    plt.title('Comparison of Anomaly Detection Methods', fontsize=16)
    plt.xticks(tick_positions, results_df['Method'])
    plt.ylim(0, 1.0)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    return plt.gcf()

# Flag columns produced by the transaction-level detectors
transaction_methods = [
    f'{detector}_anomaly'
    for detector in ('zscore', 'isolation_forest', 'lof', 'autoencoder')
]

# Score every detector against the ground truth and show the table
transaction_comparison = compare_methods(featured_df, transaction_methods)
print("Transaction Data Anomaly Detection Method Comparison:")
display(transaction_comparison)

# Same numbers as a grouped bar chart
txn_comparison_fig = plot_method_comparison(transaction_comparison)
plt.show()

In [None]:
def create_ensemble_detector(df, methods, voting_threshold=2):
    """Combine several binary anomaly flags into a single vote-based flag.

    The flag columns named in ``methods`` that exist in ``df`` are summed row
    by row into 'ensemble_votes'. A row is marked in 'ensemble_anomaly' when
    its vote count is at least ``voting_threshold``. A summary is printed
    and, when an 'is_anomaly' column is present, so are the quality metrics.

    Returns a copy of ``df`` with the two new columns; raises ValueError when
    none of the requested columns exists.
    """
    scored = df.copy()

    # Only columns that are actually present can vote
    voters = [name for name in methods if name in scored.columns]
    if not voters:
        raise ValueError("No valid anomaly detection methods provided")

    scored['ensemble_votes'] = scored[voters].sum(axis=1)
    scored['ensemble_anomaly'] = scored['ensemble_votes'].ge(voting_threshold).astype(int)

    # Summary of what the vote produced
    flagged = scored['ensemble_anomaly'].sum()
    share = flagged / len(scored)
    short_names = (name.replace('_anomaly', '') for name in voters)

    print(f"Ensemble method detected {flagged} anomalies ({share:.2%} of data)")
    print(f"Voting threshold: {voting_threshold} out of {len(voters)} methods")
    print(f"Methods used: {', '.join(short_names)}")

    # Without labels there is nothing to score against
    if 'is_anomaly' not in scored.columns:
        return scored

    truth = scored['is_anomaly']
    predicted = scored['ensemble_anomaly']
    accuracy = (predicted == truth).mean()
    precision, recall, f1, _ = precision_recall_fscore_support(
        truth, predicted, average='binary'
    )

    for label, value in (
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1 Score', f1),
    ):
        print(f"{label}: {value:.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(truth, predicted))

    return scored

# Create ensemble detector for transaction data (same detector columns as the
# per-method comparison)
ensemble_results = create_ensemble_detector(
    df=featured_df,
    methods=transaction_methods,
    voting_threshold=2,
)

# Add ensemble to comparison
ensemble_comparison = compare_methods(
    ensemble_results,
    [*transaction_methods, 'ensemble_anomaly'],
)

print("\nTransaction Data Anomaly Detection Method Comparison (with Ensemble):")
display(ensemble_comparison)

# Plot comparison with ensemble
ensemble_comparison_fig = plot_method_comparison(ensemble_comparison)
plt.show()

In [None]:
import os
import numpy as np
import pandas as pd
import joblib
import json
from datetime import datetime
from tensorflow.keras.models import load_model
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from sklearn.preprocessing import StandardScaler

# Define paths
# `__file__` only exists when this code runs as a module or script. In a
# notebook kernel it is undefined, so fall back to the working directory there
# and keep the same "parent of the code directory" layout.
try:
    _service_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _service_dir = os.getcwd()
BASE_DIR = os.path.dirname(_service_dir)
MODELS_DIR = os.path.join(BASE_DIR, 'models')

class AnomalyDetectionService:
    """Service for detecting anomalies in financial data.

    Bundles transaction-level detectors (z-score, Isolation Forest,
    autoencoder), market-price detectors (ARIMA, Prophet) and simple
    behavioural checks. Trained artefacts are looked up in MODELS_DIR when the
    service is constructed; a detector whose artefact is missing is skipped.
    """

    # Numeric columns that are identifiers or outputs of this service and must
    # never be handed to a model as features.
    _NON_FEATURE_COLUMNS = ('customer_id', 'ensemble_votes', 'autoencoder_error')

    def __init__(self):
        """Initialize the service and load models"""
        self.models = {}
        self.load_models()

    def load_models(self):
        """Load every trained artefact found in MODELS_DIR into ``self.models``.

        Missing files are skipped silently, so ``self.models`` only contains
        the keys whose artefact exists.
        """
        # Load Isolation Forest model
        # NOTE: joblib.load unpickles arbitrary objects; only point MODELS_DIR
        # at trusted files.
        iforest_path = os.path.join(MODELS_DIR, 'isolation_forest_model.joblib')
        if os.path.exists(iforest_path):
            self.models['isolation_forest'] = joblib.load(iforest_path)

        # Load Autoencoder model
        autoencoder_path = os.path.join(MODELS_DIR, 'autoencoder_model.h5')
        if os.path.exists(autoencoder_path):
            self.models['autoencoder'] = load_model(autoencoder_path)

        # Load ARIMA parameters
        arima_params_path = os.path.join(MODELS_DIR, 'arima_model_params.json')
        if os.path.exists(arima_params_path):
            with open(arima_params_path, 'r') as f:
                self.models['arima_params'] = json.load(f)

        # Load Prophet model. Prophet has no `deserialize_model`; models are
        # restored with prophet.serialize.model_from_json, which takes the
        # JSON text. Presumably the file was written with model_to_json --
        # TODO confirm against the training code.
        prophet_path = os.path.join(MODELS_DIR, 'prophet_model.json')
        if os.path.exists(prophet_path):
            from prophet.serialize import model_from_json
            with open(prophet_path, 'r') as f:
                self.models['prophet'] = model_from_json(f.read())

    @staticmethod
    def _to_frame(records):
        """Build a fresh DataFrame from a dict (one row), a DataFrame or a list of records."""
        if isinstance(records, dict):
            return pd.DataFrame([records])
        if isinstance(records, pd.DataFrame):
            # Copy so later column assignments never touch the caller's frame
            return records.copy()
        return pd.DataFrame(records)

    def _feature_matrix(self, df, model):
        """Select the columns of ``df`` to feed to ``model``.

        When the model exposes ``feature_names_in_`` (scikit-learn estimators
        fitted on a DataFrame), exactly those columns are used, in that order.
        Columns the input lacks are filled with 0, which is right for one-hot
        columns whose category does not occur in this batch (assumed to be the
        usual reason a column is missing). Otherwise all numeric columns are
        used except identifiers, labels and this service's own outputs.
        """
        expected = getattr(model, 'feature_names_in_', None)
        if expected is not None:
            return df.reindex(columns=list(expected), fill_value=0)

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        keep = [
            col for col in numeric_cols
            if col not in self._NON_FEATURE_COLUMNS and not str(col).endswith('_anomaly')
        ]
        return df[keep]

    def preprocess_transaction(self, transaction_data):
        """Preprocess a single transaction or batch of transactions.

        Accepts a dict (one transaction), a list of dicts or a DataFrame and
        returns a new DataFrame. If a 'date' column exists it is parsed and
        'day_of_week', 'day_of_month', 'month' and 'year' are derived from it.
        Object-typed 'transaction_type' and 'location' columns are one-hot
        encoded.
        """
        df = self._to_frame(transaction_data)

        if 'date' in df.columns:
            # Ensure date is datetime
            if not pd.api.types.is_datetime64_any_dtype(df['date']):
                df['date'] = pd.to_datetime(df['date'])

            # Extract date features
            df['day_of_week'] = df['date'].dt.dayofweek
            df['day_of_month'] = df['date'].dt.day
            df['month'] = df['date'].dt.month
            df['year'] = df['date'].dt.year

        # One-hot encode categorical features if present
        categorical = [
            col for col in ('transaction_type', 'location')
            if col in df.columns and df[col].dtype == 'object'
        ]
        if categorical:
            df = pd.get_dummies(df, columns=categorical, drop_first=False)

        return df

    def detect_transaction_anomalies(self, transaction_data, methods=None, zscore_threshold=3.0):
        """Detect anomalies in transaction data.

        Parameters
        ----------
        transaction_data : dict, list of dict or pandas.DataFrame
            One transaction or a batch.
        methods : list of str, optional
            Any of 'zscore', 'isolation_forest', 'autoencoder'. Defaults to
            ['zscore', 'isolation_forest']. Model-based methods are skipped
            when the corresponding model was not loaded.
        zscore_threshold : float
            Absolute z-score above which an amount is flagged.

        Returns
        -------
        dict
            'is_anomaly' and 'anomaly_score' (scalars for one transaction,
            lists for a batch), 'methods_used', and 'details' with the
            per-method flags.
        """
        # Set default methods if not specified
        if methods is None:
            methods = ['zscore', 'isolation_forest']

        df = self.preprocess_transaction(transaction_data)

        # Flag columns produced in THIS call. Only these vote in the ensemble,
        # so an input column such as a ground-truth 'is_anomaly' label is
        # never mistaken for a detector.
        computed_cols = []

        # Z-score of the amount, relative to the submitted batch only. A
        # single transaction therefore always scores 0.
        if 'zscore' in methods and 'amount' in df.columns:
            if 'customer_id' in df.columns and len(df['customer_id'].unique()) > 1:
                # Per-customer statistics. transform() keeps them aligned with
                # the rows without adding helper columns that would leak into
                # the model features.
                per_customer = df.groupby('customer_id')['amount']
                mean = per_customer.transform('mean')
                std = per_customer.transform('std').fillna(0)
            else:
                mean = df['amount'].mean()
                std = df['amount'].std()
                if pd.isna(std):
                    # std is undefined for fewer than two rows
                    std = 0.0

            # The small epsilon avoids division by zero for constant amounts
            df['amount_zscore'] = (df['amount'] - mean) / (std + 1e-6)
            df['zscore_anomaly'] = (df['amount_zscore'].abs() > zscore_threshold).astype(int)
            computed_cols.append('zscore_anomaly')

        # Use Isolation Forest if available
        if 'isolation_forest' in methods and 'isolation_forest' in self.models:
            forest = self.models['isolation_forest']
            features = self._feature_matrix(df, forest)

            if features.shape[1] > 0:
                try:
                    # The model reports outliers as -1
                    df['isolation_forest_anomaly'] = (forest.predict(features) == -1).astype(int)
                except Exception as e:
                    print(f"Error in isolation forest prediction: {e}")
                    df['isolation_forest_anomaly'] = 0
                computed_cols.append('isolation_forest_anomaly')

        # Use Autoencoder if available
        if 'autoencoder' in methods and 'autoencoder' in self.models:
            autoencoder = self.models['autoencoder']
            features = self._feature_matrix(df, autoencoder)

            if features.shape[1] > 0:
                try:
                    reconstructions = autoencoder.predict(features)
                    mse = np.mean(np.power(features.values - reconstructions, 2), axis=1)
                    df['autoencoder_error'] = mse
                    # NOTE(review): the cut-off is the 95th percentile of this
                    # batch's own errors, so a single transaction can never
                    # exceed it and about 5% of any batch always will. A
                    # threshold fixed at training time would be more meaningful.
                    threshold = np.percentile(mse, 95)
                    df['autoencoder_anomaly'] = (df['autoencoder_error'] > threshold).astype(int)
                except Exception as e:
                    print(f"Error in autoencoder prediction: {e}")
                    df['autoencoder_anomaly'] = 0
                computed_cols.append('autoencoder_anomaly')

        # Create ensemble: flag when at least half of the detectors that ran
        # agree (rounded down, minimum one vote)
        if computed_cols:
            df['ensemble_votes'] = df[computed_cols].sum(axis=1)
            votes_needed = max(1, len(computed_cols) // 2)
            df['ensemble_anomaly'] = (df['ensemble_votes'] >= votes_needed).astype(int)
        else:
            df['ensemble_anomaly'] = 0
            df['ensemble_votes'] = 0

        # Prepare result: scalars for a single transaction, lists for a batch
        single = len(df) == 1
        result = {
            'is_anomaly': bool(df['ensemble_anomaly'].iloc[0]) if single else df['ensemble_anomaly'].tolist(),
            'anomaly_score': float(df['ensemble_votes'].iloc[0]) if single else df['ensemble_votes'].tolist(),
            'methods_used': methods,
            'details': {}
        }

        # Add method-specific details
        for method in methods:
            method_col = f"{method}_anomaly"
            if method_col in df.columns:
                result['details'][method] = bool(df[method_col].iloc[0]) if single else df[method_col].tolist()

        return result

    @staticmethod
    def _market_result(df, flag_col, prediction_col, lower_col, upper_col, price_col, method):
        """Assemble the result dict shared by the ARIMA and Prophet branches."""
        has_rows = len(df) >= 1
        flagged = df[flag_col] == 1
        return {
            'is_anomaly': bool(df[flag_col].iloc[-1]) if has_rows else False,
            'anomaly_dates': df.loc[flagged, 'date'].dt.strftime('%Y-%m-%d').tolist(),
            'anomaly_prices': df.loc[flagged, price_col].tolist(),
            'latest_prediction': float(df[prediction_col].iloc[-1]) if has_rows else None,
            'latest_lower_bound': float(df[lower_col].iloc[-1]) if has_rows else None,
            'latest_upper_bound': float(df[upper_col].iloc[-1]) if has_rows else None,
            'method': method
        }

    def detect_market_anomalies(self, market_data, price_col='close', method='arima', confidence=0.95):
        """Detect anomalies in market price data.

        Parameters
        ----------
        market_data : dict, list of dict or pandas.DataFrame
            Price observations with a 'date' column and ``price_col``.
        price_col : str
            Column holding the price.
        method : str
            'arima' or 'prophet'.
        confidence : float
            Width of the acceptance band. For Prophet it is only applied when
            a new model is fitted; a pre-loaded model keeps its own interval
            width.

        Returns
        -------
        dict
            On success: 'is_anomaly' (flag of the latest observation),
            'anomaly_dates', 'anomaly_prices', the latest prediction and
            bounds, and 'method'. On failure: 'is_anomaly' False plus an
            'error' message.
        """
        df = self._to_frame(market_data)

        # Both methods need dates and prices. Report a missing column (or
        # empty input) the same way as an unknown method instead of raising.
        if method not in ('arima', 'prophet') or price_col not in df.columns or 'date' not in df.columns:
            return {'is_anomaly': False, 'error': "Invalid method or missing columns", 'method': method}

        # Ensure date is datetime
        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            df['date'] = pd.to_datetime(df['date'])

        # Sort by date; a fresh positional index keeps the model output
        # aligned with the frame row for row.
        df = df.sort_values('date').reset_index(drop=True)

        if method == 'arima':
            try:
                # Get ARIMA parameters if available
                if 'arima_params' in self.models:
                    order = tuple(self.models['arima_params']['order'])
                else:
                    order = (1, 1, 1)  # Default

                model_fit = ARIMA(df[price_col], order=order).fit()
                predictions = model_fit.fittedvalues

                # With d > 0 the first d fitted values are not real forecasts:
                # the residual there is roughly the price level. Keep them out
                # of sigma and never flag them.
                burn_in = int(order[1])
                resid_values = np.asarray(model_fit.resid, dtype=float)
                if len(resid_values) > burn_in:
                    resid_values = resid_values[burn_in:]
                sigma = np.std(resid_values)

                # Calculate confidence intervals
                from scipy.stats import norm
                z_score = norm.ppf(1 - (1 - confidence) / 2)

                df['arima_prediction'] = predictions
                df['arima_lower_bound'] = predictions - z_score * sigma
                df['arima_upper_bound'] = predictions + z_score * sigma

                flags = ((df[price_col] < df['arima_lower_bound']) |
                         (df[price_col] > df['arima_upper_bound'])).astype(int)
                if burn_in:
                    flags.iloc[:burn_in] = 0
                df['arima_anomaly'] = flags

                return self._market_result(
                    df, 'arima_anomaly', 'arima_prediction',
                    'arima_lower_bound', 'arima_upper_bound', price_col, 'arima'
                )
            except Exception as e:
                print(f"Error in ARIMA anomaly detection: {e}")
                return {'is_anomaly': False, 'error': str(e), 'method': 'arima'}

        # Prophet method
        try:
            # Prophet expects the columns 'ds' (date) and 'y' (value)
            prophet_df = df[['date', price_col]].rename(columns={'date': 'ds', price_col: 'y'})

            # Use loaded model or create new one
            if 'prophet' in self.models:
                model = self.models['prophet']
            else:
                model = Prophet(interval_width=confidence, daily_seasonality=True)
                model.fit(prophet_df)

            forecast = model.predict(prophet_df[['ds']])

            # Merge predictions back to original data
            forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
            df = pd.merge(df, forecast[forecast_columns], left_on='date', right_on='ds').drop(columns='ds')

            # Detect anomalies
            df['prophet_anomaly'] = ((df[price_col] < df['yhat_lower']) |
                                     (df[price_col] > df['yhat_upper'])).astype(int)

            return self._market_result(
                df, 'prophet_anomaly', 'yhat', 'yhat_lower', 'yhat_upper', price_col, 'prophet'
            )
        except Exception as e:
            print(f"Error in Prophet anomaly detection: {e}")
            return {'is_anomaly': False, 'error': str(e), 'method': 'prophet'}

    def check_unusual_patterns(self, df, column, window=5, threshold=2.0):
        """Detect unusual patterns in time series data using rolling statistics.

        Each value is compared with the mean and standard deviation of the
        ``window`` values BEFORE it. The point itself is excluded: a sample
        inside its own window of n points can never score above
        (n - 1) / sqrt(n), which is about 1.79 for n = 5, so it could never
        reach the default threshold. The first ``window`` rows have no full
        history and fall back to the statistics of the whole column.

        Returns a pandas Series of 0/1 flags indexed like ``df``. With fewer
        than ``2 * window`` rows nothing is flagged.
        """
        if len(df) < window * 2:
            return pd.Series(0, index=df.index, dtype=int)

        values = df[column]

        # Rolling statistics of the preceding window (shifted by one row)
        history = values.rolling(window=window)
        rolling_mean = history.mean().shift(1).fillna(values.mean())
        rolling_std = history.std().shift(1).fillna(values.std())

        # Detect anomalies using z-score; epsilon guards a flat history
        z_scores = (values - rolling_mean) / (rolling_std + 1e-6)
        return (z_scores.abs() > threshold).astype(int)

    def analyze_user_transaction_history(self, transactions, user_id, recent_window=30):
        """Analyze a user's transaction history to detect changes in behavior.

        Parameters
        ----------
        transactions : list of dict or pandas.DataFrame
            Transaction records. A DataFrame is copied, never modified.
        user_id : any or None
            When given and a 'user_id' column exists, only that user's rows
            are analysed.
        recent_window : int
            Number of most recent transactions compared with the history.

        Returns
        -------
        dict
            ``{"status": "insufficient_data", ...}`` when fewer than five
            transactions are available. Otherwise any of
            'transaction_frequency', 'transaction_amounts' and
            'location_patterns' (each only when its columns exist and at
            least ``recent_window`` rows are available) plus
            'overall_status'. All values are plain Python types.
        """
        insufficient = {"status": "insufficient_data", "message": "Not enough transaction history for analysis"}

        if transactions is None or len(transactions) < 5:
            return insufficient

        df = self._to_frame(transactions)

        # Filter for specific user if provided
        if user_id is not None and 'user_id' in df.columns:
            df = df[df['user_id'] == user_id].copy()
            if len(df) < 5:
                return insufficient

        has_dates = 'date' in df.columns
        if has_dates:
            # Ensure date is datetime, then order chronologically
            if not pd.api.types.is_datetime64_any_dtype(df['date']):
                df['date'] = pd.to_datetime(df['date'])
            df = df.sort_values('date')

        enough_history = len(df) >= recent_window
        results = {}

        # 1. Transaction frequency: transactions per day, recent vs. older
        if has_dates and enough_history:
            recent_df = df.iloc[-recent_window:]
            older_df = df.iloc[:-recent_window]

            if len(older_df) > 0:
                # Treat any span shorter than a day as one day so that
                # same-day activity cannot divide by zero
                older_days = max(1, (older_df['date'].max() - older_df['date'].min()).days)
                recent_days = max(1, (recent_df['date'].max() - recent_df['date'].min()).days)
                older_rate = len(older_df) / older_days
                recent_rate = len(recent_df) / recent_days

                # Flag when the rate at least doubled or halved
                rate_change = recent_rate / (older_rate + 1e-6)
                results['transaction_frequency'] = {
                    'is_unusual': bool(rate_change > 2.0 or rate_change < 0.5),
                    'recent_rate': float(recent_rate),
                    'historical_rate': float(older_rate),
                    'change_factor': float(rate_change)
                }

        # 2. Transaction amounts: recent values more than 2 std from the mean
        if 'amount' in df.columns and enough_history:
            avg_amount = df['amount'].mean()
            std_amount = df['amount'].std()

            recent_amounts = df['amount'].iloc[-recent_window:]
            unusual_amounts = int(((recent_amounts - avg_amount).abs() > 2 * std_amount).sum())

            results['transaction_amounts'] = {
                # Unusual if >20% of recent transactions are outliers
                'is_unusual': bool(unusual_amounts > recent_window * 0.2),
                'unusual_count': unusual_amounts,
                'average_amount': float(avg_amount),
                'std_amount': float(std_amount)
            }

        # 3. Location patterns: recent locations that were seen only once
        if 'location' in df.columns:
            location_counts = df['location'].value_counts()
            common_locations = set(location_counts[location_counts > 1].index)

            if enough_history and len(common_locations) > 0:
                recent_locations = set(df['location'].iloc[-recent_window:])
                new_locations = recent_locations - common_locations

                results['location_patterns'] = {
                    'is_unusual': len(new_locations) > 0,
                    'new_locations': sorted(new_locations, key=str),
                    'common_locations': sorted(common_locations, key=str)
                }

        # Determine overall status from the markers computed above
        unusual_markers = sum(1 for marker in results.values() if marker.get('is_unusual', False))
        total_markers = len(results)
        results['overall_status'] = {
            'is_unusual': unusual_markers > 0,
            'unusual_markers': unusual_markers,
            'total_markers': total_markers,
            'confidence': unusual_markers / max(1, total_markers)
        }

        return results

# Instantiate service for usage
# Module-level instance used by the API helper functions defined after it.
# Constructing it calls load_models() once, at this point.
anomaly_service = AnomalyDetectionService()

def detect_transaction_anomaly(transaction):
    """Run the shared service's transaction detectors on ``transaction``.

    Delegates to ``anomaly_service.detect_transaction_anomalies`` with its
    default methods and threshold, and returns that result unchanged.
    """
    verdict = anomaly_service.detect_transaction_anomalies(transaction)
    return verdict

def detect_market_anomaly(market_data, ticker=None):
    """API function to detect anomalies in market data.

    Parameters
    ----------
    market_data : dict, list of dict or pandas.DataFrame
        Price records, in any form the service accepts.
    ticker : str, optional
        When given, only records whose 'ticker' equals this value are
        analysed. Records without a ticker are dropped by the filter.

    Returns
    -------
    dict
        The result of ``anomaly_service.detect_market_anomalies``.
    """
    if ticker:
        if isinstance(market_data, pd.DataFrame):
            # Row filter for frames. A frame without a 'ticker' column matches
            # nothing, like records that lack the key.
            if 'ticker' in market_data.columns:
                market_data = market_data[market_data['ticker'] == ticker]
            else:
                market_data = market_data.iloc[0:0]
        else:
            if isinstance(market_data, dict):
                # A single record: wrap it so the filter sees one item
                market_data = [market_data]
            market_data = [d for d in market_data if d.get('ticker') == ticker]
    return anomaly_service.detect_market_anomalies(market_data)

def analyze_user_behavior(transactions, user_id=None):
    """Run the shared service's behaviour analysis on a transaction history.

    Delegates to ``anomaly_service.analyze_user_transaction_history`` with the
    given ``user_id`` and returns that result unchanged.
    """
    behaviour_report = anomaly_service.analyze_user_transaction_history(transactions, user_id)
    return behaviour_report

def _any_flag(value):
    """Collapse a scalar flag or a per-row list of flags into one boolean."""
    if isinstance(value, (list, tuple)):
        return any(value)
    return bool(value)


def get_anomaly_description(anomaly_result):
    """Generate human-readable description for an anomaly.

    Parameters
    ----------
    anomaly_result : dict
        A result dict from the transaction or market detectors. Flags may be
        scalars (single record) or lists (batch); a list counts as set when
        any element is truthy. A missing 'is_anomaly' key is treated as "no
        anomaly".

    Returns
    -------
    str
        "No anomalies detected." when nothing is flagged, otherwise the
        matching descriptions joined by spaces.
    """
    if not _any_flag(anomaly_result.get('is_anomaly', False)):
        return "No anomalies detected."

    descriptions = []

    # Transaction anomalies
    if 'details' in anomaly_result:
        details = anomaly_result['details']
        if _any_flag(details.get('zscore', False)):
            descriptions.append("Unusual transaction amount (significantly different from historical patterns)")

        if _any_flag(details.get('isolation_forest', False)):
            descriptions.append("Transaction with unusual combination of features")

        if _any_flag(details.get('autoencoder', False)):
            descriptions.append("Transaction pattern differs from normal behavior")

    # Market anomalies: report the most recent flagged date
    if anomaly_result.get('method') in ('arima', 'prophet'):
        anomaly_dates = anomaly_result.get('anomaly_dates', [])
        if anomaly_dates:
            latest_anomaly = anomaly_dates[-1]
            descriptions.append(f"Unusual price movement detected on {latest_anomaly}")

    if not descriptions:
        descriptions.append("Potential anomaly detected (unspecified reason)")

    return " ".join(descriptions)

def generate_user_advice(anomaly_result):
    """Build user-facing advice text for a detection result.

    Args:
        anomaly_result: dict-like detection result. ``is_anomaly`` gates the
            whole function (missing/falsy yields the "appears normal"
            message). When it is truthy, advice is collected from:

            * ``details`` - ``zscore``, ``isolation_forest`` and
              ``autoencoder`` flags of a transaction result;
            * ``method`` - ``'arima'`` or ``'prophet'`` market results;
            * ``overall_status`` plus the ``transaction_frequency``,
              ``transaction_amounts`` and ``location_patterns`` sections,
              each consulted through its ``is_unusual`` flag.

    Returns:
        str: the collected sentences joined by single spaces. For anomalous
        results a general security tip, which itself starts with a newline
        character, is always appended last.

    NOTE(review): the behavioural section is only reached when ``is_anomaly``
    is truthy - confirm that behavioural results actually carry that key.
    """
    if not anomaly_result.get('is_anomaly', False):
        return "Your financial activity appears normal. Continue monitoring your accounts regularly."

    tips = []

    # Transaction anomalies
    if 'details' in anomaly_result:
        detector_flags = anomaly_result['details']
        if detector_flags.get('zscore', False):
            tips.append("This transaction is unusually large or small compared to your typical patterns. "
                        "Ensure it was authorized by you.")

        # Both model-based detectors map to the same piece of advice.
        if detector_flags.get('isolation_forest', False) or detector_flags.get('autoencoder', False):
            tips.append("This transaction has unusual characteristics. "
                        "Verify the transaction details and contact your bank if you don't recognize it.")

    # Market anomalies
    if 'method' in anomaly_result and anomaly_result['method'] in ('arima', 'prophet'):
        tips.append("The market is showing unusual movement. "
                    "Consider waiting for stability before making investment decisions.")

    # Behavioral anomalies: section key -> advice, checked in this order
    if 'overall_status' in anomaly_result and anomaly_result['overall_status'].get('is_unusual', False):
        behavioral_advice = (
            ('transaction_frequency',
             "Your transaction frequency has changed significantly. "
             "Review your recent account activity."),
            ('transaction_amounts',
             "Your recent transaction amounts are different from your usual patterns. "
             "Ensure all transactions are authorized."),
            ('location_patterns',
             "Transactions from new or unusual locations detected. "
             "Verify these transactions and consider updating your security settings."),
        )
        for section_key, message in behavioral_advice:
            if anomaly_result.get(section_key, {}).get('is_unusual', False):
                tips.append(message)

    # Flagged, but none of the known signals matched
    if not tips:
        tips.append("An unusual pattern has been detected. Review your recent activities as a precaution.")

    # Add general security advice
    tips.append("\nGeneral security tips: "
                "Enable two-factor authentication, regularly update your passwords, "
                "and monitor your accounts for unauthorized activity.")

    return " ".join(tips)

# Example usage
# Demo of the API functions above; the guard makes it run only when this code
# executes as the main module (`__name__ == "__main__"`).
if __name__ == "__main__":
    # Example transaction
    example_transaction = {
        'date': '2025-03-01',
        'customer_id': 1234,
        'transaction_type': 'withdrawal',
        'amount': 50000,  # Unusually large amount
        'location': 'Nairobi'
    }
    
    # Detect anomaly
    # Delegates to the shared `anomaly_service` through the wrapper function.
    result = detect_transaction_anomaly(example_transaction)
    
    # Print result
    # The result is indexed directly here, so it must contain an 'is_anomaly' key.
    print(f"Is anomaly: {result['is_anomaly']}")
    print(f"Description: {get_anomaly_description(result)}")
    print(f"Advice: {generate_user_advice(result)}")