In [1]:
import pandas as pd

# Load the dataset
# The file is read with ';' as the field separator.
file_path = "ar41_for_ulb_mini.csv"
df = pd.read_csv(file_path, sep=';')

# Display the first few rows of the dataset
print("Dataset Overview:")
print(df.head())

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
print("\nDataset Information:")
df.info()

# Summary statistics for numerical columns
print("\nSummary Statistics:")
print(df.describe())

Dataset Overview:
   Unnamed: 0  mapped_veh_id       timestamps_UTC        lat       lon  \
0           0            181  2023-08-01 03:44:12  50.769818  3.872114   
1           1            143  2023-08-01 06:36:29  51.039993  3.693429   
2           2            183  2023-08-24 06:53:54  50.742203  3.602035   
3           3            177  2023-08-01 13:53:38  50.930914  5.327132   
4           4            143  2023-08-24 07:02:30  51.180773  3.575259   

   RS_E_InAirTemp_PC1  RS_E_InAirTemp_PC2  RS_E_OilPress_PC1  \
0                27.0                23.0              255.0   
1                33.0                32.0              272.0   
2                31.0                33.0              234.0   
3                35.0                38.0              220.0   
4                41.0                34.0              227.0   

   RS_E_OilPress_PC2  RS_E_RPM_PC1  RS_E_RPM_PC2  RS_E_WatTemp_PC1  \
0              238.0         794.0         801.0              83.0   
1           

Exploration:

Perform exploratory data analysis (EDA) to understand the distribution of each variable, identify patterns, and gain insights into the data.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count the missing entries in every column
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# List the distinct values of every object-dtype column
print("\nUnique Values:")
object_columns = df.select_dtypes(include='object').columns
for column in object_columns:
    distinct_values = df[column].unique()
    print(f"{column}: {distinct_values}")

# Histogram with a KDE overlay for one sensor channel
# (swap the column name to inspect another feature)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['RS_E_InAirTemp_PC1'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of RS_E_InAirTemp_PC1')
ax.set_xlabel('RS_E_InAirTemp_PC1')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Advanced visualizations for numerical features
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation heatmap for numerical features.
# numeric_only=True keeps non-numeric columns (e.g. the text timestamps) out of
# the correlation; pandas >= 2.0 raises on them instead of dropping them.
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

# Pair plot for selected numerical features
numerical_features = ['lat', 'lon', 'RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1']
sns.pairplot(df[numerical_features])
plt.suptitle('Pair Plot of Selected Numerical Features', y=1.02)
plt.show()

# Time series plot for selected features.
# The rows are not stored chronologically and the timestamps may still be
# strings, so a parsed and sorted copy is plotted; `df` itself is not modified.
time_series = (
    df[['timestamps_UTC', 'RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1']]
    .assign(timestamps_UTC=lambda frame: pd.to_datetime(frame['timestamps_UTC']))
    .sort_values('timestamps_UTC')
)
plt.figure(figsize=(14, 6))
plt.plot(time_series['timestamps_UTC'], time_series['RS_E_InAirTemp_PC1'], label='RS_E_InAirTemp_PC1')
plt.plot(time_series['timestamps_UTC'], time_series['RS_E_OilPress_PC1'], label='RS_E_OilPress_PC1')
plt.title('Time Series Plot of Selected Features')
plt.xlabel('Timestamp')
plt.ylabel('Values')
plt.legend()
plt.show()

In [2]:
# Multivariate outlier detection using Mahalanobis distance
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.covariance import EllipticEnvelope

# Select relevant numerical features for multivariate outlier detection
multivariate_features = ['lat', 'lon', 'RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1']

# EllipticEnvelope rejects NaN input, so only rows with every selected feature
# present are used for fitting and scoring.
complete_rows = df[multivariate_features].dropna()

# Fit Elliptic Envelope model (seeded so the flagged rows are reproducible)
envelope = EllipticEnvelope(random_state=42)
outlier_flags = pd.Series(
    envelope.fit_predict(complete_rows.to_numpy()),
    index=complete_rows.index,
)

# Rows that could not be scored because of missing values are kept as inliers (1);
# -1 marks an outlier, matching the estimator's own convention.
df['multivariate_outlier'] = outlier_flags.reindex(df.index).fillna(1).astype(int)

# Visualize multivariate outliers
plt.figure(figsize=(10, 6))
sns.scatterplot(x='lon', y='lat', hue='multivariate_outlier', data=df, palette='viridis')
plt.title('Multivariate Outlier Detection')
plt.show()

# Drop multivariate outliers; .copy() detaches the result from the original
# frame so later column assignments do not trigger SettingWithCopyWarning.
df = df[df['multivariate_outlier'] != -1].copy()


KeyboardInterrupt: 

In [None]:
# Compare the distribution of numerical features between normal and anomaly instances.
# NOTE(review): this relies on a 'label' column (0 = normal, 1 = anomaly); confirm
# it is added to `df` before this cell runs.
selected_features = ['RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1']
label_groups = {'Normal': 0, 'Anomaly': 1}
for feature in selected_features:
    plt.figure(figsize=(12, 6))
    for group_name, label_value in label_groups.items():
        # fill=True replaces the deprecated shade=True argument of kdeplot
        sns.kdeplot(df.loc[df['label'] == label_value, feature], label=group_name, fill=True)
    plt.title(f'Distribution Comparison for {feature}')
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.legend()
    plt.show()


Preprocessing:

Handle missing values, outliers, or any inconsistencies in the data.
Convert timestamp strings to datetime objects for time-based analysis.

In [None]:
# Convert 'timestamps_UTC' to datetime format
df['timestamps_UTC'] = pd.to_datetime(df['timestamps_UTC'])

# Handle missing values (you can choose different strategies based on your analysis)
# Example: Fill missing numerical values with the mean.
# numeric_only=True restricts the means to numeric columns, so the datetime
# column is neither averaged nor filled with a fabricated timestamp.
# Reassigning (instead of inplace=True) keeps the cell safe when `df` is a
# filtered view of an earlier frame.
df = df.fillna(df.mean(numeric_only=True))

# Check for outliers and consider handling them based on your analysis

# Ensure data types are appropriate for each column
# Example: Convert columns to appropriate data types
df['mapped_veh_id'] = df['mapped_veh_id'].astype('category')

# Any other preprocessing steps based on your exploration

# Save the preprocessed dataset if needed
# df.to_csv("path/to/your/preprocessed_dataset.csv", index=False)


In [3]:
# Handling Outliers using Z-score
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler

# Example: Detect and remove outliers for 'RS_E_InAirTemp_PC1'.
# nan_policy='omit' computes mean/std over the non-missing values; with the
# default ('propagate') a single NaN turns every z-score into NaN, the mask
# below becomes all False and no outlier is ever removed.
z_scores = zscore(df['RS_E_InAirTemp_PC1'], nan_policy='omit')
outliers = (z_scores > 3) | (z_scores < -3)
# .copy() so the column assignments below act on an independent frame
df = df[~outliers].copy()

# Feature Engineering
# Example: Create a new feature representing the hour of the day.
# The conversion is idempotent and makes the .dt accessor available even when
# the column is still stored as text.
df['timestamps_UTC'] = pd.to_datetime(df['timestamps_UTC'])
df['hour_of_day'] = df['timestamps_UTC'].dt.hour

# Impute missing values using advanced methods (e.g., interpolation).
# Linear interpolation is only meaningful for numeric columns, so it is
# restricted to them.
numeric_columns = df.select_dtypes(include='number').columns
df[numeric_columns] = df[numeric_columns].interpolate(method='linear')

# Encoding categorical variables if needed
# Example: One-hot encoding for 'mapped_veh_id'
df = pd.get_dummies(df, columns=['mapped_veh_id'], prefix='veh_id')

# Scaling numerical features if needed
scaler = StandardScaler()
numerical_columns = ['lat', 'lon', 'RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Any other advanced preprocessing steps based on your analysis

# Save the advanced preprocessed dataset if needed
# df.to_csv("path/to/your/advanced_preprocessed_dataset.csv", index=False)


KeyboardInterrupt: 

In [None]:
from sklearn.utils import resample

# Extract time-related features from the timestamp
df['day_of_week'] = df['timestamps_UTC'].dt.dayofweek
df['day_of_month'] = df['timestamps_UTC'].dt.day
df['month'] = df['timestamps_UTC'].dt.month
df['year'] = df['timestamps_UTC'].dt.year

# Lag features for time series analysis.
# shift()/rolling() operate on row order, so the frame is sorted by time first;
# the raw file is not chronological. When the vehicle id column is still
# present the features are computed per vehicle so that one vehicle's readings
# never leak into another's lag.
df = df.sort_values('timestamps_UTC')
has_vehicle_id = 'mapped_veh_id' in df.columns
for feature in ['RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1']:
    if has_vehicle_id:
        per_vehicle = df.groupby('mapped_veh_id', observed=True)[feature]
        df[f'{feature}_lag_1'] = per_vehicle.shift(1)
        df[f'{feature}_rolling_mean'] = per_vehicle.transform(
            lambda values: values.rolling(window=3).mean()
        )
    else:
        df[f'{feature}_lag_1'] = df[feature].shift(1)
        df[f'{feature}_rolling_mean'] = df[feature].rolling(window=3).mean()

# Handling imbalanced data (if applicable)
# Example: Resample to balance the number of normal and anomaly instances

# Separate normal and anomaly instances
normal_instances = df[df['label'] == 0]
anomaly_instances = df[df['label'] == 1]

# Upsample minority class (anomalies) to balance the dataset.
# resample() cannot draw from an empty frame, so without any labelled anomaly
# the normal instances are kept as they are.
if anomaly_instances.empty:
    df_balanced = normal_instances.copy()
else:
    anomaly_upsampled = resample(
        anomaly_instances, replace=True, n_samples=len(normal_instances), random_state=42
    )
    df_balanced = pd.concat([normal_instances, anomaly_upsampled])

# Save the advanced preprocessed dataset if needed
# df.to_csv("path/to/your/final_dataset.csv", index=False)


In [None]:
# Use dimensionality reduction techniques like PCA to reduce feature dimensionality
from sklearn.decomposition import PCA

# Specify the number of components based on explained variance ratio.
# PCA cannot return more components than there are input features, so the
# requested 5 is capped by the length of `selected_features`.
num_components = min(5, len(selected_features))
pca = PCA(n_components=num_components)
pca_result = pca.fit_transform(df[selected_features])

# Visualize explained variance ratio
plt.figure(figsize=(10, 6))
plt.plot(range(1, num_components + 1), pca.explained_variance_ratio_.cumsum(), marker='o', linestyle='--')
plt.title('Explained Variance Ratio for PCA Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()

# Transform the dataset with the selected number of components.
# The PCA frame reuses df's index: after earlier row filtering df no longer has
# a 0..n-1 index, and concat(axis=1) aligns on index labels, so a fresh
# RangeIndex would pair components with the wrong rows and introduce NaNs.
pca_columns = [f'PCA_{i}' for i in range(1, num_components + 1)]
pca_frame = pd.DataFrame(pca_result, columns=pca_columns, index=df.index)
df_pca = pd.concat([df, pca_frame], axis=1)


In [None]:
# KNN-based imputation of the remaining gaps in the sensor channels
from sklearn.impute import KNNImputer

# Columns whose missing entries are estimated from their nearest neighbours
features_for_imputation = ['RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1']

# Each gap is filled from the 5 closest rows
imputer = KNNImputer(n_neighbors=5)

# Fit on the selected columns and write the completed values back in place
imputed_values = imputer.fit_transform(df[features_for_imputation])
df[features_for_imputation] = imputed_values


Enrichment with Weather Data:
Data Source Integration:

Integrate weather data from sources like OpenWeatherMap or other available APIs.
Match weather data with timestamps from the train dataset.

In [None]:
import os

import requests


def get_weather_data(api_key, lat, lon, timestamp):
    """Fetch daily weather features for one location from OpenWeatherMap.

    Args:
        api_key: OpenWeatherMap API key.
        lat: Latitude of the observation.
        lon: Longitude of the observation.
        timestamp: Observation time; only its calendar date is used to pick
            the matching entry of the response's 'daily' list.

    Returns:
        dict with the keys 'temperature' and 'humidity'. Both values are None
        when the response has no daily entry for the requested date.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # https so the API key in the query string is not sent in clear text
    base_url = "https://api.openweathermap.org/data/2.5/onecall"
    params = {
        'lat': lat,
        'lon': lon,
        'exclude': 'current,minutely,hourly,alerts',
        'appid': api_key
    }

    # A timeout keeps one stalled request from hanging the whole enrichment loop
    response = requests.get(base_url, params=params, timeout=10)
    response.raise_for_status()
    weather_data = response.json()

    # Extract relevant weather features for the day of the observation.
    # NOTE(review): this endpoint is expected to return forecast days, so past
    # timestamps will probably find no match - confirm, or switch to a
    # historical endpoint.
    features = {'temperature': None, 'humidity': None}
    target_date = pd.Timestamp(timestamp).date()
    for day in weather_data.get('daily', []):
        if pd.to_datetime(day['dt'], unit='s').date() == target_date:
            features['temperature'] = day.get('temp', {}).get('day')
            features['humidity'] = day.get('humidity')
            break

    return features


# Specify your OpenWeatherMap API key (prefer the environment variable over
# editing the placeholder so the key never ends up in the notebook)
api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "your_openweathermap_api_key")

# Iterate through rows and enrich the dataset with weather data.
# NOTE(review): if 'lat'/'lon' were standardized by an earlier scaling cell,
# the raw coordinates must be used here instead - confirm.
weather_records = [
    get_weather_data(api_key, row['lat'], row['lon'], row['timestamps_UTC'])
    for _, row in df.iterrows()
]
weather_frame = pd.DataFrame(weather_records, index=df.index, columns=['temperature', 'humidity'])
df['temperature'] = weather_frame['temperature']
df['humidity'] = weather_frame['humidity']
# True where at least one weather value could be attached to the row
df['weather_enriched'] = weather_frame.notna().any(axis=1)

# Save the dataset with weather enrichment
# df.to_csv("path/to/your/enriched_dataset.csv", index=False)


In [None]:
import os

import requests


def get_darksky_data(api_key, lat, lon, timestamp):
    """Fetch daily weather features for one location and time from Dark Sky.

    NOTE(review): the Dark Sky API is reported to have been shut down in 2023;
    confirm the endpoint is still reachable before relying on this cell.

    Args:
        api_key: Dark Sky API key.
        lat: Latitude of the observation.
        lon: Longitude of the observation.
        timestamp: Observation time; naive values are interpreted as UTC.

    Returns:
        dict with the keys 'temperature' and 'humidity'. Both values are None
        when the response carries no daily data.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # The time segment of the URL must not contain spaces; a pandas Timestamp
    # renders as "YYYY-MM-DD HH:MM:SS", so UNIX seconds are sent instead.
    unix_time = int(pd.Timestamp(timestamp).timestamp())
    base_url = f"https://api.darksky.net/forecast/{api_key}/{lat},{lon},{unix_time}?exclude=currently,minutely,hourly,alerts"
    response = requests.get(base_url, timeout=10)
    response.raise_for_status()
    weather_data = response.json()

    # Extract relevant weather features from the first daily entry
    features = {'temperature': None, 'humidity': None}
    daily_entries = weather_data.get('daily', {}).get('data', [])
    if daily_entries:
        day = daily_entries[0]
        features['temperature'] = day.get('temperatureHigh')
        features['humidity'] = day.get('humidity')

    return features


# Specify your Dark Sky API key (prefer the environment variable over editing
# the placeholder so the key never ends up in the notebook)
darksky_api_key = os.environ.get("DARKSKY_API_KEY", "your_darksky_api_key")

# Iterate through rows and enrich the dataset with weather data from Dark Sky
darksky_records = [
    get_darksky_data(darksky_api_key, row['lat'], row['lon'], row['timestamps_UTC'])
    for _, row in df.iterrows()
]
darksky_frame = pd.DataFrame(darksky_records, index=df.index, columns=['temperature', 'humidity'])
df['temperature_darksky'] = darksky_frame['temperature']
df['humidity_darksky'] = darksky_frame['humidity']
# True where at least one Dark Sky value could be attached to the row
df['weather_enriched_darksky'] = darksky_frame.notna().any(axis=1)

# Save the dataset with Dark Sky weather enrichment
# df.to_csv("path/to/your/enriched_dataset_darksky.csv", index=False)


Developing and Comparing Anomaly Detection Methods:

Select Anomaly Detection Algorithm (Isolation Forest):
Isolation Forest Algorithm:
The Isolation Forest algorithm is an ensemble-based anomaly detection method that isolates anomalies by randomly selecting features and splitting data points. Anomalies are expected to be isolated in fewer splits, making them easier to detect.

Implementation:

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming your features include both original train features and weather features
# Replace this with your actual list of feature names
selected_features = ['RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1', 'temperature', 'humidity']

# Select features and label for training
X = df[selected_features]
y = df['label']  # Assuming you have a 'label' column indicating normal (0) and anomaly (1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Isolation Forest model
isolation_forest_model = IsolationForest(contamination='auto', random_state=42)
isolation_forest_model.fit(X_train)

# Predict anomalies on the testing set
y_pred = isolation_forest_model.predict(X_test)

# Map predictions to 0 (normal) and 1 (anomaly)
y_pred_mapped = np.where(y_pred == -1, 1, 0)

# Evaluate the performance of the model
print("Classification Report:")
print(classification_report(y_test, y_pred_mapped))

# Evaluate AUC-ROC on continuous anomaly scores rather than on the hard 0/1
# predictions: ROC AUC measures ranking quality, and thresholded labels reduce
# the curve to a single operating point.
# score_samples() is lower for more abnormal points, so it is negated to make
# larger values mean "more anomalous" (the positive class, label 1).
anomaly_scores = -isolation_forest_model.score_samples(X_test)
auc_roc = roc_auc_score(y_test, anomaly_scores)
print(f"AUC-ROC Score: {auc_roc}")


Enhanced Isolation Forest Implementation:
1. Feature Scaling:
Scale numerical features to ensure that all features contribute equally to the model.
This is particularly important if the features have different scales.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Isolation Forest trained on the standardized training features
isolation_forest_model_scaled = IsolationForest(contamination='auto', random_state=42)
isolation_forest_model_scaled.fit(X_train_scaled)

# Score the standardized test split and recode -1 (outlier) as 1, everything
# else as 0
y_pred_scaled = isolation_forest_model_scaled.predict(X_test_scaled)
y_pred_mapped_scaled = np.where(y_pred_scaled == -1, 1, 0)


2. Grid Search for Hyperparameter Tuning (Optional):
Perform a grid search to find optimal hyperparameters for the Isolation Forest model.

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV


def anomaly_roc_auc(estimator, X, y):
    """Score an outlier detector by ROC AUC with anomalies (label 1) as positives.

    The built-in 'roc_auc' scorer ranks samples by decision_function, which for
    Isolation Forest is HIGH for inliers. With anomalies encoded as 1 that
    inverts the metric and makes the search prefer the worst configuration, so
    the negated score_samples output (higher = more anomalous) is used instead.

    Args:
        estimator: Fitted detector exposing score_samples().
        X: Feature matrix of the validation fold.
        y: True labels of the fold (0 = normal, 1 = anomaly).

    Returns:
        ROC AUC as a float.
    """
    return roc_auc_score(y, -estimator.score_samples(X))


# Define hyperparameters to search
param_grid = {'n_estimators': [50, 100, 200], 'max_samples': ['auto', 100, 200]}

# Initialize Isolation Forest model
isolation_forest_model_tuned = IsolationForest(contamination='auto', random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(isolation_forest_model_tuned, param_grid, scoring=anomaly_roc_auc, cv=3)
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Use the best model for predictions on the testing set
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test_scaled)
y_pred_mapped_tuned = np.where(y_pred_tuned == -1, 1, 0)


3. Visualization of Anomalies:
Visualize the anomalies detected by the Isolation Forest model on the testing set.

In [None]:
import matplotlib.pyplot as plt

# X_test only holds the model features, so the timestamps of the test rows are
# looked up in the full frame through the index that train_test_split preserved.
timestamps_test = df.loc[X_test.index, 'timestamps_UTC']

# Plot anomalies over time
plt.figure(figsize=(12, 6))
plt.scatter(timestamps_test, y_pred_mapped_scaled, c=y_pred_mapped_scaled, cmap='viridis')
plt.title('Anomalies Detected by Isolation Forest (Scaled)')
plt.xlabel('Timestamp')
plt.ylabel('Anomaly Prediction (0: Normal, 1: Anomaly)')
plt.show()


Approach 2: Local Outlier Factor (LOF)
LOF Algorithm:
The Local Outlier Factor (LOF) is a density-based anomaly detection method. It measures the local density deviation of a data point with respect to its neighbors.

Implementation:

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Initialize the LOF model.
# novelty=True is required to score data the model was not fitted on; in the
# default mode only fit_predict() exists and it refits on whatever it is given,
# which would discard the training fit and learn the densities from the test
# set itself.
lof_model = LocalOutlierFactor(contamination='auto', novelty=True)

# Fit the LOF model on the training data
lof_model.fit(X_train_scaled)

# Predict anomalies on the scaled testing set using the training-set densities
y_pred_lof = lof_model.predict(X_test_scaled)
y_pred_mapped_lof = np.where(y_pred_lof == -1, 1, 0)


Approach 3: One-Class SVM
One-Class SVM Algorithm:
One-Class SVM is a support vector machine variant for novelty detection: it is trained only on the class of interest (inliers) and learns a boundary around those normal instances, so points that fall outside the boundary are flagged as anomalies.

Implementation:

In [None]:
from sklearn.svm import OneClassSVM

# One-Class SVM; tune 'nu' to suit the data
svm_model = OneClassSVM(nu=0.05)

# Learn the boundary from the standardized training features
svm_model.fit(X_train_scaled)

# Classify the standardized test split, then recode -1 (outlier) as 1 and
# everything else as 0
y_pred_svm = svm_model.predict(X_test_scaled)
svm_outlier_mask = y_pred_svm == -1
y_pred_mapped_svm = np.where(svm_outlier_mask, 1, 0)


Approach 4: Autoencoders
Autoencoder Algorithm:
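Autoencoders are a type of artificial neural network used for unsupervised learning. They learn a compressed representation of the input data and can be used for anomaly detection: a model trained to reconstruct normal instances accurately produces a high reconstruction error on anomalous ones, and that error serves as the anomaly score.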

Implementation:

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Build the Autoencoder model.
# The output layer is linear: the inputs are standardized (zero mean, values
# below 0 and above 1), which a sigmoid - bounded to (0, 1) - cannot reproduce.
n_inputs = X_train_scaled.shape[1]
model = Sequential([
    Dense(16, activation='relu', input_shape=(n_inputs,)),
    Dense(8, activation='relu'),
    Dense(16, activation='relu'),
    Dense(n_inputs, activation='linear')
])

model.compile(optimizer='adam', loss='mse')

# Fit the Autoencoder model on the training data
model.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, validation_split=0.2)

# Define the anomaly threshold (mean + 3 * standard deviation) from the
# TRAINING reconstruction error, so the cut-off does not depend on how many
# anomalies happen to be in the test set.
train_reconstruction = model.predict(X_train_scaled)
train_error = np.mean(np.square(X_train_scaled - train_reconstruction), axis=1)
threshold = np.mean(train_error) + 3 * np.std(train_error)

# Reconstruct data and calculate reconstruction error
X_pred = model.predict(X_test_scaled)
reconstruction_error = np.mean(np.square(X_test_scaled - X_pred), axis=1)

# Map predictions to 0 (normal) and 1 (anomaly)
y_pred_autoencoder = np.where(reconstruction_error > threshold, 1, 0)


Evaluate Performance:

Evaluation for LOF:


In [None]:
from sklearn.metrics import confusion_matrix

# LOF: confusion matrix, per-class report and ROC AUC on the test split
print("Evaluation for LOF:")
print("Confusion Matrix:")
lof_confusion = confusion_matrix(y_test, y_pred_mapped_lof)
print(lof_confusion)
print("\nClassification Report:")
lof_report = classification_report(y_test, y_pred_mapped_lof)
print(lof_report)
print("\nAUC-ROC Score:")
auc_roc_lof = roc_auc_score(y_test, y_pred_mapped_lof)
print(auc_roc_lof)

# LOF predictions over time, coloured by predicted class
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(timestamps_test, y_pred_mapped_lof, c=y_pred_mapped_lof, cmap='viridis')
ax.set_title('Anomalies Detected by LOF')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Anomaly Prediction (0: Normal, 1: Anomaly)')
plt.show()


Evaluation for One-Class SVM:

In [None]:
# One-Class SVM: confusion matrix, per-class report and ROC AUC on the test split
print("\nEvaluation for One-Class SVM:")
print("Confusion Matrix:")
svm_confusion = confusion_matrix(y_test, y_pred_mapped_svm)
print(svm_confusion)
print("\nClassification Report:")
svm_report = classification_report(y_test, y_pred_mapped_svm)
print(svm_report)
print("\nAUC-ROC Score:")
auc_roc_svm = roc_auc_score(y_test, y_pred_mapped_svm)
print(auc_roc_svm)

# One-Class SVM predictions over time, coloured by predicted class
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(timestamps_test, y_pred_mapped_svm, c=y_pred_mapped_svm, cmap='viridis')
ax.set_title('Anomalies Detected by One-Class SVM')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Anomaly Prediction (0: Normal, 1: Anomaly)')
plt.show()


Evaluation for Autoencoders:

In [None]:
# Autoencoder: confusion matrix, per-class report and ROC AUC on the test split
print("\nEvaluation for Autoencoders:")
print("Confusion Matrix:")
autoencoder_confusion = confusion_matrix(y_test, y_pred_autoencoder)
print(autoencoder_confusion)
print("\nClassification Report:")
autoencoder_report = classification_report(y_test, y_pred_autoencoder)
print(autoencoder_report)
print("\nAUC-ROC Score:")
auc_roc_autoencoder = roc_auc_score(y_test, y_pred_autoencoder)
print(auc_roc_autoencoder)

# Autoencoder predictions over time, coloured by predicted class
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(timestamps_test, y_pred_autoencoder, c=y_pred_autoencoder, cmap='viridis')
ax.set_title('Anomalies Detected by Autoencoders')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Anomaly Prediction (0: Normal, 1: Anomaly)')
plt.show()


Model Comparison:
First, let's compare the selected anomaly detection models (Isolation Forest, LOF, One-Class SVM, and Autoencoders) based on key evaluation metrics.

In [None]:
# Isolation Forest (scaled variant): confusion matrix, per-class report and
# ROC AUC on the test split
print("Evaluation for Isolation Forest:")
print("Confusion Matrix:")
isolation_forest_confusion = confusion_matrix(y_test, y_pred_mapped_scaled)
print(isolation_forest_confusion)
print("\nClassification Report:")
isolation_forest_report = classification_report(y_test, y_pred_mapped_scaled)
print(isolation_forest_report)
print("\nAUC-ROC Score:")
auc_roc_isolation_forest = roc_auc_score(y_test, y_pred_mapped_scaled)
print(auc_roc_isolation_forest)

Dashboard Development with Plotly and Dash:
Dashboard Features:
Time Series Chart:

Displays the anomalies over time, allowing you to observe patterns.
Geographical Distribution Map:

Represents the anomalies on a geographical map using latitude and longitude.
Metrics Summary:

Presents key evaluation metrics (precision, recall, F1-score, AUC-ROC) for a quick overview.
Additional Steps:
Precision, Recall, F1-Score, AUC-ROC:

Before running the dashboard, calculate precision, recall, F1-score, and AUC-ROC for the selected anomaly detection approach (Isolation Forest, LOF, One-Class SVM, or Autoencoders).
Replace Column Names:

Replace the placeholder column names in the code with your actual column names.
Run the Dashboard:

Run the provided Dash app code locally.
Open a web browser and navigate to http://127.0.0.1:8050/ to view the dashboard.
Dashboard Interaction (Optional):

Enhance the dashboard with additional interactive features or customizations based on your requirements.

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# X_test only contains the model features, so timestamp and position of the
# test rows are looked up in the full frame via the index preserved by
# train_test_split.
test_rows = df.loc[X_test.index]

# Sorted by time so the line chart is drawn chronologically instead of
# zig-zagging between rows in storage order.
df_dashboard = pd.DataFrame({
    'Timestamp': test_rows['timestamps_UTC'],
    'Latitude': test_rows['lat'],
    'Longitude': test_rows['lon'],
    'Anomaly': y_test
}).sort_values('Timestamp')

# Metrics shown in the summary panel, computed for the scaled Isolation Forest
# predictions. zero_division=0 avoids warnings when no anomaly is predicted.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test, y_pred_mapped_scaled, average='binary', zero_division=0
)

# Predictions of every model on the test rows, for the comparison chart
df_predictions = pd.DataFrame({
    'Timestamp': test_rows['timestamps_UTC'],
    'Isolation_Forest': y_pred_mapped_scaled,
    'LOF': y_pred_mapped_lof,
    'One_Class_SVM': y_pred_mapped_svm,
    'Autoencoder': y_pred_autoencoder
}).sort_values('Timestamp')

# Initialize Dash app
app = dash.Dash(__name__)

# Define the layout once. Assigning app.layout a second time replaces the first
# layout entirely, so the overview charts and the model comparison live in a
# single Div.
app.layout = html.Div(children=[
    html.H1("Train Anomaly Detection Dashboard"),

    # Line chart of anomalies over time
    dcc.Graph(
        id='time-series-chart',
        figure=px.line(df_dashboard, x='Timestamp', y='Anomaly', title='Anomalies Over Time')
    ),

    # Scatter map of anomalies
    dcc.Graph(
        id='map-chart',
        figure=px.scatter_geo(
            df_dashboard,
            lat='Latitude',
            lon='Longitude',
            color='Anomaly',
            title='Geographical Distribution of Anomalies'
        )
    ),

    # Metrics summary
    html.Div([
        html.H3("Metrics Summary"),
        html.Div([
            html.P(f"Precision: {precision:.2f}"),
            html.P(f"Recall: {recall:.2f}"),
            html.P(f"F1-Score: {f1_score:.2f}"),
            html.P(f"AUC-ROC: {auc_roc:.2f}")
        ])
    ]),

    # Model comparison line chart
    dcc.Graph(
        id='model-comparison-chart',
        figure=px.line(df_predictions, x='Timestamp', y=df_predictions.columns[1:],
                       title='Model Comparison - Anomalies Over Time')
    ),

    # Model comparison metrics summary
    html.Div([
        html.H3("Model Comparison Metrics"),
        html.Div([
            html.P(f"Isolation Forest AUC-ROC: {auc_roc_isolation_forest:.2f}"),
            # Add metrics for other models as needed
        ])
    ])
])

# Run the app
# NOTE(review): newer Dash releases expose app.run(); confirm run_server is
# still available in the installed version.
if __name__ == '__main__':
    app.run_server(debug=True)

Simulating Streaming Data:

In [None]:
import time
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Function to generate a new observation
def generate_observation(n_features=14):
    """Create one synthetic observation as a single-row DataFrame.

    Args:
        n_features: Number of random sensor features to generate. Defaults to
            14; pass the number of features the downstream scaler/model was
            fitted on so the shapes agree.

    Returns:
        DataFrame with one row and the columns 'timestamps_UTC', 'lat', 'lon'
        followed by 'feature_0' .. 'feature_{n_features - 1}'. The timestamp is
        the current UTC time, lat is drawn uniformly from [50, 52), lon from
        [3, 5) and the features from a standard normal distribution.
    """
    timestamp = datetime.utcnow()
    lat = np.random.uniform(50, 52)
    lon = np.random.uniform(3, 5)
    features = np.random.normal(0, 1, n_features)
    feature_names = [f'feature_{i}' for i in range(n_features)]
    return pd.DataFrame([[timestamp, lat, lon] + list(features)],
                        columns=['timestamps_UTC', 'lat', 'lon'] + feature_names)

# Simulate streaming data.
# The loop has no exit condition: it runs until the kernel is interrupted.
while True:
    new_observation = generate_observation()

    # Everything except the first column (the timestamp) is passed to the scaler.
    # NOTE(review): the scaler must have been fitted on the same number of
    # columns as are passed here - confirm against the training features.
    observation_features = new_observation.iloc[:, 1:]
    new_observation_scaled = scaler.transform(observation_features)

    # Classify the observation with the scaled Isolation Forest model
    new_observation_pred = isolation_forest_model_scaled.predict(new_observation_scaled)

    # -1 is the model's outlier code; report the observation's timestamp
    is_anomaly = new_observation_pred == -1
    if is_anomaly:
        detected_at = new_observation['timestamps_UTC'].values[0]
        print(f"Anomaly detected at {detected_at}")
        # Update the dashboard dynamically (e.g., using Dash callback)

    # Wait before generating the next observation (simulated streaming interval)
    time.sleep(10)


Additional Steps:
Adjust Model and Features:

Replace the anomaly detection model and features in the script based on the chosen approach (Isolation Forest, LOF, One-Class SVM, or Autoencoders).
Streaming Environment:

Run the streaming data script in a separate terminal or as a background process to continuously generate new observations.
Real-Time Anomaly Detection:

As new observations are generated, the script will preprocess and predict anomalies in real-time using the selected model.
Monitoring and Logging (Optional):

Implement logging or monitoring mechanisms to track anomalies and system performance.

In [None]:
# Import necessary Dash components
from dash.dependencies import Input, Output
import plotly.graph_objs as go

# Assume 'app' is your existing Dash app

# Callback that redraws the real-time anomaly chart on every interval tick
@app.callback(Output('real-time-chart', 'figure'),
              [Input('interval-component', 'n_intervals')])
def update_real_time_chart(n):
    """Build the figure for the 'real-time-chart' graph.

    Args:
        n: Value of the interval component's 'n_intervals' property; it only
            triggers the refresh and is not used in the body.

    Returns:
        A plotly line figure of 'Anomaly' against 'timestamps_UTC'.
    """
    # get_latest_anomalies() is expected to be provided elsewhere (shared
    # storage or database access); it is not defined in this cell.
    latest_anomalies = get_latest_anomalies()  # Implement this function based on your storage

    return px.line(latest_anomalies, x='timestamps_UTC', y='Anomaly', title='Real-Time Anomalies')

# Layout with the real-time chart and the timer that drives its refresh.
# This assignment replaces whatever layout the app had before.
dashboard_title = html.H1("Train Anomaly Detection Dashboard")

# ... (previous layout components)

# Real-time anomaly chart (filled by the callback)
real_time_graph = dcc.Graph(id='real-time-chart')

# Interval component for periodic updates
refresh_timer = dcc.Interval(
    id='interval-component',
    interval=10 * 1000,  # in milliseconds
    n_intervals=0
)

app.layout = html.Div(children=[dashboard_title, real_time_graph, refresh_timer])

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


Shared Storage for Anomalies:

Implement a shared storage mechanism (database, cache, etc.) to store and retrieve the latest anomalies.
Streaming Data and Dashboard Integration:

Run the streaming data script and the Dash app simultaneously. The dashboard should now update in real-time when anomalies are detected.