In [3]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from pykalman import KalmanFilter
import joblib
import pickle

# Load AIS data with extracted features
vessel_data = pd.read_csv('Generated_csv/vessel_data_with_features.csv')

### Moving Average Anomaly Detection ###
def moving_avg_anomaly_detection(df, column, window=5, threshold=2):
    moving_avg_column = f'{column}_moving_avg'
    anomaly_column = f'{column}_anomaly_moving_avg'
    
    # Calculate moving average
    df[moving_avg_column] = df[column].rolling(window=window).mean()
    
    # Calculate anomalies based on deviations from the moving average
    df[anomaly_column] = np.abs(df[column] - df[moving_avg_column]) > threshold * df[column].std()
    
    return df

# Apply Moving Average Anomaly Detection for SOG and COG
vessel_data = moving_avg_anomaly_detection(vessel_data, 'SOG')
vessel_data = moving_avg_anomaly_detection(vessel_data, 'COG')

### Kalman Filter Anomaly Detection ###
def kalman_filter_anomaly_detection(df, column):
    kf = KalmanFilter(initial_state_mean=0, n_dim_obs=1)
    state_means, _ = kf.em(df[column].values).filter(df[column].values)
    
    # Save Kalman Filter parameters
    with open(f'{column}_kalman_filter.pkl', 'wb') as f:
        pickle.dump(kf, f)
    
    df[f'{column}_kalman'] = state_means
    df[f'{column}_anomaly_kalman'] = np.abs(df[column] - state_means) > 2 * df[column].std()
    
    return df

# Apply Kalman Filter for SOG
vessel_data = kalman_filter_anomaly_detection(vessel_data, 'SOG')

### DBSCAN Anomaly Detection ###
def kalman_filter_anomaly_detection(df, column):
    kf = KalmanFilter(initial_state_mean=df[column].iloc[0], n_dim_obs=1)
    state_means, _ = kf.smooth(df[column].values)
    
    # Flatten the state_means to ensure it's 1-dimensional
    state_means = state_means.ravel()
    
    # Save the Kalman filter state means
    df[f'{column}_kalman'] = state_means

    # Calculate anomalies
    df[f'{column}_anomaly_kalman'] = np.abs(df[column] - state_means) > 2 * df[column].std()

    return df

# Apply DBSCAN
features = vessel_data[['SOG', 'COG', 'LAT', 'LON']].values
vessel_data = dbscan_anomaly_detection(vessel_data, features)

### Isolation Forest Anomaly Detection ###
def isolation_forest_anomaly_detection(df, features, contamination=0.01):
    clf = IsolationForest(contamination=contamination)
    df['anomaly_isolation_forest'] = clf.fit_predict(features) == -1
    
    # Save Isolation Forest model
    joblib.dump(clf, 'isolation_forest_model.joblib')
    
    return df

# Apply Isolation Forest
vessel_data = isolation_forest_anomaly_detection(vessel_data, features)

### Autoencoder Anomaly Detection ###
def autoencoder_anomaly_detection(df, features, encoding_dim=2, epochs=50, batch_size=32):
    input_dim = features.shape[1]
    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation='relu')(input_layer)
    decoder = Dense(input_dim, activation='linear')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')
    
    # Normalize features
    mean = features.mean(axis=0)
    std = features.std(axis=0)
    features_norm = (features - mean) / std
    
    # Train the autoencoder
    autoencoder.fit(features_norm, features_norm, epochs=epochs, batch_size=batch_size, shuffle=True, validation_split=0.1)
    
    # Save the autoencoder model
    autoencoder.save('autoencoder_model.h5')
    
    # Calculate reconstruction error
    reconstructions = autoencoder.predict(features_norm)
    mse = np.mean(np.power(features_norm - reconstructions, 2), axis=1)
    df['anomaly_autoencoder'] = mse > np.percentile(mse, 95)
    
    return df

# Apply Autoencoder
vessel_data = autoencoder_anomaly_detection(vessel_data, features)

### Z-Score Anomaly Detection ###
def zscore_anomaly_detection(df, column, threshold=2):
    zscore_column = f'{column}_zscore'
    anomaly_column = f'{column}_zscore_anomaly'
    
    # Calculate z-score
    df[zscore_column] = (df[column] - df[column].mean()) / df[column].std()
    
    # Identify anomalies based on z-score
    df[anomaly_column] = np.abs(df[zscore_column]) > threshold
    
    return df

# Apply Z-Score Anomaly Detection for SOG and COG
vessel_data = zscore_anomaly_detection(vessel_data, 'SOG')
vessel_data = zscore_anomaly_detection(vessel_data, 'COG')

### Combine Anomalies ###
def combine_anomalies(df):
    required_columns = [
        'SOG_zscore_anomaly', 'COG_zscore_anomaly',
        'SOG_anomaly_moving_avg', 'COG_anomaly_moving_avg', 
        'SOG_anomaly_kalman', 'anomaly_dbscan', 
        'anomaly_isolation_forest', 'anomaly_autoencoder'
    ]
    
    # Ensure all required columns exist, if not create them with False as default
    for col in required_columns:
        if col not in df.columns:
            df[col] = False
    
    # Combine anomalies using a voting mechanism
    df['combined_anomaly_vote'] = (
        df[required_columns].sum(axis=1) >= 3
    )
    
    # Weighted score approach
    weights = {
        'SOG_zscore_anomaly': 1.0,
        'COG_zscore_anomaly': 1.0,
        'SOG_anomaly_moving_avg': 1.5,
        'COG_anomaly_moving_avg': 1.5,
        'SOG_anomaly_kalman': 2.0,
        'anomaly_dbscan': 2.0,
        'anomaly_isolation_forest': 2.0,
        'anomaly_autoencoder': 2.0
    }
    df['combined_anomaly_weighted'] = (
        df[list(weights.keys())] * pd.Series(weights)
    ).sum(axis=1)
    
    return df

# Combine anomalies
vessel_data = combine_anomalies(vessel_data)

# Save the combined results to a CSV file
vessel_data.to_csv('vessel_data_combined_anomalies1.csv', index=False)
print("Combined anomaly detection results have been saved to 'vessel_data_combined_anomalies.csv'")


ValueError: Data must be 1-dimensional, got ndarray of shape (8877, 8877) instead