In [157]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import optimizers
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample


# 1. Load and Combine Data

In [158]:
# Load attack and ambient datasets
can_data = pd.read_csv(r"G:\road\signal_extractions\attacks\correlated_signal_attack_1_masquerade.csv")

# 2. Preprocess Missing Values

In [159]:
can_data.fillna(0, inplace=True)  # Replace NaNs with 0

# 3. Normalize Time

In [160]:
# Convert timestamps to uniform intervals (e.g., 10 Hz)
signal_columns = [col for col in can_data.columns if "Signal" in col]  # Identify all signal columns

In [161]:
# Check the time differences between each row for sampling rate
time_diffs = can_data['Time'].diff().dropna()
print(time_diffs.describe()) 

count    7.415100e+04
mean     4.464114e-04
std      6.610735e-04
min      1.192093e-07
25%      9.536743e-07
50%      1.907349e-06
75%      1.006067e-03
max      4.119039e-03
Name: Time, dtype: float64


In [162]:
# Ensure 'Time' column is in datetime format
can_data['Time'] = pd.to_datetime(can_data['Time'], unit='s')

In [163]:
# Sort data by time
can_data = can_data.sort_values('Time')

In [164]:
# Set 'Time' as the index
can_data.set_index('Time', inplace=True)

In [165]:
# Ensure signal columns are numeric
can_data[signal_columns] = can_data[signal_columns].apply(pd.to_numeric, errors='coerce')

In [166]:
# Resample at 100µs (0.1ms) to retain more details
can_data = can_data.resample('100us').mean()

In [167]:
# Interpolate missing values using cubic interpolation for smoother transitions
can_data = can_data.interpolate(method='cubic')

In [168]:
# Reset the index after resampling
can_data.reset_index(inplace=True)

# 4. Feature Engineering

In [169]:
print(can_data)

                             Time  Label           ID  Signal_1_of_ID  \
0      1970-01-01 00:00:00.000000    0.0   852.000000    32808.000000   
1      1970-01-01 00:00:00.000100    0.0   897.633224    26911.493851   
2      1970-01-01 00:00:00.000200    0.0   923.437535    21708.051223   
3      1970-01-01 00:00:00.000300    0.0   931.218924    17156.830057   
4      1970-01-01 00:00:00.000400    0.0   922.783381    13216.988297   
...                           ...    ...          ...             ...   
331014 1970-01-01 00:00:33.101400    0.0  1082.926392     2682.440185   
331015 1970-01-01 00:00:33.101500    0.0   865.442574     2214.631360   
331016 1970-01-01 00:00:33.101600    0.0   615.794156     1629.515787   
331017 1970-01-01 00:00:33.101700    0.0   332.480258      920.002366   
331018 1970-01-01 00:00:33.101800    0.0    14.000000       79.000000   

        Signal_2_of_ID  Signal_3_of_ID  Signal_4_of_ID  Signal_5_of_ID  \
0             0.000000       78.000000        9.0

# 5. Create Sliding Windows

In [170]:
def create_time_series(data, window_size, step_size):
    """
    Generate time-series data using a sliding window approach.
    
    Parameters:
    - data: Pandas DataFrame containing the resampled data.
    - window_size: Number of samples in each window (e.g., 100,000 for 10 seconds at 10,000 Hz).
    - step_size: Step size in samples (e.g., 10,000 for 1 second at 10,000 Hz).
    
    Returns:
    - sequences: Numpy array of signal data sequences.
    - labels: Numpy array of labels for each sequence.
    """
    sequences, labels = [], []
    for i in range(0, len(data) - window_size, step_size):
        # Extract window
        window = data.iloc[i:i + window_size]
        
        # Extract signal data for the window
        sequences.append(window[signal_columns].values)
        
        label = 1 if (window['Label'].sum() > (window_size / 2)) else 0 # Majority vote
        #label = 1 if (window['Label'].sum() > 0) else 0 # Any attack label and the window is labeled as an attack for less computational ressources
        labels.append(label)
    
    return np.array(sequences), np.array(labels)

In [171]:
#  Adjusted parameters for higher frequency data (100µs, 10,000 Hz)
sampling_rate = 10000  # Hz (I changed this to 1000 Hz for less computation time during code testing)
window_size = 10 #(0.5 * sampling_rate)  # 10 seconds = 100,000 samples
step_size = 1 #(0.1 * sampling_rate)  # 1 second = 10,000 samples

In [172]:
# Generate time-series data
X, y = create_time_series(can_data, window_size, step_size)

# 6. Split Data into Train/Test Sets

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [174]:
# Replace NaN or inf values with 0
X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)

In [175]:
# Fit and transform training data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

# Adressing data imbalance

In [176]:
"""" ideas...

# Combine features and labels into a single DataFrame
train_data = pd.DataFrame(X_train.reshape(X_train.shape[0], -1))
train_data['label'] = y_train

# Separate majority and minority classes
majority = train_data[train_data['label'] == 0]
minority = train_data[train_data['label'] == 1]

# Oversample the minority class
minority_oversampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)

# Combine back the oversampled dataset
balanced_train_data = pd.concat([majority, minority_oversampled])
X_train = balanced_train_data.drop('label', axis=1).values.reshape(-1, X_train.shape[1], X_train.shape[2])
y_train = balanced_train_data['label'].values
"""

'" ideas...\n\n# Combine features and labels into a single DataFrame\ntrain_data = pd.DataFrame(X_train.reshape(X_train.shape[0], -1))\ntrain_data[\'label\'] = y_train\n\n# Separate majority and minority classes\nmajority = train_data[train_data[\'label\'] == 0]\nminority = train_data[train_data[\'label\'] == 1]\n\n# Oversample the minority class\nminority_oversampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)\n\n# Combine back the oversampled dataset\nbalanced_train_data = pd.concat([majority, minority_oversampled])\nX_train = balanced_train_data.drop(\'label\', axis=1).values.reshape(-1, X_train.shape[1], X_train.shape[2])\ny_train = balanced_train_data[\'label\'].values\n'

# Build LSTM Model (old approach)

In [177]:
"""
model = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    Dropout(0.3),
    LSTM(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output: Trust score (0 or 1 for binary classification)
])"
"""

'\nmodel = Sequential([\n    LSTM(64, activation=\'relu\', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),\n    Dropout(0.3),\n    LSTM(32, activation=\'relu\'),\n    Dense(1, activation=\'sigmoid\')  # Output: Trust score (0 or 1 for binary classification)\n])"\n'

# Build CNN + LSTM Model

In [178]:
model = Sequential([
    # CNN Feature Extraction
    Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', input_shape=(X_train.shape[1], X_train.shape[2])),
    MaxPooling1D(pool_size=2),  # Downsample to reduce dimensionality
    Dropout(0.3),  # Prevent overfitting

    # LSTM for Sequential Analysis
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32),
    
    # Fully Connected Layer
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification (0 = normal, 1 = attack)
])

In [179]:
# Compile the model
optimizer = Adam(learning_rate=0.001, clipnorm=1.0) # add clipvalue=0.5 to avoid exploding gradients
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy','Precision','Recall'])

In [180]:
# Train the model

history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [181]:
# Evaluate the model
"""
test_loss, test_acc = model.evaluate(X_test)
print(f"Test Accuracy: {test_acc}")
preds = model.predict(X_test[:10])
print(preds)"
"""


'\ntest_loss, test_acc = model.evaluate(X_test)\nprint(f"Test Accuracy: {test_acc}")\npreds = model.predict(X_test[:10])\nprint(preds)"\n'

# Detect Anomalies

In [182]:
print(np.unique(y_train, return_counts=True)) # Check class distribution and balance
print(np.unique(y_test, return_counts=True)) # Check class distribution and balance
print(X_train.shape)  # Should be (num_samples, timesteps, num_features)
print(X_test.shape)   # Should match training dimensions

(array([0, 1]), array([256192,   8615], dtype=int64))
(array([0, 1]), array([64107,  2095], dtype=int64))
(264807, 10, 22)
(66202, 10, 22)


In [183]:
print("Training Labels Distribution:", np.unique(y_train, return_counts=True))
print("Validation Labels Distribution:", np.unique(y_test, return_counts=True))


Training Labels Distribution: (array([0, 1]), array([256192,   8615], dtype=int64))
Validation Labels Distribution: (array([0, 1]), array([64107,  2095], dtype=int64))
