In [None]:
# Runtime switch: set AWS to True when running on a SageMaker instance so the
# data-loading cell reads its CSV files from S3 instead of the local ./Dataset
# folder. Adjust REGION / BUCKET (and the object keys used in that cell) to match.
AWS = False

# S3 location settings; only consulted when AWS is True.
REGION, BUCKET = 'eu-west-1', 'ml-can-ids-logs'

# Placeholder for the S3 client; the data-loading cell assigns it when AWS is True.
s3 = None

import os
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from feature_selection import feature_selection_preparation

# Make pandas show complete frames: every column, no line wrapping of wide
# rows, and untruncated cell contents (useful for long strings).
for _display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)


# Fixed seed for repeatable runs while testing/debugging; remove afterwards.
SEED = 42
# NOTE(review): presumably asks TensorFlow for deterministic ops; TensorFlow
# itself is not imported in the visible cells, so confirm it is still needed.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
np.random.seed(SEED)



In [2]:
def _fetch_s3_csv_text(client, bucket, key):
    """Download object `key` from `bucket` via `client` and return its body decoded as UTF-8 text."""
    return client.get_object(Bucket=bucket, Key=key)['Body'].read().decode('utf-8')


# Choose the data sources. Every `*_csv_path` name below is either a local file
# path (AWS is False) or an in-memory StringIO holding the CSV text downloaded
# from S3 (AWS is True).
if AWS:
    # NOTE(review): get_s3_client is not defined or imported in the visible
    # cells -- confirm where it comes from before enabling AWS mode.
    s3 = get_s3_client(REGION, BUCKET, True)

    # Download each CSV from S3 and wrap its text in a file-like buffer.
    # NOTE(review): a StringIO is consumed once it has been read to the end;
    # re-run this cell before loading the same buffer a second time.
    channel2logs = StringIO(_fetch_s3_csv_text(s3, BUCKET, 'channel2Logs.csv'))
    dos_attack_channel2 = StringIO(_fetch_s3_csv_text(s3, BUCKET, 'dos_attack_channel2.csv'))
    replay_attack_channel2 = StringIO(_fetch_s3_csv_text(s3, BUCKET, 'replay_attack_channel2.csv'))
    _new_spoofing_text = _fetch_s3_csv_text(s3, BUCKET, 'new_spoofing_attack_channel2.csv')
    spoofing_attack_channel2 = StringIO(_new_spoofing_text)

    # Attack based on Channel
    preprocessed_DoS_channel2_csv_path = dos_attack_channel2  # DoS on channel 2 (Red Channel)
    preprocessed_Replay_channel2_csv_path = replay_attack_channel2  # Replay on channel 2 (Red Channel)
    preprocessed_Spoofing_channel2_csv_path = spoofing_attack_channel2  # Spoofing on channel 2 (Red Channel)
    # Same S3 object as above, exposed under the name the local branch uses for
    # the "new" spoofing log; it gets its own buffer so reading one does not
    # exhaust the other.
    preprocessed_new_Spoofing_channel2_csv_path = StringIO(_new_spoofing_text)

    # Unprocessed Channel Data
    preprocessed_normal_channel2_csv_path = channel2logs  # Red Channel
    preprocessed_normal_channel4_csv_path = ""  # Yellow Channel (not available from S3 here)
    preprocessed_normal_channel5_csv_path = ""  # Green Channel (not available from S3 here)

    # Current best model (no model path configured for AWS mode)
    best_model_path = ""
else:
    # Unprocessed Normal and Attack Data
    preprocessed_normal_csv_path = './Dataset/Tw22206_L003_with_ecu_channel.csv'  # Normal Unprocessed
    preprocessed_DoS_csv_path = './Dataset/Attack_Logs/dos_attack.csv'  # Dos Unprocessed
    preprocessed_Fuzzy_csv_path = './Dataset/Attack_Logs/fuzzy_attack.csv'  # Fuzzy Unprocessed
    preprocessed_Replay_csv_path = './Dataset/Attack_Logs/replay_attack.csv'  # Replay Unprocessed - Test
    preprocessed_Spoofing_csv_path = './Dataset/Attack_Logs/spoofing_attack.csv'  # Spoofing Unprocessed
    preprocessed_Suspension_csv_path = './Dataset/Attack_Logs/suspension_attack.csv'  # Suspension Unprocessed - Hardest Attack Type

    # Attack based on Channel
    preprocessed_DoS_channel2_csv_path = './Dataset/Attack_Logs/dos_attack_channel2.csv'  # DoS on channel 2 (Red Channel)
    preprocessed_Replay_channel2_csv_path = './Dataset/Attack_Logs/replay_attack_channel2.csv'  # Replay on channel 2 (Red Channel)
    preprocessed_Suspension_channel2_csv_path = './Dataset/Attack_Logs/suspension_attack_channel2.csv'  # Suspension on channel 2 (Red Channel)
    preprocessed_Spoofing_channel2_csv_path = './Dataset/Attack_Logs/spoofing_attack_channel2.csv'  # Spoofing on channel 2 (Red Channel)
    preprocessed_new_Spoofing_channel2_csv_path = './Dataset/Attack_Logs/new_spoofing_attack_channel2.csv'  # Spoofing on channel 2 (Red Channel)

    # Unprocessed Channel Data
    preprocessed_normal_channel0_csv_path = './Dataset/Channel_Logs/channel0Logs.csv'
    preprocessed_normal_channel2_csv_path = './Dataset/Channel_Logs/channel2Logs.csv'  # Red Channel
    preprocessed_normal_channel4_csv_path = './Dataset/Channel_Logs/channel4Logs.csv'  # Yellow Channel
    preprocessed_normal_channel5_csv_path = './Dataset/Channel_Logs/channel5Logs.csv'  # Green Channel

    # Preprocessed Dataframe Data
    processeddataframe_normal_csv_path = './Dataset/Processed_Dataframes/train_dataframe.csv'  # Normal CSV Dataframe (Turns Lists into Strings)
    processeddataframe_DoS_csv_path = './Dataset/Processed_Dataframes/test_DoS_dataframe.csv'  # DoS CSV Dataframe (Turns Lists into Strings)

    # Preprocessed Pickle Data
    processeddataframe_normal_pickle_path = './Dataset/Processed_Dataframes/train_Normal_dataframePickle.pkl'  # Normal Pickle Dataframe
    processeddataframe_DoS_pickle_path = './Dataset/Processed_Dataframes/test_DoS_dataframePickle.pkl'  # DoS Pickle Dataframe

    # Current best model
    best_model_path = './Code/Models/BEST_LSTM_VAE_LD30_Beta25_NT800000_21-37.keras'


# PRELOAD Dataframe for Debug.
# Defined outside the if/else so the flag exists in both AWS and local mode.
DEBUG = False


In [3]:
# Custom tsfresh settings, forwarded to feature_selection_preparation as
# `ts_fresh_custom_features` by the loading cell. Currently empty.
custom_fc_parameters = dict()

# The bare string below is a no-op note listing the features extracted so far.
"""  
    ALL Features extracted ATM, add parameters above in 'custom_fc_parameters':

    'iat', 'msg_frequency', 'rolling_mean_iat', 'rolling_std_iat', 
    'value__binned_entropy__max_bins_10', 'value__skewness', 
    'value__autocorrelation__lag_1', 'value__agg_autocorrelation__f_agg_"mean"__maxlag_5', 
    'value__agg_linear_trend__attr_"slope"__chunk_len_5__f_agg_"mean"', 
    'value__fft_coefficient__attr_"real"__coeff_1', 
    'value__approximate_entropy__m_2__r_0.1', 'value__binned_entropy__max_bins_10', 
    'value__benford_correlation'
"""

# Names of the selected tsfresh features, forwarded as `ts_fresh_parameters`
# by the loading cell. Currently empty.
tsfresh_features = list()

In [None]:
# --- Run switches -----------------------------------------------------------
LOAD_DATAFRAME = False
TS_FRESH = False  # forwarded to feature_selection_preparation as `ts_fresh`

# --- Loading / windowing settings --------------------------------------------
n_rows_train = 40000  # rows to load from the training log; None loads the whole training dataset
n_rows_test = 40000  # rows to load from the test log; None loads the whole test dataset
batch_size = 1024
window_size = 50  # increase window size
stride = 1  # increase stride as a buffer
split_ratio = 0.8  # fraction of the training data used for training
window_anomaly_ratio = 0.1  # for 1 anomaly per window use: 1 / window_size

# Keyword arguments shared by both preparation calls below.
_tsfresh_kwargs = dict(
    ts_fresh=TS_FRESH,
    ts_fresh_parameters=tsfresh_features,
    ts_fresh_custom_features=custom_fc_parameters,
)

# Training data: normal traffic on channel 2.
processeddataframe = feature_selection_preparation(
    preprocessed_normal_channel2_csv_path,
    'training',
    rows=n_rows_train,
    **_tsfresh_kwargs,
)

# Test data: replay attack on channel 2.
# To evaluate the new spoofing log instead, pass
# preprocessed_new_Spoofing_channel2_csv_path as the first argument.
processeddataframe_test = feature_selection_preparation(
    preprocessed_Replay_channel2_csv_path,
    'test',
    rows=n_rows_test,
    **_tsfresh_kwargs,
)



Checking original data types in ./Dataset/Attack_Logs/replay_attack_channel2.csv...
Original inferred data types:
  timestamp: float64
  arbitration_id: int64
  dlc: int64
  data: object
  type: object

Sample values for each column:
  timestamp: 6.435292 (type: float64)
  arbitration_id: 218068007 (type: int64)
  dlc: 8 (type: int64)
  data: fc ff ff ff 03 ff ff ff (type: str)
  type: R (type: str)
Raw type values: ['R' 'T']
Anomalies in 'type' column: 7812
False


In [None]:
# Peek at the first two feature vectors of the training and the test frame.
print(processeddataframe['features'].head(2))
print("------------------------------")
print(processeddataframe_test['features'].head(2))
print("------------------------------")

# Build `only_attack_df` in this cell. It was previously defined only in
# commented-out lines, so printing it raised NameError on a fresh kernel.
# NOTE(review): assumes `type == 1` marks attack rows in the test frame --
# confirm against feature_selection_preparation.
only_attack_df = processeddataframe_test.loc[processeddataframe_test['type'] == 1, 'features']
print(only_attack_df.head(2))

0                                                                       [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0.9882352941176471, 1.0, 1.0, 1.0, 0.011764705882352941, 1.0, 1.0, 1.0, 0.047619047619047616, 0.0, 0.353759374819711, 0.0, 0.0]
1    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0.047058823529411764, 0.9529411764705882, 0.027450980392156862, 1.0, 0.25098039215686274, 1.0, 1.0, 0.8117647058823529, 0.047619047619047616, 0.0, 0.8018796874098554, 0.0, 6.792544799433253e-06]
Name: features, dtype: object
------------------------------
0                                                                       [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0.9882352941176471, 1.0, 1.0, 1.0, 0.011764705882352941, 1.0, 1.0, 1.0, 0.023809523809523808, 0.0, 0.353759374819711, 0.0, 0.0]
1    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 

**Mean and Standard Deviation Differences Between the Normal and Attack Datasets**

In [46]:

# Expand each row's feature list into one column per feature index.
normal_features_df = pd.DataFrame(processeddataframe['features'].tolist())
attack_features_df = pd.DataFrame(processeddataframe_test['features'].tolist())

# Per-feature gap (normal minus attack) in mean and in standard deviation.
mean_diff = normal_features_df.mean().sub(attack_features_df.mean())
std_diff = normal_features_df.std().sub(attack_features_df.std())

# Collect both gaps in one table and rank features by the size of the mean gap.
diff_df = pd.DataFrame({'Mean Difference': mean_diff, 'Std Difference': std_diff})
diff_df['Abs Mean Difference'] = diff_df['Mean Difference'].abs()
diff_df = diff_df.sort_values(by='Abs Mean Difference', ascending=False)

# Report the ranking followed by a legend for the feature indices.
print(diff_df)
for _legend_line in (
    "\nFeature Index Mapping:",
    "0-28:  Arbitration ID",
    "29-36: Payload",
    "37:  Message Frequency",
    "38:  IaT",
    "39:  Payload Entropy",
    "40:  Hamming Distance",
    "41:  Timestamp",
):
    print(_legend_line)

    Mean Difference  Std Difference  Abs Mean Difference
13         0.081325        0.023871             0.081325
10         0.056200       -0.020340             0.056200
0          0.055250        0.013044             0.055250
15         0.051550       -0.001009             0.051550
12         0.051325       -0.000816             0.051325
31         0.050651       -0.009890             0.050651
23        -0.048325       -0.012046             0.048325
2         -0.047825        0.014771             0.047825
11         0.047400       -0.012843             0.047400
9          0.047375       -0.012835             0.047375
25         0.047225        0.015336             0.047225
36         0.039600        0.001375             0.039600
33         0.039537       -0.017090             0.039537
37         0.039206       -0.035456             0.039206
17         0.031550       -0.005631             0.031550
26        -0.029400       -0.002147             0.029400
39        -0.027200        0.00

In [47]:
# Expand each row's feature list into one column per feature index.
normal_features_df = pd.DataFrame(processeddataframe['features'].tolist())
attack_features_df = pd.DataFrame(processeddataframe_test['features'].tolist())

# To compare against attack rows only, build attack_features_df from
# only_attack_df instead:
#attack_features_df = pd.DataFrame(only_attack_df.tolist())

# Number of consecutive rows per comparison window. This uses its own name so
# it does not overwrite the `window_size` setting from the configuration cell.
COMPARISON_WINDOW_SIZE = 200

# Only full windows present in BOTH frames are compared; trailing rows are dropped.
num_windows = min(len(normal_features_df), len(attack_features_df)) // COMPARISON_WINDOW_SIZE

# Per-window mean difference (normal minus attack), one Series per window.
# Built in a comprehension so the whole-dataset `mean_diff` from the previous
# cell is not overwritten.
window_results = [
    normal_features_df.iloc[start:start + COMPARISON_WINDOW_SIZE].mean()
    - attack_features_df.iloc[start:start + COMPARISON_WINDOW_SIZE].mean()
    for start in range(0, num_windows * COMPARISON_WINDOW_SIZE, COMPARISON_WINDOW_SIZE)
]

# One row per window, one column per feature index.
window_diff_df = pd.DataFrame(window_results)

# Overall statistics: mean and standard deviation of the differences across windows.
final_diff_df = pd.DataFrame({
    'Mean Difference': window_diff_df.mean(),
    'Std of Difference': window_diff_df.std()
})

# Rank features by the size of the average per-window mean gap.
final_diff_df['Abs Mean Difference'] = final_diff_df['Mean Difference'].abs()
final_diff_df = final_diff_df.sort_values(by='Abs Mean Difference', ascending=False)

# Print results
print(final_diff_df)
print("\nFeature Index Mapping:")
print("0-28:  Arbitration ID")
print("29-36: Payload")
print("37:  Message Frequency")
print("38:  IaT")
print("39:  Payload Entropy")
print("40:  Hamming Distance")
print("41:  Timestamp")


    Mean Difference  Std of Difference  Abs Mean Difference
13         0.081325           0.027603             0.081325
10         0.056200           0.019174             0.056200
0          0.055250           0.024921             0.055250
15         0.051550           0.022586             0.051550
12         0.051325           0.018180             0.051325
31         0.050651           0.017634             0.050651
23        -0.048325           0.020344             0.048325
2         -0.047825           0.024677             0.047825
11         0.047400           0.024263             0.047400
9          0.047375           0.020993             0.047375
25         0.047225           0.020841             0.047225
36         0.039600           0.020629             0.039600
33         0.039537           0.015076             0.039537
37         0.039206           0.082649             0.039206
17         0.031550           0.018324             0.031550
26        -0.029400           0.019071  