In [1]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

# Configure matplotlib for better visualization: select the 'seaborn-v0_8'
# style sheet once, up front, so the figures produced by later cells share it.
plt.style.use('seaborn-v0_8')

In [None]:

# Load the dataset.
# The matplotlib style is already configured in the first cell, so it is not
# repeated here.
# The CSV location defaults to the original local path but can be overridden
# with the PASAD_DATA_PATH environment variable, so the notebook can run on
# machines with a different directory layout without editing this cell.
DATA_PATH = os.environ.get(
    "PASAD_DATA_PATH",
    "/home/belief/Desktop/MalwareDetection/PASAD/PASAD_DATA/pasad/data/1 - Scenario DA1/xmv10_359_data_1.csv",
)
data = pd.read_csv(DATA_PATH)

# Display the first few rows of the dataset to understand the structure
print(data.head())
print(data.shape)


In [None]:

# Define PASAD parameters
N = 800  # Number of initial samples for training
TRAIN_START = 1600  # Row position of the first training measurement
L_values = [5, 10, 20, 24, 30, 48, 100, 150, 200]  # Different L values to test

# Split the data into training and testing sets.
# The training window is derived from N so that the slice length and N cannot
# drift apart; the trajectory-matrix construction later relies on the training
# series holding exactly N samples.
train_end = TRAIN_START + N
train_data = data.iloc[TRAIN_START:train_end, 1].to_numpy()  # Rows 1600..2399 of the column at position 1
test_data = data.iloc[train_end:, 1].to_numpy()              # Every remaining row is used for testing

# Fail early with a clear message if the dataset is too short, instead of
# letting a truncated slice surface later as a shape error.
if len(train_data) != N:
    raise ValueError(
        f"Expected {N} training samples in rows {TRAIN_START}-{train_end - 1}, "
        f"but the dataset only provides {len(train_data)}"
    )

print(f"Training data length: {len(train_data)}")
print(f"Testing data length: {len(test_data)}")


In [4]:

# Function to identify the elbow point for r selection
def find_elbow(singular_values):
    """Return how many leading components precede the sharpest drop.

    The elbow is taken to be the position of the most negative difference
    between consecutive singular values; the returned count includes the
    value just before that drop.

    Parameters
    ----------
    singular_values : array-like
        Singular values, expected in descending order (the order produced by
        ``np.linalg.svd``).

    Returns
    -------
    int
        1-based index of the elbow, i.e. the number of components to keep.
        A single singular value yields 1.

    Raises
    ------
    ValueError
        If ``singular_values`` is empty.
    """
    values = np.asarray(singular_values, dtype=float)

    if values.size == 0:
        raise ValueError("find_elbow requires at least one singular value")
    if values.size == 1:
        # No consecutive differences exist; the only component is the elbow.
        return 1

    diffs = np.diff(values)  # Difference between consecutive singular values
    # argmin locates the largest drop (most negative difference); +1 converts
    # the 0-based position into a 1-based component count.
    return int(np.argmin(diffs)) + 1


In [None]:

# Function to calculate departure score for a new test vector.
# Defined once, ahead of the loop, instead of being re-created for every L.
def compute_departure_score(test_vector, U_r, clusterMean):
    """Return the squared distance of a projected lagged vector from the centroid.

    Parameters
    ----------
    test_vector : array-like of length L
        One lagged vector taken from the series under test.
    U_r : ndarray of shape (L, r)
        The leading r left singular vectors spanning the signal subspace.
    clusterMean : ndarray of length r
        Mean of the projected training lagged vectors.

    Returns
    -------
    float
        Squared Euclidean norm of ``U_r.T @ test_vector - clusterMean``.
    """
    # Project the test vector onto the signal subspace
    projected_test_vector = U_r.T @ test_vector

    # Squared Euclidean distance from the clusterMean
    return np.linalg.norm(projected_test_vector - clusterMean)**2


# Loop through different values of L
train_series = np.asarray(train_data, dtype=float)
for L in L_values:
    # Number of lagged vectors, taken from the actual training length so the
    # trajectory matrix always matches the data that was sliced.
    K = len(train_series) - L + 1
    if K < 1:
        raise ValueError(
            f"Lag L={L} exceeds the training series length {len(train_series)}"
        )

    # Create the trajectory matrix for training data: column i holds
    # train_data[i:i + L], so entry (j, i) is train_data[i + j].
    lag_offsets = np.arange(L)[:, None]
    trajectory_matrix = train_series[lag_offsets + np.arange(K)]

    # Display the shape of the trajectory matrix
    print(f"Trajectory matrix shape for L={L}: {trajectory_matrix.shape}")

    # Perform Singular Value Decomposition on the trajectory matrix
    U, Sigma, VT = np.linalg.svd(trajectory_matrix, full_matrices=False)

    # Determine r using the elbow point detection method
    r = find_elbow(Sigma)

    # Plot the significance of each component (Scree plot)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(Sigma) + 1), Sigma, 'o-', label=f'Singular Values for L={L}')
    plt.axvline(x=r, color='red', linestyle='--', label=f'Selected r={r}')
    plt.title(f'Scree Plot - Significance of Each Component (L={L})')
    plt.xlabel('Component Index (r)')
    plt.ylabel('Singular Value (Significance)')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Use the top 'r' components based on singular values
    U_r = U[:, :r]

    # Display the shape of U and reduced U_r
    print(f"Shape of U: {U.shape}")
    print(f"Shape of reduced U_r for L={L}: {U_r.shape}")

    # Project the lagged training vectors onto the signal subspace
    projected_train_vectors = U_r.T @ trajectory_matrix

    # Compute the clusterMean of the projected vectors
    clusterMean = np.mean(projected_train_vectors, axis=1)

    # Display the clusterMean
    print(f"clusterMean of projected training vectors for L={L}: {clusterMean}")

    # Slide a window of length L over the test series and score each window
    test_departure_scores = np.array([
        compute_departure_score(test_data[i:i + L], U_r, clusterMean)
        for i in range(len(test_data) - L + 1)
    ])

    # Plot the raw test data
    plt.figure(figsize=(14, 6))
    plt.subplot(2, 1, 1)
    plt.plot(test_data, label='Raw Test Data')
    plt.title(f'Raw Test Data (L={L})')
    plt.xlabel('Time Index')
    plt.ylabel('Value')
    plt.legend()

    # Plot the departure scores
    plt.subplot(2, 1, 2)
    plt.plot(test_departure_scores, label='Departure Scores', color='red')
    # NOTE(review): the threshold (mean + 2 * std) is computed from the test
    # scores themselves, so it moves with whatever the test segment contains.
    # Confirm whether it should be derived from attack-free data instead.
    plt.axhline(y=np.mean(test_departure_scores) + 2 * np.std(test_departure_scores), color='green', linestyle='--', label='Threshold')
    plt.title(f'Departure Scores (L={L})')
    plt.xlabel('Time Index')
    plt.ylabel('Departure Score')
    plt.legend()
    plt.tight_layout()
    plt.show()
