# PCA with Pytorch

In [None]:
import pandas as pd
import torch



# read the whole training dataset
filePath_training = "./clean_datasets/new_split/training_dataset.csv"
training_dataset = pd.read_csv(filePath_training)

# drop the labels
X_training = training_dataset.drop(['attack_cat', "label"], axis=1)


# convert to tensor
X_training_tensor = torch.from_numpy(X_training.to_numpy())

# caclulate the mean of the dataset for each features (column-wise)
X_mean = torch.mean(X_training_tensor, dim=0)

# center the matrix
X_centered = X_training_tensor - X_mean

# number of data points:
N = X_centered.shape[0]

# compute the covariance matrix
covariance_matrix = (1/N)*(torch.mm(X_centered.t(), X_centered))

S, V = torch.linalg.eig(covariance_matrix)

S = S.real

# analyze the Egien values and drop the values that are less than 1 percent of the whole sum
sum_singular_values = torch.sum(S)
percentages_singular_values = 100*S / sum_singular_values
idx_features_to_keep = percentages_singular_values > 1

print(idx_features_to_keep)
print("\nFeatures to keep: ")
print(X_training.columns[idx_features_to_keep])

print("\nFeatures to drop: ")
print(X_training.columns[idx_features_to_keep==False])

print("\nPercentage of the sum of Eigen values that is kept is: "
      + str(sum(percentages_singular_values[idx_features_to_keep]).item()))


In [None]:

import pandas as pd
import torch



# read the whole training dataset
filePath_training = "./clean_datasets/new_split/training_dataset.csv"
training_dataset = pd.read_csv(filePath_training)

# drop the labels
X_training = training_dataset.drop(['attack_cat', "label"], axis=1)

# convert to tensor
X_training_tensor = torch.from_numpy(X_training.to_numpy())

# caclulate the mean of the dataset for each features (column-wise)
X_mean = torch.mean(X_training_tensor, dim=0)

# define the number of features in the datset
d_features = int(len(X_mean))

# center the matrix
X_centered = X_training_tensor - X_mean

# apply PCA. The function is expecting the matrix of the dataset (not the covariance)
U, S, V = torch.pca_lowrank(X_centered, q=d_features)

# analyze the Egien values and drop the values that are less than 1 percent of the whole sum
sum_singular_values = torch.sum(S)
percentages_singular_values = 100*S / sum_singular_values
idx_features_to_keep = percentages_singular_values > 1

print(idx_features_to_keep)
print("\nFeatures to keep: ")
print(X_training.columns[idx_features_to_keep])

print("\nFeatures to drop: ")
print(X_training.columns[idx_features_to_keep==False])

print("\nPercentage of the sum of Eigen values that is kept is: "
      + str(sum(percentages_singular_values[idx_features_to_keep]).item()))



tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False])

Features to keep: 
Index(['dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit',
       'djit', 'swin', 'stcpb'],
      dtype='object')

Features to drop: 
Index(['dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst',
       'is_sm_ips_ports'],
      dtype='object')

Percentage of the sum of Eigen values that is kept is: 91.91954211577877


# PCA with scikit-learn


In [None]:

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# read the whole training dataset
filePath_training = "./clean_datasets/new_split/training_dataset.csv"
training_dataset = pd.read_csv(filePath_training)

# drop the labels
X_training = training_dataset.drop(['attack_cat', "label"], axis=1)
# convert to numpy
X_training_numpy = X_training.to_numpy()

# apply PCA
pca = PCA(n_components=X_training_numpy.shape[1])
pca.fit(X_training_numpy)

# singular values
singular_values = pca.singular_values_

# analyze the Egien values and drop the values that are less than 1 percent of the whole sum
sum_singular_values = sum(singular_values)
percentages_singular_values = 100*singular_values / sum_singular_values
idx_features_to_keep = percentages_singular_values > 1

print(idx_features_to_keep)
print("\nFeatures to keep: ")
print(X_training.columns[idx_features_to_keep])

print("\nFeatures to drop: ")
print(X_training.columns[idx_features_to_keep==False])

print("\nPercentage of the sum of Eigen values that is kept is: "
      + str(sum(percentages_singular_values[idx_features_to_keep])))





[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True False False False False False
 False False False False False False False False False False False False
 False False False]

Features to keep: 
Index(['dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit',
       'djit', 'swin', 'stcpb'],
      dtype='object')

Features to drop: 
Index(['dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst',
       'is_sm_ips_ports'],
      dtype='object')

Percentage of the sum of Eigen values that is kept is: 91.9195421157784
