## Testing the Preprocessor Class

In this notebook, we will be testing the `Preprocessor` class to ensure it functions correctly with our dataset. The `Preprocessor` class is designed to preprocess data and prepare it for model prediction. Here are the steps we will follow:

1. **Load the Dataset**: We will start by loading the UNSW_NB15 training set.
2. **Sample Selection**: We will draw two sets of 10 random examples from the training set to use as our test samples: one from all rows (detection samples) and one restricted to the `Generic`, `Exploits`, `Fuzzers`, `DoS`, and `Reconnaissance` attack categories (classification samples).
3. **Data Cleaning**: We will remove the `attack_cat` and `label` columns from our test samples.
4. **Save Sample Data**: Each cleaned sample set will be saved to its own parquet file for further processing.
5. **Initialize Preprocessor**: We will create an instance of the `Preprocessor` class from the path to a pre-trained model (once for the detection model, once for the classification model).
6. **Preprocess Data**: Each sample set will be preprocessed using the `preprocess` method of the corresponding `Preprocessor` instance.



In [13]:
import pandas as pd
import sys
sys.path.append('../../')
import src.data.UNSW_NB15_preprocessor.Preprocessor as prep
import warnings

# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Read the UNSW_NB15 training set that the test samples are drawn from.
df = pd.read_parquet('../../data/UNSW_NB15_data/UNSW_NB15_training-set.parquet')

# Draw 10 rows with a fixed seed, then strip the 'attack_cat' and 'label'
# columns (errors='ignore' tolerates either column being absent).
df_sample = (
    df
    .sample(n=10, random_state=17)
    .drop(columns=['attack_cat', 'label'], errors='ignore')
)

# Persist the samples without the original row index.
df_sample.to_parquet('10_samples.parquet', index=False)

In [2]:
# Preview the first five rows of the sampled frame
df_sample.head(n=5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports
109275,1e-05,unas,-,INT,2,0,200,0,100000.0,80000000.0,...,100,0,0,0,3,3,0,0,0,0
160022,0.435792,tcp,-,FIN,10,6,2516,268,34.42009,41579.47,...,252,45,0,0,2,1,0,0,0,0
28600,1.004934,tcp,http,FIN,12,18,1580,10168,28.857618,11535.09,...,132,565,1,0,1,1,0,0,1,0
121557,58.899662,ospf,-,REQ,58,0,6264,0,0.967747,836.1339,...,108,0,0,0,1,1,0,0,0,0
16055,0.02142,tcp,smtp,FIN,52,42,37268,3380,4341.736816,13651540.0,...,717,80,0,0,2,1,0,0,0,0


In [3]:
# Restrict the training set to rows whose attack_cat is one of five attack
# categories, draw 10 of them with a fixed seed, and strip the 'attack_cat'
# and 'label' columns (errors='ignore' tolerates either column being absent).
df_sample = (
    df[df['attack_cat'].isin(['Generic', 'Exploits', 'Fuzzers', 'DoS', 'Reconnaissance'])]
    .sample(n=10, random_state=17)
    .drop(columns=['attack_cat', 'label'], errors='ignore')
)

# Persist the attack-category samples without the original row index.
df_sample.to_parquet('10_attack_cat_samples.parquet', index=False)

In [4]:
# Preview the first five rows of the attack-category sample
df_sample.head(n=5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports
163393,5e-06,udp,dns,INT,2,0,114,0,200000.0,91200000.0,...,57,0,0,0,14,14,0,0,0,0
120546,1.690659,tcp,-,FIN,60,16,68199,698,44.361401,317334.2,...,1137,44,1,0,1,1,0,0,1,0
155665,8e-06,udp,dns,INT,2,0,114,0,125000.0,57000000.0,...,57,0,0,0,18,18,0,0,0,0
99697,4e-06,iso-ip,-,INT,2,0,200,0,250000.0,200000000.0,...,100,0,0,0,2,2,0,0,0,0
139834,8e-06,udp,dns,INT,2,0,114,0,125000.0,57000000.0,...,57,0,0,0,11,11,0,0,0,0


In [21]:
# Paths to the data and model files
detection_data_path = "10_samples.parquet"
classficication_data_path = "10_attack_cat_samples.parquet"
detection_data_path = "../../models/UNSW_NB15_models/catboost_detection_model_94.5_Recall.cbm"
classification_data_path = "../../models/UNSW_NB15_models/catboost_classification_model_83_f1.cbm"

In [None]:
# Class Test
preprocessor = prep.Preprocessor(detection_data_path)

In [14]:
# Load the data
df_sample = pd.read_parquet(detection_data_path)
# Preprocess the data
df_sample_preprocessed = preprocessor.preprocess(df_sample)

In [15]:
# Preview the first five rows returned by preprocess()
df_sample_preprocessed.head(n=5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,djit,tcprtt,smean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_flw_http_mthd,is_sm_ips_ports
0,1e-05,unas,-,INT,2,0,5.303305,0.0,100000.0,18.197536,...,0.0,0.0,4.61512,0,0,3,3,0,0,0
1,0.435792,tcp,-,FIN,10,6,7.830823,5.594711,34.42009,10.635386,...,4.652568,0.110482,5.53339,0,0,2,1,0,0,0
2,1.004934,tcp,http,FIN,12,18,7.365813,9.227099,28.857618,9.353235,...,0.0,0.000633,4.890349,1,0,1,1,0,1,0
3,58.899662,ospf,-,REQ,58,0,8.742734,0.0,0.967747,6.729984,...,0.0,0.0,4.691348,0,0,1,1,0,0,0
4,0.02142,tcp,smtp,FIN,52,42,10.525917,8.125927,4341.736816,16.429363,...,0.637382,0.000671,6.576469,0,0,2,1,0,0,0


In [22]:
# Class Test
# Rebinds `preprocessor` to a new Preprocessor instance for the classification stage.
# NOTE(review): despite its name, `classification_data_path` is assigned a model
# file path (.cbm) where it is defined, not a data file path.
preprocessor = prep.Preprocessor(classification_data_path)

In [23]:
# Load the data
# NOTE(review): `classficication_data_path` (sic) is the spelling used where the
# variable is assigned; it holds the attack-category sample parquet file name.
df_sample = pd.read_parquet(classficication_data_path)

# Preprocess the data with the `preprocessor` instance currently bound
df_sample_preprocessed = preprocessor.preprocess(df_sample)

In [24]:
# Preview the first five rows returned by preprocess()
df_sample_preprocessed.head(n=5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,sload,sloss,...,is_ftp_login,ct_flw_http_mthd,Speed of Operations to Speed of Data Bytes,Time for a Single Process,Ratio of Data Flow,Ratio of Packet Flow,Total Page Errors,Network Usage,Network Activity Rate,Network Latency
0,5e-06,udp,dns,INT,2,0,4.744932,0.0,18.328566,0,...,0,0,27.762049,2e-06,0.0,0.0,0.0,4.744932,1.098612,1.098612
1,1.690659,tcp,-,FIN,60,16,11.130199,6.549651,12.667714,27,...,0,1,4.592149,0.027788,0.010183,0.236389,3.842626,11.140382,4.343805,3.806663
2,8e-06,udp,dns,INT,2,0,4.744932,0.0,17.858562,0,...,0,0,27.762049,4e-06,0.0,0.0,0.0,4.744932,1.098612,1.098612
3,4e-06,-,-,INT,2,0,5.303305,0.0,19.113829,0,...,0,0,28.324168,2e-06,0.0,0.0,0.0,5.303305,1.098612,1.098612
4,8e-06,udp,dns,INT,2,0,4.744932,0.0,17.858562,0,...,0,0,27.762049,4e-06,0.0,0.0,0.0,4.744932,1.098612,1.098612
