# Feature Engineering

## Imports & Setup
This notebook imports the libraries and modules it needs by name from `utils.imports` (`from utils.imports import pd, TfidfVectorizer, nltk, stopwords`), the module that centralizes all dependencies required for training. See `utils/imports.py` for full details.


In [None]:
# Imports from utils.imports
from utils.imports import pd, TfidfVectorizer, nltk, stopwords

In [None]:
# Dictionary that collects the accuracy score of each model, keyed by model name.
# It starts out empty; e.g. once logistic regression has been evaluated,
# "model_accuracy['Logistic Regression'] = 92.45" adds its score as an entry.

model_accuracy = dict()

In [None]:
# Feature creation

## Derive 'Payload_Length' by taking len() of each row's 'Payload_Cleaned' value
## (the printed preview suggests these values are token lists, so this would be a token count — TODO confirm)
df3['Payload_Length'] = df3['Payload_Cleaned'].map(len)

## Preview the first five rows to confirm the new column was added
print(df3.head())


                                             Payload  SQLInjection  XSS  \
0  ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1iq...             1    0   
1  -5420'   union all select 2508 2508 2508 2508 ...             1    0   
2  -2857%'       union all select 7167 7167 7167 ...             1    0   
3  ssssssssssssssssssssssssssssssssssssssssssssss...             1    0   
4  j95utpnafk32s451w4kxzhahkqzs98irp97aesd5n68axy...             1    0   

   Normal                                     Payload_Tokens  \
0       0  [ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1i...   
1       0  [-5420, ', union, all, select, 2508, 2508, 250...   
2       0  [-2857, %, ', union, all, select, 7167, 7167, ...   
3       0  [sssssssssssssssssssssssssssssssssssssssssssss...   
4       0  [j95utpnafk32s451w4kxzhahkqzs98irp97aesd5n68ax...   

                                     Payload_Cleaned  Payload_Length  
0  [ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1i...              18  
1  [-5420, ', union, s

In [None]:
# Feature selection

# Keep only the rows whose 'Payload_Length' is greater than 3; payloads of length 3
# or less are dropped because they may be too short to have a meaningful impact.
df3 = df3.loc[df3['Payload_Length'].gt(3)]

In [None]:
# Largest and smallest value in the 'Payload_Length' column
longest_length, shortest_length = (
    df3['Payload_Length'].max(),
    df3['Payload_Length'].min(),
)

# Report both values, one per line
print(
    f"The longest payload length is: {longest_length}",
    f"The shortest payload length is: {shortest_length}",
    sep="\n",
)


The longest payload length is: 269
The shortest payload length is: 4


In [None]:
# Feature Extraction

# TF-IDF vectorization (helps prioritize unique terms).

# The feature cap is derived from the data rather than fixed at 10500; the stated
# reason is that a hard-coded cap would make a custom model always expect 10500
# features as input for X (i.e. the fitted, vectorized input/independent value).
# NOTE(review): the row count is read from df1, while the preceding cells operate on
# df3 — confirm that df1 is the intended frame.
sample_size = df1.shape[0]  # number of rows in df1
max_features = sample_size if sample_size < 10500 else 10500  # smaller of the row count and 10500

tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),         # unigrams and bigrams
    min_df=2,                   # ignore terms with low document frequency
    max_df=0.9,                 # ignore terms with very high document frequency
    max_features=max_features,  # cap computed above
    stop_words='english',       # can be swapped for custom stop words when necessary
)

# Note: tfidf_vectorizer is initialized with min_df=2 and max_df=0.9, and these filtering parameters are restrictive for a small (simulated) dataset:
#   min_df=2: a term must appear in at least 2 documents (e.g. web requests such as a URL, query, method input or Payload) to be included in the vocabulary (vectorization).
#   max_df=0.9: a term that appears in more than 90% of the documents is excluded.
# If no term satisfies these conditions, fitting the vectorizer will raise an error; in that case min_df can be changed to 1.
# On a large dataset, however, min_df=1 results in a very high-dimensional feature space, potentially containing many irrelevant or very rare terms, noise, overfitting, etc.