# Implementation of the deployment of Stacker6X and a test with simulated data

## Imports & Setup
This notebook imports the libraries it needs (`pd`, `np`, `os`, `joblib`, `nltk`, `TfidfVectorizer`) with an explicit `from utils.imports import ...` statement; `utils/imports.py` centralizes the dependencies required for the deployment simulation. See `utils/imports.py` for full details.


In [None]:
# Imports from utils/imports.py
from utils.imports import pd, np, os, joblib, nltk, TfidfVectorizer

In [None]:
import joblib

# Persist the trained Stacker6X ensemble to disk as Stacker6X_trained_model.pkl.
# `fusion_model` is the instance of the Stacker6X class; it is created outside this section.
# The destination is the absolute path below (a mounted Google Drive folder, judging by the path),
# not the current working directory. joblib.dump returns the list of file names it wrote.
joblib.dump(fusion_model, '/content/drive/MyDrive/Colab Notebooks/Stacker6X_trained_model.pkl')

['/content/drive/MyDrive/Colab Notebooks/Stacker6X_trained_model.pkl']

In [None]:
import os

# Sanity check that the model file was written. The path is absolute, so the result
# does not depend on the current working directory.
print(os.path.exists('/content/drive/MyDrive/Colab Notebooks/Stacker6X_trained_model.pkl'))  # Prints True if the file exists at this path


True


In [None]:
# Load the trained ensemble model back from the pickle file written earlier.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load files from a trusted source.
# NOTE(review): unpickling presumably requires the Stacker6X class to be importable in this session — confirm.
stacker6X_model_instance = joblib.load('/content/drive/MyDrive/Colab Notebooks/Stacker6X_trained_model.pkl')


In [None]:
# Seed NumPy's global RNG so the simulated dataset is reproducible
np.random.seed(42)

# Number of simulated rows
n_samples = 7752

# Build synthetic numeric features that describe a payload (they are not the payload itself).
# The draws must stay in this exact order: each call advances the seeded RNG, so
# reordering them would change the generated values.
data = {}
data['input_length'] = np.random.randint(20, 300, size=n_samples)  # Length of input
data['num_special_chars'] = np.random.randint(0, 50, size=n_samples)  # Special characters
data['num_keywords'] = np.random.randint(0, 10, size=n_samples)  # Keywords like SELECT, DROP, <script>
data['has_script_tag'] = np.random.choice([0, 1], size=n_samples)  # Presence of "<script>"
data['num_digits'] = np.random.randint(0, 30, size=n_samples)  # Number of digits in input
data['input_entropy'] = np.random.uniform(0.5, 4.0, size=n_samples)  # Entropy of input

# Target labels are assigned at random, independently of the feature values above
target_classes = ['SQLI', 'XSS', 'Normal']  # Target categories
class_probabilities = [0.3, 0.3, 0.4]  # Sampling weights, in the same order as target_classes
data['target'] = np.random.choice(target_classes, size=n_samples, p=class_probabilities)

# Assemble the DataFrame; columns follow the insertion order of `data`
df_simulated = pd.DataFrame(data)

# Preview the first few rows of the simulated dataset
print(df_simulated.head())


   input_length  num_special_chars  num_keywords  has_script_tag  num_digits  \
0           122                 38             2               1          17   
1           290                 27             1               1          28   
2           126                 29             7               0          11   
3            91                 36             5               1           2   
4           208                  2             6               1          20   

   input_entropy  target  
0       3.317920     XSS  
1       3.460554     XSS  
2       2.734520  Normal  
3       3.194492    SQLI  
4       1.101423    SQLI  


In [None]:
# Print how many rows were assigned to each class ('SQLI', 'XSS', 'Normal') in the dataset
print(df_simulated['target'].value_counts())

target
Normal    3053
XSS       2354
SQLI      2345
Name: count, dtype: int64


In [None]:
# Define a function that returns a synthetic payload string for the assigned target class (simulation only): each target class is mapped to one hardcoded example payload text.

def generate_payload(target) -> str:
    """Return a fixed example payload string for the given target class.

    Intended to be applied element-wise to the 'target' column of a
    DataFrame so the returned strings can be stored in a new 'payload'
    column.

    Args:
        target: Class label. 'SQLI' and 'XSS' select the attack samples;
            any other value (including 'Normal') selects the benign message.

    Returns:
        A SQL-injection sample for 'SQLI', a script-tag sample for 'XSS',
        and a benign help-request sentence otherwise.
    """
    if target == 'SQLI':
        return "SELECT * FROM users WHERE id='1' OR '1'='1'; --"
    if target == 'XSS':
        return "<script>alert('XSS')</script>"
    # Everything that is not an attack class falls back to the benign text.
    return "Hello, I need help with my account."

# Derive the 'payload' text column by mapping every 'target' label through generate_payload
df_simulated['payload'] = df_simulated['target'].map(generate_payload)

# Print the dataset so the 'SQLI', 'XSS' and 'Normal' payloads can be inspected
print(df_simulated)

      input_length  num_special_chars  num_keywords  has_script_tag  \
0              122                 38             2               1   
1              290                 27             1               1   
2              126                 29             7               0   
3               91                 36             5               1   
4              208                  2             6               1   
...            ...                ...           ...             ...   
7747            77                  8             0               1   
7748           124                 48             7               0   
7749           285                 17             8               1   
7750            29                 42             6               0   
7751           272                 11             2               0   

      num_digits  input_entropy  target  \
0             17       3.317920     XSS   
1             28       3.460554     XSS   
2             11  

In [None]:
# df_simulated already carries the 'payload' and 'target' columns from the previous steps
# and is the frame used for the prediction process.
# Encode each class name as its integer code: SQLI -> 0, XSS -> 1, Normal -> 2.
class_to_numeric = {
    label: code
    for code, label in enumerate(("SQLI", "XSS", "Normal"))
}
numeric_targets = df_simulated['target'].map(class_to_numeric)
df_simulated['numerical_label'] = numeric_targets

# Rich preview of the first rows, now including the numeric label
display(df_simulated.head())

Unnamed: 0,input_length,num_special_chars,num_keywords,has_script_tag,num_digits,input_entropy,target,payload,numerical_label
0,122,38,2,1,17,3.31792,XSS,<script>alert('XSS')</script>,1
1,290,27,1,1,28,3.460554,XSS,<script>alert('XSS')</script>,1
2,126,29,7,0,11,2.73452,Normal,"Hello, I need help with my account.",2
3,91,36,5,1,2,3.194492,SQLI,SELECT * FROM users WHERE id='1' OR '1'='1'; --,0
4,208,2,6,1,20,1.101423,SQLI,SELECT * FROM users WHERE id='1' OR '1'='1'; --,0


In [None]:
import joblib

# Load the fitted TF-IDF vectorizer from its pickle file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load files from a trusted source.
loaded_tfidf_vectorizer = joblib.load('/content/drive/MyDrive/Colab Notebooks/tfidf_vectorizer.pkl')

# loaded_tfidf_vectorizer can now be used to transform new data
# (this line is only reached if joblib.load did not raise)
print("TF-IDF vectorizer loaded successfully.")

TF-IDF vectorizer loaded successfully.


In [None]:
import os

# Check that the vectorizer pickle exists. The path is absolute, so the result
# does not depend on the current working directory.
print(os.path.exists('/content/drive/MyDrive/Colab Notebooks/tfidf_vectorizer.pkl'))  # Prints True if the file exists at this path

True


In [None]:
# Use the loaded vectorizer to transform the 'payload' column of the simulated data (df_simulated).
# Only transform() is called (no fitting here), so the features come from the already-fitted vectorizer.
# This prepares the simulated text data for input into the loaded Stacker6X model.
X_simulated_vectorized = loaded_tfidf_vectorizer.transform(df_simulated['payload'])

# Report the shape of the vectorized matrix — presumably (rows, TF-IDF features); confirm against the vectorizer
print(f"Shape of vectorized simulated data: {X_simulated_vectorized.shape}")

Shape of vectorized simulated data: (7752, 7752)


In [None]:
# Use the loaded Stacker6X model instance to predict a class for every row of the vectorized simulated data.
# The predictions are numeric class codes; they are translated to class names in a later step.
predictions_on_simulated_data_numeric = stacker6X_model_instance.predict(X_simulated_vectorized)

print("Predictions on simulated data (numeric):")
print(predictions_on_simulated_data_numeric[:10])  # Print only the first 10 predictions

Predictions on simulated data (numeric):
[1 1 0 0 0 0 0 1 0 0]


In [None]:
# Translate numeric predictions back to class names (0: SQLInjection, 1: XSS, 2: Normal)
# NOTE(review): class 0 is named "SQLInjection" here while the 'target' column uses "SQLI",
# so 'target' and 'Predicted_Label' cannot be compared by string equality as-is;
# compare 'numerical_label' against the numeric predictions instead.
class_mapping = dict(enumerate(("SQLInjection", "XSS", "Normal")))
predicted_labels_simulated_data = list(
    map(class_mapping.__getitem__, predictions_on_simulated_data_numeric)
)

# Store the predicted class names as a new column of df_simulated
df_simulated['Predicted_Label'] = predicted_labels_simulated_data

# Show payload text, true class and predicted class side by side for the first rows
print("Simulated Data with Predictions:")
display(df_simulated.loc[:, ['payload', 'target', 'Predicted_Label']].head())

Simulated Data with Predictions:


Unnamed: 0,payload,target,Predicted_Label
0,<script>alert('XSS')</script>,XSS,XSS
1,<script>alert('XSS')</script>,XSS,XSS
2,"Hello, I need help with my account.",Normal,SQLInjection
3,SELECT * FROM users WHERE id='1' OR '1'='1'; --,SQLI,SQLInjection
4,SELECT * FROM users WHERE id='1' OR '1'='1'; --,SQLI,SQLInjection


In [None]:
# Display the full simulated DataFrame, including the added 'payload', 'numerical_label' and 'Predicted_Label' columns
df_simulated

Unnamed: 0,input_length,num_special_chars,num_keywords,has_script_tag,num_digits,input_entropy,target,payload,numerical_label,Predicted_Label
0,122,38,2,1,17,3.317920,XSS,<script>alert('XSS')</script>,1,XSS
1,290,27,1,1,28,3.460554,XSS,<script>alert('XSS')</script>,1,XSS
2,126,29,7,0,11,2.734520,Normal,"Hello, I need help with my account.",2,SQLInjection
3,91,36,5,1,2,3.194492,SQLI,SELECT * FROM users WHERE id='1' OR '1'='1'; --,0,SQLInjection
4,208,2,6,1,20,1.101423,SQLI,SELECT * FROM users WHERE id='1' OR '1'='1'; --,0,SQLInjection
...,...,...,...,...,...,...,...,...,...,...
7747,77,8,0,1,17,2.037650,XSS,<script>alert('XSS')</script>,1,XSS
7748,124,48,7,0,1,0.710412,SQLI,SELECT * FROM users WHERE id='1' OR '1'='1'; --,0,SQLInjection
7749,285,17,8,1,18,1.108665,XSS,<script>alert('XSS')</script>,1,XSS
7750,29,42,6,0,23,3.047568,XSS,<script>alert('XSS')</script>,1,XSS
