# Model Training


## Imports & Setup
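This notebook imports the shared libraries it needs by name from `utils.imports` (for example `pd`, `np`, `SVC`, and the metric helpers), which centralizes the project's common dependencies. The remaining scikit-learn classifiers are imported in the cells that train them. See `utils/imports.py` for full details.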


In [None]:
# Imports from utils/imports.py
from utils.imports import (
    pd, np, os,
    train_test_split, SVC,
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    plt, sns
    )

### Support Vector Machine (SVM)

In [None]:
# Fit a linear-kernel SVM on the training data.
# probability=True is deliberately left at its default (False): stacking via
# predict_proba would require it, but it makes training take far longer, so this
# model is left out of the probability-based ensemble instead.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict labels for the test features
y_pred_svm = svm_model.predict(X_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: fit() returns the fitted estimator, so construct and train in one step
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the test features
y_pred_rf = rf_model.predict(X_test)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression with the iteration cap raised to 1000, constructed and
# trained in one step (fit() returns the fitted estimator)
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predict labels for the test features
y_pred_lr = lr_model.predict(X_test)

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting: construct and train in one step (fit() returns the fitted estimator)
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the test features
y_pred_gb = gb_model.predict(X_test)

In [None]:
# Map the Gradient Boosting predictions back onto the matching rows of df3.
# Use y_test's own index as the row labels: reset_index().index would replace
# them with a fresh 0..n-1 range and write the predictions onto the first
# len(y_test) rows of df3 instead of the test rows.
# NOTE(review): assumes y_test was split from df3 without resetting its index,
# so its labels are df3 row labels -- confirm in the cell that creates the split.
test_indices = y_test.index

# One prediction per test row is required for the positional assignment below
assert len(test_indices) == len(y_pred_gb), "y_test and y_pred_gb must have the same length"

# Rows that are not in test_indices are left as NaN in the new column
df3.loc[test_indices, 'y_pred_gb'] = y_pred_gb

# Verify by printing the first few rows of df3
print(df3.head())

                                             Payload  SQLInjection  XSS  \
0  ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1iq...             1    0   
1  -5420'   union all select 2508 2508 2508 2508 ...             1    0   
2  -2857%'       union all select 7167 7167 7167 ...             1    0   
3  ssssssssssssssssssssssssssssssssssssssssssssss...             1    0   
4  j95utpnafk32s451w4kxzhahkqzs98irp97aesd5n68axy...             1    0   

   Normal                                     Payload_Tokens  \
0       0  [ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1i...   
1       0  [-5420, ', union, all, select, 2508, 2508, 250...   
2       0  [-2857, %, ', union, all, select, 7167, 7167, ...   
3       0  [sssssssssssssssssssssssssssssssssssssssssssss...   
4       0  [j95utpnafk32s451w4kxzhahkqzs98irp97aesd5n68ax...   

                                     Payload_Cleaned  Payload_Length  Label  \
0  ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1iq...              18      0   
1  -54

### Multi-layer Perceptron (MLP) Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron with the iteration cap raised to 1000, constructed and
# trained in one step (fit() returns the fitted estimator)
nn_model = MLPClassifier(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predict labels for the test features
y_pred_nn = nn_model.predict(X_test)

In [None]:
# Map the MLP predictions back onto the matching rows of df3.
# Use y_test's own index as the row labels: reset_index().index would replace
# them with a fresh 0..n-1 range and write the predictions onto the first
# len(y_test) rows of df3 instead of the test rows.
# NOTE(review): assumes y_test was split from df3 without resetting its index,
# so its labels are df3 row labels -- confirm in the cell that creates the split.
test_indices = y_test.index

# One prediction per test row is required for the positional assignment below
assert len(test_indices) == len(y_pred_nn), "y_test and y_pred_nn must have the same length"

# Rows that are not in test_indices are left as NaN in the new column
df3.loc[test_indices, 'y_pred_nn'] = y_pred_nn

# Verify by printing the first few rows of df3
print(df3.head())

                                             Payload  SQLInjection  XSS  \
0  ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1iq...             1    0   
1  -5420'   union all select 2508 2508 2508 2508 ...             1    0   
2  -2857%'       union all select 7167 7167 7167 ...             1    0   
3  ssssssssssssssssssssssssssssssssssssssssssssss...             1    0   
4  j95utpnafk32s451w4kxzhahkqzs98irp97aesd5n68axy...             1    0   

   Normal                                     Payload_Tokens  \
0       0  [ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1i...   
1       0  [-5420, ', union, all, select, 2508, 2508, 250...   
2       0  [-2857, %, ', union, all, select, 7167, 7167, ...   
3       0  [sssssssssssssssssssssssssssssssssssssssssssss...   
4       0  [j95utpnafk32s451w4kxzhahkqzs98irp97aesd5n68ax...   

                                     Payload_Cleaned  Payload_Length  Label  \
0  ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1iq...              18      0   
1  -54

### Extra Trees (Extremely Randomized Trees)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra Trees: construct and train in one step (fit() returns the fitted estimator)
et_model = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the test features
y_pred_et = et_model.predict(X_test)

#### Mapping Predictions to DataFrame

In [None]:
# Map the Extra Trees predictions back onto the matching rows of df3.
# Use y_test's own index as the row labels: reset_index().index would replace
# them with a fresh 0..n-1 range and write the predictions onto the first
# len(y_test) rows of df3 instead of the test rows.
# NOTE(review): assumes y_test was split from df3 without resetting its index,
# so its labels are df3 row labels -- confirm in the cell that creates the split.
test_indices = y_test.index

# One prediction per test row is required for the positional assignment below
assert len(test_indices) == len(y_pred_et), "y_test and y_pred_et must have the same length"

# Rows that are not in test_indices are left as NaN in the new column
df3.loc[test_indices, 'y_pred_et'] = y_pred_et

# Verify by printing the first few rows of df3
print(df3.head())

                                             Payload  SQLInjection  XSS  \
0  ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1iq...             1    0   
1  -5420'   union all select 2508 2508 2508 2508 ...             1    0   
2  -2857%'       union all select 7167 7167 7167 ...             1    0   
3  ssssssssssssssssssssssssssssssssssssssssssssss...             1    0   
4  j95utpnafk32s451w4kxzhahkqzs98irp97aesd5n68axy...             1    0   

   Normal                                     Payload_Tokens  \
0       0  [ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1i...   
1       0  [-5420, ', union, all, select, 2508, 2508, 250...   
2       0  [-2857, %, ', union, all, select, 7167, 7167, ...   
3       0  [sssssssssssssssssssssssssssssssssssssssssssss...   
4       0  [j95utpnafk32s451w4kxzhahkqzs98irp97aesd5n68ax...   

                                     Payload_Cleaned  Payload_Length  Label  \
0  ghjv9ef1y69cd6i59ihp6u3rsihkkx4z40nkyoqsdam1iq...              18      0   
1  -54