In [17]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Read the XSS dataset. Rows the CSV parser cannot handle are skipped, then
# rows with any missing value and exact duplicate rows are removed.
data = (
    pd.read_csv('modified_xss_dataset.csv', encoding='utf-8', on_bad_lines='skip')
    .dropna()
    .drop_duplicates()
)

# Only one dataset is used here, so it is taken as-is; the index is rebuilt
# so it is contiguous again after the row filtering above.
Complete_new_data = data.reset_index(drop=True)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    Complete_new_data['Query'],
    Complete_new_data['Label'],
    test_size=0.3,
    random_state=42,
)

import re

# Regex alternatives that mark XSS-looking fragments. They are joined into one
# alternation, so at any position the first alternative that matches wins.
_XSS_PATTERNS = (
    r'<script[^>]*?>.*?</script>',              # <script> tags
    r'javascript\s*:',                          # "javascript:" syntax
    r'on\w+\s*=',                               # Inline event handlers (e.g., "onclick=")
    r'<img[^>]*?on\w+\s*=\s*["\'].*?["\']',     # <img> tags with event handlers
    r'<iframe[^>]*?>.*?</iframe>',              # <iframe> tags
    r'<svg[^>]*?>.*?</svg>',                    # <svg> tags
    r'<a[^>]*?href\s*=\s*["\']javascript:.*?["\']', # <a> tags with "javascript:" in href
    r'<object.*?>.*?</object>',                 # <object> tags
    r'<embed.*?>.*?</embed>',                   # <embed> tags
    r'<form.*?>.*?</form>',                     # <form> tags
    r'&#[xX]?[0-9a-fA-F]+;',                    # HTML entity encoding
    r'alert\s*\(.*?\)',                         # "alert()" function
    r'<.*?alert\(.+?\);.*?>',                   # "<...alert();...>" syntax
)

# Compiled once when the cell runs instead of being rebuilt on every call.
_XSS_REGEX = re.compile('|'.join(_XSS_PATTERNS))

# Characters that are always emitted as their own single-character token.
_XSS_DELIMITERS = frozenset('\'";=<>')


def xss_tokenizer(payload):
    """Split a payload string into tokens, keeping XSS-like fragments whole.

    The payload is scanned left to right. At each position the combined XSS
    regex is tried first; a match becomes one token and scanning resumes
    right after it. Otherwise quote, ';', '=', '<' and '>' characters become
    single-character tokens, whitespace ends the current token and is
    dropped, and every other character is appended to the current token.

    Matching is case-sensitive and '.' does not cross newlines (no regex
    flags are set). When used through TfidfVectorizer with its default
    settings, documents are lowercased before this function is called.

    Args:
        payload: The text to tokenize (a str).

    Returns:
        A list of token strings in order of appearance; empty for ''.
    """
    tokens = []
    pending = []  # characters of the ordinary token currently being built

    def flush():
        # Emit the token under construction, if any, and start a new one.
        if pending:
            tokens.append(''.join(pending))
            pending.clear()

    i = 0
    length = len(payload)
    while i < length:
        # Match in place at offset i; this avoids copying payload[i:] at every
        # position. No pattern uses anchors, \b or lookbehind, so the result
        # is the same as matching against the slice.
        match = _XSS_REGEX.match(payload, i)
        if match:
            flush()
            tokens.append(match.group(0))
            i = match.end()  # Move past the matched pattern
            continue

        char = payload[i]
        if char in _XSS_DELIMITERS:
            flush()
            tokens.append(char)
        elif char.isspace():
            flush()
        else:
            pending.append(char)
        i += 1

    # Emit whatever is left once the payload is exhausted.
    flush()
    return tokens

# Vectorization with custom tokenization and N-grams (1 to 3 grams).
# token_pattern=None: the default token_pattern is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn warn about the
# unused parameter when fitting. Setting it to None states the intent and
# does not change the resulting features.
vectorizer = TfidfVectorizer(
    tokenizer=xss_tokenizer,
    token_pattern=None,
    ngram_range=(1, 3),
)

# Fit the TF-IDF vocabulary/weights on the training queries only, then reuse
# that fitted transform for the held-out queries.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model training using Logistic Regression (library defaults)
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions on the held-out split
y_pred = model.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))





Accuracy: 0.9884075655887736
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98      1084
           2       0.99      1.00      0.99      2194

    accuracy                           0.99      3278
   macro avg       0.99      0.98      0.99      3278
weighted avg       0.99      0.99      0.99      3278



In [18]:
import joblib

# Persist the fitted classifier.
joblib.dump(value=model, filename='logistic_regression_xss.pkl')

# Persist the fitted vectorizer too: new text has to go through the exact same
# transformation before it can be scored by the model.
# NOTE: the vectorizer was built with tokenizer=xss_tokenizer, and pickle stores
# functions by name, so xss_tokenizer must be defined wherever this file is loaded.
joblib.dump(value=vectorizer, filename='tfidf_vectorizer_xss.pkl')

print("Model and vectorizer saved!")


Model and vectorizer saved!


In [16]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score

# Load the saved model and vectorizer.
# NOTE: the pickled vectorizer refers to the custom tokenizer by name, so
# xss_tokenizer must already be defined in this session before loading.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load artifacts you produced yourself.
model = joblib.load('logistic_regression_xss.pkl')
vectorizer = joblib.load('tfidf_vectorizer_xss.pkl')

# Label scheme used by the dataset and by the hand-written examples below.
SAFE_LABEL = 0
XSS_LABEL = 2

xss_data = {
    'Query': [
        "INSERT INTO comments (username, comment) VALUES ('admin', '<script>alert('Hi')</script>')", # XSS
        "<img src='invalid.jpg' onerror='alert(1)'/>", # XSS
        "<b>Welcome, user!</b>",                      # Safe HTML
        "<iframe src='http://malicious-site.com'></iframe>", # XSS
        "<div onclick='alert(\"Clicked!\")'>Click me</div>", # XSS
        "<p>This is a safe paragraph.</p>",           # Safe HTML
        "<a href='javascript:alert(1)'>Click here</a>", # XSS
        "<input type='text' value='safe input'/>",    # Safe HTML
        "<svg onload='alert(document.cookie)'></svg>", # XSS
        "Hello, welcome to our site!",                # Plain text, safe
    ],
    'Label': [2, 2, 0, 2, 2, 0, 2, 0, 2, 0]
}

# Convert the dictionary into a DataFrame
xss_test_data = pd.DataFrame(xss_data)

# Tokenize and vectorize the queries using the preloaded vectorizer
xss_test_queries = xss_test_data['Query']
xss_test_queries_tfidf = vectorizer.transform(xss_test_queries)

# Apply the model with a probability threshold.
# predict_proba columns follow the order of model.classes_, so look up the
# column of the XSS label instead of assuming it is column 1.
probability_threshold = 0.7  # Experiment with this value
xss_column = list(model.classes_).index(XSS_LABEL)
xss_probabilities = model.predict_proba(xss_test_queries_tfidf)[:, xss_column]
xss_predictions = (xss_probabilities >= probability_threshold).astype(int)

# Adjust the predictions to match the label scheme (0 for safe, 2 for XSS)
xss_predictions_adjusted = [XSS_LABEL if pred == 1 else SAFE_LABEL for pred in xss_predictions]

# Assign predictions to DataFrame and calculate accuracy
xss_test_data['Prediction'] = xss_predictions_adjusted
xss_accuracy = accuracy_score(xss_test_data['Label'], xss_predictions_adjusted) * 100

print(f"XSS Detection Model Accuracy with Threshold Adjustment: {xss_accuracy:.2f}%")
print(xss_test_data[['Query', 'Label', 'Prediction']])


XSS Detection Model Accuracy with Threshold Adjustment: 60.00%
                                               Query  Label  Prediction
0  INSERT INTO comments (username, comment) VALUE...      2           2
1        <img src='invalid.jpg' onerror='alert(1)'/>      2           0
2                              <b>Welcome, user!</b>      0           0
3  <iframe src='http://malicious-site.com'></iframe>      2           2
4    <div onclick='alert("Clicked!")'>Click me</div>      2           0
5                   <p>This is a safe paragraph.</p>      0           0
6       <a href='javascript:alert(1)'>Click here</a>      2           0
7            <input type='text' value='safe input'/>      0           2
8        <svg onload='alert(document.cookie)'></svg>      2           2
9                        Hello, welcome to our site!      0           0
