In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import urllib.parse
import numpy as np
import pickle

# List of bad words to check in the URL path
badwords = ['sleep', 'uid', 'select', 'waitfor', 'delay', 'system', 'union', 'order by', 'group by', 'admin', 'drop', 'script']

# Function to extract features from the URL path and body
def ExtractFeatures(path, body):
    path = str(path)
    body = str(body)
    combined_raw = path + body
    raw_percentages = combined_raw.count("%")
    raw_spaces = combined_raw.count(" ")

    # Check if both counts exceed the threshold
    raw_percentages_count = raw_percentages if raw_percentages > 3 else 0
    raw_spaces_count = raw_spaces if raw_spaces > 3 else 0

    # Decode the path and body for other feature extractions
    path_decoded = urllib.parse.unquote_plus(path)
    body_decoded = urllib.parse.unquote_plus(body)

    single_q = path_decoded.count("'") + body_decoded.count("'")
    double_q = path_decoded.count("\"") + body_decoded.count("\"")
    dashes = path_decoded.count("--") + body_decoded.count("--")
    braces = path_decoded.count("(") + body_decoded.count("(")
    spaces = path_decoded.count(" ") + body_decoded.count(" ")
    semicolons = path_decoded.count(";") + body_decoded.count(";")
    angle_brackets = path_decoded.count("<") + path_decoded.count(">") + body_decoded.count("<") + body_decoded.count(">")
    special_chars = sum(path_decoded.count(c) + body_decoded.count(c) for c in '$&|')

    badwords_count = sum(path_decoded.lower().count(word) + body_decoded.lower().count(word) for word in badwords)

    path_length = len(path_decoded)
    body_length = len(body_decoded)

    return [single_q, double_q, dashes, braces, spaces, raw_percentages_count, semicolons, angle_brackets, special_chars, path_length, body_length, badwords_count]

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import urllib.parse
import pickle

# Load dataset
http = pd.read_csv(r'C:\Users\PRATHAM\Documents\python project\All_data.csv')
missing_values = http.isna().sum()
print(missing_values)

# Check if the necessary columns exist
required_columns = ['path', 'body', 'class']
missing_columns = [col for col in required_columns if col not in http.columns]

if missing_columns:
    raise ValueError(f"Error: The dataset is missing the following columns: {', '.join(missing_columns)}")

# Handle missing values (fill with mean for numeric columns as an example)
# Note: Fill numeric columns only, if necessary
#http.fillna(http.mean(), inplace=True)

# Dummy badwords list for the example; replace it with actual bad words list
badwords = ['badword1', 'badword2']


# Extract features from the 'path' and 'body' columns
http['features'] = http.apply(lambda row: ExtractFeatures(row['path'], row['body']), axis=1)

# Prepare the feature matrix and the labels
X = np.array(http['features'].tolist())
y = http['class'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

filename = 'training_model.pkl'
pickle.dump(model, open(filename, 'wb'))

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

method              0
path                0
body              422
single_q            0
double_q            0
dashes              0
braces              0
spaces              0
percentages         0
semicolons          0
angle_brackets      0
special_chars       0
path_length         0
body_length         0
badwords_count      0
class               0
dtype: int64
Accuracy: 0.97
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        57
           1       1.00      0.93      0.96        43

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100



In [11]:
from http.server import SimpleHTTPRequestHandler, HTTPServer
from urllib import request, error, parse
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

badwords = ['sleep', 'uid', 'select', 'waitfor', 'delay', 'system', 'union', 'order by', 'group by', 'admin', 'drop', 'script']
# Define the ExtractFeatures function outside the class
def ExtractFeatures(path, body):
    path = str(path)
    body = str(body)
    combined_raw = path + body
    raw_percentages = combined_raw.count("%")
    raw_spaces = combined_raw.count(" ")

    # Check if both counts exceed the threshold
    raw_percentages_count = raw_percentages if raw_percentages > 3 else 0
    raw_spaces_count = raw_spaces if raw_spaces > 3 else 0

    # Decode the path and body for other feature extractions
    path_decoded = urllib.parse.unquote_plus(path)
    body_decoded = urllib.parse.unquote_plus(body)

    single_q = path_decoded.count("'") + body_decoded.count("'")
    double_q = path_decoded.count("\"") + body_decoded.count("\"")
    dashes = path_decoded.count("--") + body_decoded.count("--")
    braces = path_decoded.count("(") + body_decoded.count("(")
    spaces = path_decoded.count(" ") + body_decoded.count(" ")
    semicolons = path_decoded.count(";") + body_decoded.count(";")
    angle_brackets = path_decoded.count("<") + path_decoded.count(">") + body_decoded.count("<") + body_decoded.count(">")
    special_chars = sum(path_decoded.count(c) + body_decoded.count(c) for c in '$&|')

    badwords_count = sum(path_decoded.lower().count(word) + body_decoded.lower().count(word) for word in badwords)

    path_length = len(path_decoded)
    body_length = len(body_decoded)

    return [single_q, double_q, dashes, braces, spaces, raw_percentages_count, semicolons, angle_brackets, special_chars, path_length, body_length, badwords_count]

# Define the SimpleHTTPProxy class
class SimpleHTTPProxy(SimpleHTTPRequestHandler):
    proxy_routes = {}

    @classmethod
    def set_routes(cls, proxy_routes):
        cls.proxy_routes = proxy_routes

    def do_GET(self):
        parts = self.path.split('/')
        print(parts)
        if len(parts) > 3:
            path_part = parts[3]
            body = ""  # GET requests typically do not have a body
            live_data = ExtractFeatures(path_part, body)
            live_data = np.array(live_data).reshape(1, -1)  # Reshape for single prediction
            
            # Load the model inside the request handler
            with open('training_model.pkl', 'rb') as file:
                model = pickle.load(file)

            result = model.predict(live_data)  # Use the trained model for prediction
            print(result[0])
            if result[0] == 1:
                print('Intrusion Detected')
        
        if len(parts) >= 2:
            self.proxy_request('http://' + parts[2] + '/')
        else:
            super().do_GET()

    def proxy_request(self, url):
        try:
            response = request.urlopen(url)
        except error.HTTPError as e:
            print('err')
            self.send_response_only(e.code)
            self.end_headers()
            return
        self.send_response_only(response.status)
        for name, value in response.headers.items():
            self.send_header(name, value)
        self.end_headers()
        self.copyfile(response, self.wfile)

# Set up and start the server
SimpleHTTPProxy.set_routes({'proxy_route': 'http://demo.testfire.net/'})
with HTTPServer(('127.0.0.1', 8080), SimpleHTTPProxy) as httpd:  # Correct reference to HTTPServer
    host, port = httpd.socket.getsockname()
    print(f'Listening on http://{host}:{port}')
    try:
        httpd.serve_forever()  # Corrected from serveforever to serve_forever
    except KeyboardInterrupt:  # Corrected from keyboardInterrupt to KeyboardInterrupt
        print("\nKeyboard interrupt received, exiting.")

Listening on http://127.0.0.1:8080
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0


127.0.0.1 - - [27/May/2024 11:51:00] code 501, message Unsupported method ('CONNECT')
127.0.0.1 - - [27/May/2024 11:51:00] "CONNECT sb-ssl.google.com:443 HTTP/1.1" 501 -


['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'demo.testfire.net', 'search.jsp?query=1234+%27+AND+1%3D0+UNION+ALL+SELECT+%27admin%27%2C+%2781dc9bdb52d04dc20036dbd8313ed055']
1
Intrusion Detected
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0


127.0.0.1 - - [27/May/2024 11:51:16] code 501, message Unsupported method ('CONNECT')
127.0.0.1 - - [27/May/2024 11:51:16] "CONNECT services.addons.mozilla.org:443 HTTP/1.1" 501 -


['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0
['http:', '', 'detectportal.firefox.com', 'canonical.html']
0

Keyboard interrupt received, exiting.


In [6]:
testing_data = pd.read_csv(r'P:\WAF\Testing_data.csv')
if 'path' in testing_data.columns and 'body' in testing_data.columns:
        # Extract features from the testing data paths and bodies
    test_features = testing_data.apply(lambda row: ExtractFeatures(row['path'], row['body']), axis=1).tolist()

        # Convert test_features to a 2D numpy array
    test_features = np.array(test_features)

        # Predict whether each data point is good or bad
    predictions = model.predict(test_features)

        # Add the predictions to the testing data DataFrame
    testing_data['Prediction'] = predictions

        # Save the testing data with predictions to a new CSV file
    testing_data.to_csv(r'P:\WAF\Testing_result.csv', index=False)
    print("Predictions saved to testing_datas_with_predictions.csv")
else:
    print("Error: The testing data must contain 'path' and 'body' columns.")

Predictions saved to testing_datas_with_predictions.csv
