In [1]:
# feature_extractor.py
import urllib.parse
import pandas as pd

badwords = [
    'sleep','drop','uid','select','waitfor','delay','system',
    'union','order by','group by','insert','update','delete',
    'benchmark','and 1=1','or 1=1','--','#'
]

def ExtractFeatures(method: str, path: str, body: str = "") -> pd.DataFrame:
    """
    Return a single-row DataFrame with the numeric features in the same order as training.
    """
    # combine path and body for checking (decode percent-encoding first)
    path = urllib.parse.unquote(path or "")
    body = urllib.parse.unquote(body or "")

    combined = path + " " + body

    single_q = combined.count("'")
    double_q = combined.count('"')
    dashes   = combined.count("--")
    braces   = combined.count("(")
    spaces   = combined.count(" ")
    badwords_count = sum(combined.lower().count(w) for w in badwords)

    return pd.DataFrame([[single_q, double_q, dashes, braces, spaces, badwords_count]],
                        columns=["single_q","double_q","dashes","braces","spaces","badwords"])


In [2]:
# train_rf.py
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# load dataset
df = pd.read_csv("sample.csv")   # replace with your training CSV path

# features & target
features = ['single_q','double_q','dashes','braces','spaces','badwords']
X = df[features]
y = df['class']   # 1 = normal, 0 = malicious (as assumed)

# split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# model (balanced to handle class imbalance)
model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# eval
pred = model.predict(X_val)
print("Classification report:")
print(classification_report(y_val, pred, digits=4))
print("Confusion matrix:")
print(confusion_matrix(y_val, pred))
print("Accuracy:", accuracy_score(y_val, pred))

# save
joblib.dump(model, "rf_model1.pkl")
print("Saved rf_model.pkl")


Classification report:
              precision    recall  f1-score   support

           0     0.9806    0.9763    0.9784       674
           1     0.9880    0.9902    0.9891      1326

    accuracy                         0.9855      2000
   macro avg     0.9843    0.9832    0.9838      2000
weighted avg     0.9855    0.9855    0.9855      2000

Confusion matrix:
[[ 658   16]
 [  13 1313]]
Accuracy: 0.9855
Saved rf_model.pkl


In [8]:
# predict_batch.py
import pandas as pd
import joblib
# from feature_extractor import ExtractFeatures

model = joblib.load("rf_model.pkl")

test_df = pd.read_csv("test.csv")
features = ['single_q','double_q','dashes','braces','spaces','badwords']
# assume test.csv already has those feature columns; if not, use ExtractFeatures on path/body

X_test = test_df[features]
# print(X_test)
test_df['pred'] = model.predict(X_test)

test_df['prob_malicious'] = model.predict_proba(X_test)[:, 0]   # prob of class 0 if 0 = malicious

print(test_df)

test_df.to_csv("test_result.csv", index=False)
# print(test_df.head())


     method                                               path  \
0      POST                                /products/view?id=1   
1       GET  /languages/english?q=%3Cimg+src%3Dx+onerror%3D...   
2      POST                                     /locations/usa   
3      POST                /tickets?q=1%3B+DROP+TABLE+users%3B   
4      POST                                            /crypto   
...     ...                                                ...   
9995    GET                                  /search?q=laptops   
9996   POST                                     /travel/hotels   
9997    GET                                  /api/v3/dashboard   
9998    GET                                 /auth/verify-email   
9999    GET                    /page/2?q=%22+OR+%221%22%3D%221   

                               body  single_q  double_q  dashes  braces  \
0              uname=test&pass=test         0         0       0       0   
1                               NaN         0         0  

In [20]:
# proxy_with_rf.py
from http.server import SimpleHTTPRequestHandler, HTTPServer
from urllib import request, error
import urllib.parse
import joblib
import sys

model = joblib.load("rf_model.pkl")

THRESHOLD = 0.25   # optional: use model.predict_proba and threshold, or use model.predict directly

class SimpleHTTPProxy(SimpleHTTPRequestHandler):
    proxy_routes ={}

    @classmethod
    def set_routes(cls, proxy_routes):
        cls.proxy_routes = proxy_routes
        
    def do_GET(self):
        parts = self.path.split("/")

        # sample => ['http:', '', 'demo.testfire.net', 'search.jsp?query=test']
        print(parts)

        #url_path => 'search.jsp?query=test'
        if len(parts) > 3:
            url_path = parts[3]
        else:
            url_path = ""

        # ExtractFeatures accept method, url_path and body as parameter
        features_df = ExtractFeatures("GET", url_path, "")

                     # single_q  double_q  dashes  braces  spaces  badwords
#0[key of dataframe]        0         0       0       0       1         0
        print(features_df) # it's a pandas dataframe of all six parameters count data

        # predict 0 or 1 right now not accurate 
        pred = model.predict(features_df)
        # probability accurate
        prob = model.predict_proba(features_df)[:, 0] 

        # # Interpret: 0=malicious, 1=normal (based on earlier assumption)
        if prob > THRESHOLD:
            print("ðŸš¨ Intrusion Detected (pred==0) â€” blocking or logging recommended")
            print("probabilities:", prob)
            # Optionally: block or modify response. For now, just log.

        # # Forward the request normally
        if len(parts) >= 3:
            self.proxy_request('http://' + parts[2] + '/')
        else:
            super().do_GET()

    def do_POST(self):
        # Parse target
        parts = self.path.split("/")
        print(parts) # => ['http:', '', 'demo.testfire.net', 'doLogin']
        target_host = parts[2] if len(parts) >= 3 else None # => 'demo.testfire.net'
        url_path = parts[3] if len(parts) > 3 else "" # => '/doLogin'

        # Read POST Body
        content_len = int(self.headers.get('Content-Length', 0)) # 35
        post_body = self.rfile.read(content_len).decode("utf-8") # uid=test&passw=test&btnSubmit=Login

        # # Extract features
        features_df = ExtractFeatures("POST", url_path, post_body)
        print("[FEATURES]\n", features_df)

        pred = model.predict(features_df)
        prob = model.predict_proba(features_df)[:, 0]
        print("[PREDICTION]", pred, prob)

        # # Detection alert
        if prob > THRESHOLD:
            print("ðŸš¨ Intrusion Detected in POST request")
            print("Blocked probability:", prob)

        # Forward POST request
        if target_host:
            self.proxy_request_post('http://' + target_host + '/', post_body)
        else:
            self.send_error(400, "Bad proxy request")

    def proxy_request(self, url):
        try:
            response = request.urlopen(url)
        except error.HTTPError as e:
            self.send_response_only(e.code)
            self.end_headers()
            return
        self.send_response_only(response.status)
        for name, value in response.headers.items():
            self.send_header(name, value)
        self.end_headers()
        self.copyfile(response, self.wfile)

    def proxy_request_post(self, url, body):
        req = request.Request(url, data=body.encode("utf-8"), method="POST")

        # Copy all headers except host
        for key in self.headers:
            if key.lower() != "host":
                req.add_header(key, self.headers[key])

        try:
            response = request.urlopen(req)
        except error.HTTPError as e:
            self.send_response_only(e.code)
            self.end_headers()
            return

        # Send response code
        self.send_response_only(response.status)

        # Forward headers
        for name, value in response.headers.items():
            self.send_header(name, value)
        self.end_headers()

        # Forward body
        self.copyfile(response, self.wfile)


SimpleHTTPProxy.set_routes({'proxy_route':'http://demo.testfire.net/'})
with HTTPServer(('127.0.0.1',8080),SimpleHTTPProxy) as httpd:
    host, port = httpd.socket.getsockname()
    print(f'listining http://{host}:{port}')
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        print("Keyboard interrupt")

listining http://127.0.0.1:8080
['http:', '', 'demo.testfire.net', 'doLogin']
[FEATURES]
    single_q  double_q  dashes  braces  spaces  badwords
0         0         0       0       0       1         1
[PREDICTION] [1] [0.00413185]
['http:', '', 'demo.testfire.net', 'doLogin']
[FEATURES]
    single_q  double_q  dashes  braces  spaces  badwords
0         0         2       0       1       1         1
[PREDICTION] [0] [0.63913185]
ðŸš¨ Intrusion Detected in POST request
Blocked probability: [0.63913185]
['http:', '', 'demo.testfire.net', 'doLogin']
[FEATURES]
    single_q  double_q  dashes  braces  spaces  badwords
0         2         0       0       0       1         1
[PREDICTION] [0] [0.54]
ðŸš¨ Intrusion Detected in POST request
Blocked probability: [0.54]


127.0.0.1 - - [18/Nov/2025 16:04:52] Request timed out: TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None)
127.0.0.1 - - [18/Nov/2025 16:04:52] code 501, message Unsupported method ('CONNECT')
127.0.0.1 - - [18/Nov/2025 16:04:52] "CONNECT execution.metamask.io:443 HTTP/1.1" 501 -
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 57020)
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
    ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
    ~~~~~~

['http:', '', 'demo.testfire.net', 'search.jsp?query=%3Cscript%3Ealert%28%22fghf%22%29%3C%2Fscript%3E']
   single_q  double_q  dashes  braces  spaces  badwords
0         0         2       0       1       1         0
ðŸš¨ Intrusion Detected (pred==0) â€” blocking or logging recommended
probabilities: [0.285]


127.0.0.1 - - [18/Nov/2025 16:05:20] code 501, message Unsupported method ('CONNECT')
127.0.0.1 - - [18/Nov/2025 16:05:20] "CONNECT execution.metamask.io:443 HTTP/1.1" 501 -
127.0.0.1 - - [18/Nov/2025 16:06:35] code 501, message Unsupported method ('CONNECT')
127.0.0.1 - - [18/Nov/2025 16:06:35] "CONNECT execution.metamask.io:443 HTTP/1.1" 501 -
127.0.0.1 - - [18/Nov/2025 16:07:31] code 501, message Unsupported method ('CONNECT')
127.0.0.1 - - [18/Nov/2025 16:07:31] "CONNECT profile.accounts.firefox.com:443 HTTP/1.1" 501 -
127.0.0.1 - - [18/Nov/2025 16:07:31] code 501, message Unsupported method ('CONNECT')
127.0.0.1 - - [18/Nov/2025 16:07:31] "CONNECT sync-1-us-west1-g.sync.services.mozilla.com:443 HTTP/1.1" 501 -
127.0.0.1 - - [18/Nov/2025 16:07:36] code 501, message Unsupported method ('CONNECT')
127.0.0.1 - - [18/Nov/2025 16:07:36] "CONNECT merino.services.mozilla.com:443 HTTP/1.1" 501 -
127.0.0.1 - - [18/Nov/2025 16:07:36] code 501, message Unsupported method ('CONNECT')
127.0.0.1

Keyboard interrupt
