In [1]:
# %% [markdown]
# # Machine Learning-based Web Application Firewall

# %% [markdown]
# ## Imports

# %%
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pycaret.classification import *
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from elasticsearch import Elasticsearch
import time
from urllib.parse import urlparse
import requests
from http.server import HTTPServer, BaseHTTPRequestHandler
from socketserver import ThreadingMixIn
import threading
import joblib
import streamlit as st

# %% [markdown]
# ## Data Loading and Preprocessing

# %%
class DataLoader:
    """Reads a CSV dataset and standardises its feature columns."""

    def __init__(self, file_path):
        """Remember the dataset location; nothing is read until requested."""
        self.file_path = file_path

    def load_and_preprocess_data(self):
        """Load the CSV and return it with the feature columns standardised.

        Every column except ``label`` and ``timestamp`` is run through a newly
        fitted ``StandardScaler``. Those two columns are then re-attached
        unchanged, in that order, after the scaled features.

        Returns:
            pandas.DataFrame: scaled features plus ``label`` and ``timestamp``.
        """
        raw = pd.read_csv(self.file_path)
        passthrough = ['label', 'timestamp']
        feature_frame = raw.drop(passthrough, axis=1)

        scaled_values = StandardScaler().fit_transform(feature_frame)
        result = pd.DataFrame(scaled_values, columns=feature_frame.columns)

        # Re-attach the columns that were deliberately kept out of the scaler.
        for column in passthrough:
            result[column] = raw[column]
        return result

# %% [markdown]
# ## Model Training

# %%
class ModelTrainer:
    """Trains, persists and restores the PyCaret intrusion classifier."""

    def __init__(self):
        # Fitted pipeline; stays None until train_model() or load_model() runs.
        self.model = None

    def train_model(self, df):
        """Run the PyCaret workflow on ``df`` and keep the finalized model.

        The experiment is set up with ``label`` as the target and
        ``timestamp`` ignored, candidate models are compared by AUC, the best
        one is tuned, and the tuned model is finalized.

        Args:
            df: Training frame containing a ``label`` column and a
                ``timestamp`` column.

        Returns:
            The finalized model, which is also stored on ``self.model``.
        """
        import inspect

        setup_options = dict(
            data=df,
            target='label',
            session_id=123,
            normalize=True,
            transformation=True,
            ignore_features=['timestamp'],
            use_gpu=True,
        )
        # `silent` suppresses the interactive dtype confirmation in PyCaret 2.x
        # but was removed in 3.x, where passing it raises TypeError. Only pass
        # it when the installed `setup` still accepts it.
        if 'silent' in inspect.signature(setup).parameters:
            setup_options['silent'] = True
        setup(**setup_options)

        best_model = compare_models(sort='AUC')
        tuned_model = tune_model(best_model)
        self.model = finalize_model(tuned_model)
        return self.model

    def save_model(self, path):
        """Persist the current model through PyCaret.

        Args:
            path: Model name handed to PyCaret's ``save_model``. PyCaret adds
                the ``.pkl`` extension itself, so pass the name without it.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError(
                "No trained model to save; call train_model() or load_model() first"
            )
        # Inside a method the bare name resolves to the module-level PyCaret
        # function, not to this method.
        save_model(self.model, path)

    def load_model(self, path):
        """Load a previously saved PyCaret model into ``self.model``.

        Args:
            path: Model name handed to PyCaret's ``load_model``.
        """
        self.model = load_model(path)

# %% [markdown]
# ## Clustering

# %%
class Clusterer:
    """K-means grouping of the preprocessed request features."""

    def __init__(self, n_clusters=5):
        """Store the requested cluster count; no estimator exists yet."""
        self.kmeans = None
        self.n_clusters = n_clusters

    def perform_clustering(self, df):
        """Fit K-means on the feature columns and label every row.

        The ``label`` and ``timestamp`` columns are excluded from the fit. The
        resulting assignments are written into a ``cluster`` column on ``df``
        itself, so the caller's frame is modified, and the same frame is
        returned.
        """
        feature_matrix = df.drop(columns=['label', 'timestamp'])
        estimator = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.kmeans = estimator
        df['cluster'] = estimator.fit_predict(feature_matrix)
        return df

    def save_model(self, path):
        """Write the fitted estimator to ``path`` with joblib."""
        joblib.dump(self.kmeans, path)

    def load_model(self, path):
        """Restore an estimator from ``path`` with joblib."""
        # joblib files are pickle-based: only load files from a trusted source.
        self.kmeans = joblib.load(path)

# %% [markdown]
# ## Visualization

# %%
class Visualizer:
    """Plotting helpers for the clustered dataset."""

    @staticmethod
    def visualize_anomalies_clusters(df):
        """Scatter ``feature1`` against ``feature2`` and highlight anomalies.

        All rows are drawn coloured by their ``cluster`` value; rows whose
        ``label`` equals 1 are drawn again as red crosses.

        Returns:
            The matplotlib figure that holds the plot.
        """
        fig, ax = plt.subplots(figsize=(12, 8))

        cluster_points = ax.scatter(
            df['feature1'],
            df['feature2'],
            c=df['cluster'],
            cmap='viridis',
            alpha=0.7,
        )

        # Select the anomalous rows once and reuse them for both axes.
        anomalies = df[df['label'] == 1]
        ax.scatter(
            anomalies['feature1'],
            anomalies['feature2'],
            color='red',
            marker='x',
            s=100,
            label='Anomaly',
        )

        ax.set_title('Anomalies in Clusters')
        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')
        ax.legend()
        plt.colorbar(cluster_points, label='Cluster')
        return fig

# %% [markdown]
# ## Elasticsearch Logger

# %%
class ElasticsearchLogger:
    """Writes one Elasticsearch document per inspected request."""

    def __init__(self, es_host='localhost', es_port=9200, index_name='waf_logs', es_scheme='http'):
        """Create the Elasticsearch client.

        Args:
            es_host: Elasticsearch host name.
            es_port: Elasticsearch port.
            index_name: Index that receives the log documents.
            es_scheme: URL scheme of the node (``'http'`` or ``'https'``).
        """
        # elasticsearch-py 8.x rejects host dicts without an explicit scheme;
        # 7.x accepts the key as well, so always include it.
        self.es = Elasticsearch([{'host': es_host, 'port': es_port, 'scheme': es_scheme}])
        self.index_name = index_name

    def log_request(self, request_data, is_intrusion):
        """Index a summary of the request together with the verdict.

        Args:
            request_data: Mapping that provides at least ``'method'`` and
                ``'path'``.
            is_intrusion: Verdict for the request; stored as a plain ``bool``.
        """
        doc = {
            'timestamp': time.time(),
            'method': request_data['method'],
            'path': request_data['path'],
            # The verdict may arrive as a numpy boolean; store a builtin bool
            # so the document is always JSON-serialisable.
            'is_intrusion': bool(is_intrusion)
        }
        self.es.index(index=self.index_name, body=doc)

# %% [markdown]
# ## DDoS Protection

# %%
class DDoSProtection:
    """Sliding-window request counter keyed by client IP."""

    def __init__(self, time_window=60, request_limit=100):
        """Configure the limiter.

        Args:
            time_window: Length of the sliding window in seconds.
            request_limit: Number of requests allowed inside one window.
        """
        self.time_window = time_window
        self.request_limit = request_limit
        self.request_log = {}
        # The proxy serves each request on its own thread, so every update of
        # request_log has to be serialised or timestamps can be lost.
        self._lock = threading.Lock()

    def is_ddos_attack(self, client_ip):
        """Record a request from ``client_ip`` and report whether it is over the limit.

        Timestamps older than ``time_window`` seconds are discarded, the
        current request is recorded, and the remaining count is compared with
        ``request_limit``.

        Returns:
            bool: True when the client has made more than ``request_limit``
            requests within the window, counting this one.
        """
        with self._lock:
            current_time = time.time()
            recent = [
                t for t in self.request_log.get(client_ip, ())
                if current_time - t < self.time_window
            ]
            recent.append(current_time)
            self.request_log[client_ip] = recent
            return len(recent) > self.request_limit

# %% [markdown]
# ## URL Filtering

# %%
def is_malicious_url(url):
    """Return True when the URL's host is on the built-in blocklist.

    The comparison uses the parsed host name rather than the raw network
    location, so a port (``malware.com:8080``), userinfo
    (``user@malware.com``), upper-case letters or a trailing dot cannot be
    used to slip past the list. URLs without a host, such as a bare request
    path, are never considered malicious.

    Args:
        url: Absolute URL or request path to check.

    Returns:
        bool: True if the host exactly matches a blocklisted domain.
    """
    malicious_domains = {'malware.com', 'phishing.com', 'spam.com'}
    # `hostname` is already lower-cased and stripped of port and credentials.
    hostname = urlparse(url).hostname
    if not hostname:
        return False
    return hostname.rstrip('.') in malicious_domains

# %% [markdown]
# ## Intrusion Detection

# %%
def detect_intrusion(request_data, model):
    """Score a single request with the model and report the verdict.

    A one-row frame is built from the request (its method plus the lengths of
    the path, the header mapping and the body) and passed to PyCaret's
    ``predict_model``. The request counts as an intrusion when the returned
    ``prediction_label`` equals 1.

    Args:
        request_data: Mapping with ``'method'``, ``'path'``, ``'headers'`` and
            ``'body'`` entries.
        model: Model object accepted by ``predict_model``.

    Returns:
        Truthy when the predicted label is 1.
    """
    row = {
        'method': request_data['method'],
        'path_length': len(request_data['path']),
        'header_count': len(request_data['headers']),
        'body_length': len(request_data['body']),
    }
    scored = predict_model(model, data=pd.DataFrame([row]))
    predicted_label = scored['prediction_label'][0]
    return predicted_label == 1

# %% [markdown]
# ## Proxy Server

# %%
class MLProxyHandler(BaseHTTPRequestHandler):
    """Request handler that screens traffic before forwarding it upstream.

    Each request is rate-limited per client IP, scored by the intrusion
    model, logged, checked against the URL blocklist, and only then relayed
    to the target server. The ``model``, ``es_logger`` and
    ``ddos_protection`` collaborators are supplied as keyword arguments.
    """

    # Upstream response headers that must not be copied to the client as-is.
    # The first eight are hop-by-hop headers. Content-Encoding and
    # Content-Length describe the upstream wire format, which no longer
    # matches the body once `requests` has de-chunked and decompressed it.
    _SKIPPED_RESPONSE_HEADERS = frozenset({
        'connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization',
        'te', 'trailers', 'transfer-encoding', 'upgrade',
        'content-encoding', 'content-length',
    })

    def __init__(self, *args, **kwargs):
        # The base constructor processes the request immediately, so the
        # collaborators have to be attached before delegating to it.
        self.model = kwargs.pop('model')
        self.es_logger = kwargs.pop('es_logger')
        self.ddos_protection = kwargs.pop('ddos_protection')
        super().__init__(*args, **kwargs)

    def do_METHOD(self):
        """Handle one request; shared by every supported HTTP verb.

        Responds with 429 when the client exceeds the rate limit, 400 when
        Content-Length is not a non-negative integer, 403 when the request is
        classified as an intrusion or targets a blocklisted host, and 500
        when forwarding fails. Otherwise the upstream response is relayed.
        """
        client_ip = self.client_address[0]
        if self.ddos_protection.is_ddos_attack(client_ip):
            self.send_error(429, "Too Many Requests")
            return

        # A malformed length would raise, and a negative one would make
        # rfile.read() wait for end-of-stream, so reject both up front.
        try:
            content_length = int(self.headers.get('Content-Length', 0))
        except (TypeError, ValueError):
            content_length = -1
        if content_length < 0:
            self.send_error(400, "Invalid Content-Length")
            return

        method = self.command
        path = self.path
        body = self.rfile.read(content_length)

        request_data = {
            'method': method,
            'path': path,
            'headers': dict(self.headers),
            # Bodies are not guaranteed to be UTF-8 (e.g. file uploads);
            # substitute undecodable bytes instead of failing the request.
            'body': body.decode('utf-8', errors='replace')
        }

        is_intrusion = detect_intrusion(request_data, self.model)
        self.es_logger.log_request(request_data, is_intrusion)

        if is_intrusion or is_malicious_url(path):
            self.send_error(403, "Potential intrusion detected")
        else:
            try:
                response = requests.request(
                    method=method,
                    url=f"http://localhost:8080{path}",  # Replace with your target server
                    headers=request_data['headers'],
                    data=body,  # forward the original bytes, not the decoded text
                    timeout=5
                )
                payload = response.content
                self.send_response(response.status_code)
                for header, value in response.headers.items():
                    if header.lower() in self._SKIPPED_RESPONSE_HEADERS:
                        continue
                    self.send_header(header, value)
                # 1xx, 204 and 304 responses carry no body, so they get no
                # length; everything else is framed by the decoded payload.
                if response.status_code >= 200 and response.status_code not in (204, 304):
                    self.send_header('Content-Length', str(len(payload)))
                self.end_headers()
                self.wfile.write(payload)
            except Exception as e:
                self.send_error(500, str(e))

    do_GET = do_POST = do_PUT = do_DELETE = do_METHOD

class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
    """HTTP server that serves each incoming request on its own thread."""

# %% [markdown]
# ## Main Function

# %%
def main(data_path="/kaggle/input/demo123", host='localhost', port=8000):
    """Train the models, plot the clusters, then serve the filtering proxy.

    Args:
        data_path: Location of the CSV dataset handed to ``DataLoader``.
            Replace the default with your dataset path.
        host: Interface the proxy listens on.
        port: TCP port the proxy listens on.

    The call blocks in ``serve_forever`` until the server is interrupted.
    """
    print("Machine Learning-based Web Application Firewall")

    data_loader = DataLoader(data_path)
    df = data_loader.load_and_preprocess_data()

    model_trainer = ModelTrainer()
    model = model_trainer.train_model(df)
    # PyCaret appends ".pkl" itself; passing "waf_model.pkl" would create
    # "waf_model.pkl.pkl".
    model_trainer.save_model("waf_model")

    clusterer = Clusterer()
    df_clustered = clusterer.perform_clustering(df)
    clusterer.save_model("kmeans_model.pkl")

    print("Anomalies in Clusters")
    Visualizer.visualize_anomalies_clusters(df_clustered)
    plt.show()

    es_logger = ElasticsearchLogger()
    ddos_protection = DDoSProtection()

    print("Proxy Server")
    # The server instantiates the handler itself, so the shared collaborators
    # are bound through a small factory.
    handler = lambda *args, **kwargs: MLProxyHandler(
        *args, model=model, es_logger=es_logger, ddos_protection=ddos_protection, **kwargs
    )
    server = ThreadedHTTPServer((host, port), handler)
    print(f"Proxy server is running on http://{host}:{port}")
    try:
        server.serve_forever()
    finally:
        # Release the listening socket even when the run is interrupted, so
        # the cell can be re-run without "Address already in use".
        server.server_close()

# %% [markdown]
# ## Run the Application

# %%
# Entry point: run the full pipeline only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'pycaret'

In [2]:
pip install pycaret


[0m[31mERROR: Could not find a version that satisfies the requirement pycaret (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pycaret[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
