In [2]:
!pip uninstall -y xgboost
!pip install --no-cache-dir xgboost


Found existing installation: xgboost 3.0.0
Uninstalling xgboost-3.0.0:
  Successfully uninstalled xgboost-3.0.0
Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0


In [3]:
# Diagnostic: print the versions of the `python` and `pip` executables that the
# shell resolves. NOTE(review): these are shell lookups and may not be the
# interpreter running this kernel -- compare with sys.executable.
!python --version
!pip --version


Python 3.11.7
pip 23.3.1 from /opt/anaconda3/lib/python3.11/site-packages/pip (python 3.11)


In [4]:
import sys
# Invoke pip through sys.executable so the package is installed for the
# interpreter that is running this kernel.
!{sys.executable} -m pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.4


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import urllib.parse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from transformers import BertTokenizer, TFBertForSequenceClassification
import torch

def load_datasets():
    """Read the five per-class URL CSV files and stack them into one frame.

    Each file is read from the current working directory, tagged with a
    constant ``Label`` column naming its class, and the results are
    concatenated with a fresh 0..n-1 index.

    Malformed rows (wrong number of fields) are skipped instead of aborting
    the load.

    Returns:
        pandas.DataFrame: all rows from all files plus the ``Label`` column.
        The remaining columns are whatever headers the CSV files provide;
        downstream code presumably expects a ``URL`` column -- verify against
        the actual files.

    Raises:
        FileNotFoundError: if any of the CSV files is missing.
    """
    file_paths = {
        "Benign": "Benign_list_big_final.csv",
        "Defacement": "DefacementSitesURLFiltered.csv",
        "Malware": "Malware_dataset.csv",
        "Phishing": "phishing_dataset.csv",
        "Spam": "spam_dataset.csv",
    }

    dataframes = []
    for label, path in file_paths.items():
        # `error_bad_lines=False` was removed from read_csv in pandas 2.0;
        # `on_bad_lines='skip'` is the supported way to drop malformed rows.
        df = pd.read_csv(path, encoding='utf-8', on_bad_lines='skip')
        df['Label'] = label
        dataframes.append(df)

    return pd.concat(dataframes, ignore_index=True)

def preprocess_data(df):
    """Return a cleaned copy of ``df`` without duplicate or incomplete rows.

    Exact duplicate rows are removed first, then every row containing at
    least one missing value is removed. The input frame is left untouched so
    the raw data stays available and re-running a cell gives the same result.
    Surviving rows keep their original index labels.

    Args:
        df (pandas.DataFrame): raw data.

    Returns:
        pandas.DataFrame: a new frame holding only unique, fully populated rows.
    """
    # Chained, non-inplace calls: no side effect on the caller's frame.
    return df.drop_duplicates().dropna()

def extract_features(df):
    """Add three character-count features derived from the ``URL`` column.

    The columns are written onto ``df`` itself, and the same object is
    returned:

    * ``url_length`` -- number of characters in the URL
    * ``num_digits`` -- number of characters for which ``str.isdigit`` is true
    * ``num_special_chars`` -- number of characters outside ``a-z``, ``A-Z``, ``0-9``

    Args:
        df (pandas.DataFrame): frame with a ``URL`` column of strings.

    Returns:
        pandas.DataFrame: ``df`` with the three feature columns added.
    """
    non_alnum = re.compile(r'[^a-zA-Z0-9]')

    def count_digits(url):
        # One tick per digit character.
        return sum(1 for ch in url if ch.isdigit())

    def count_special(url):
        # One tick per non-alphanumeric (ASCII) character.
        return sum(1 for _ in non_alnum.finditer(url))

    urls = df['URL']
    df['url_length'] = urls.apply(len)
    df['num_digits'] = urls.apply(count_digits)
    df['num_special_chars'] = urls.apply(count_special)
    return df

def train_ml_models(X_train, X_test, y_train, y_test, random_state=42):
    """Fit RandomForest, SVM and XGBoost classifiers and score each on the test split.

    Labels are integer-encoded before fitting because ``XGBClassifier``
    requires class labels of the form 0..n_classes-1 and rejects string labels
    such as the ``Label`` values produced by ``load_datasets``. Predictions
    are decoded back to the original labels before the metrics are computed,
    so reports and confusion matrices are expressed in the caller's labels.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix.
        y_train: training labels (any hashable, sortable values).
        y_test: test labels, in the same label space as ``y_train``.
        random_state (int | None): seed passed to the stochastic estimators
            (RandomForest, XGBoost) so repeated runs are reproducible.

    Returns:
        dict: model name -> dict with keys ``'Accuracy'`` (float),
        ``'Confusion Matrix'`` (array) and ``'Classification Report'`` (str).
    """
    # Local import keeps this function self-contained; scikit-learn is already
    # a dependency of this file.
    from sklearn.preprocessing import LabelEncoder

    # Fit the encoder on the training labels only, so the encoded classes are
    # exactly 0..k-1 with no gaps (which is what XGBoost validates).
    encoder = LabelEncoder()
    y_train_encoded = encoder.fit_transform(y_train)

    models = {
        'RandomForest': RandomForestClassifier(random_state=random_state),
        'SVM': SVC(),
        'XGBoost': XGBClassifier(random_state=random_state)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train_encoded)
        # Map integer predictions back to the original label values.
        y_pred = encoder.inverse_transform(model.predict(X_test))
        results[name] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Confusion Matrix': confusion_matrix(y_test, y_pred),
            'Classification Report': classification_report(y_test, y_pred)
        }

    return results

def train_lstm(X_train, y_train, X_test, y_test):
    """Build, compile and train a small Embedding -> LSTM binary classifier.

    Architecture: an embedding layer (vocabulary 5000, 64 dimensions,
    sequence length 100), spatial dropout of 0.2, one 100-unit LSTM with 0.2
    input and recurrent dropout, and a single sigmoid output unit. Training
    runs for 5 epochs with batch size 64, using the test split as validation
    data.

    NOTE(review): the single sigmoid unit with binary cross-entropy implies
    0/1 targets, and the embedding implies integer token ids below 5000 in
    sequences of length 100 -- confirm the inputs are prepared that way
    before calling this.

    Args:
        X_train: training sequences.
        y_train: training targets.
        X_test: validation sequences.
        y_test: validation targets.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=5,
        batch_size=64,
    )
    return model

def visualize_results(results):
    """Print each model's scores and draw its confusion matrix as a heatmap.

    For every entry the accuracy is printed, followed by the classification
    report when one is present. The confusion matrix is drawn on its own
    figure, so successive heatmaps never pile up on one shared axes, and the
    figure is closed after it has been shown to release its memory.

    Args:
        results (dict): model name -> dict with ``'Accuracy'`` and
            ``'Confusion Matrix'`` entries and, optionally,
            ``'Classification Report'`` (the shape returned by
            ``train_ml_models``).
    """
    for model_name, res in results.items():
        print(f'\n{model_name} Performance:')
        print(f"Accuracy: {res['Accuracy']:.2f}")

        # The report is computed upstream; show it when available.
        report = res.get('Classification Report')
        if report is not None:
            print(report)

        # Explicit figure/axes per model instead of the implicit current axes.
        fig, ax = plt.subplots()
        sns.heatmap(res['Confusion Matrix'], annot=True, fmt='d', ax=ax)
        ax.set_title(f'{model_name} Confusion Matrix')
        # Matrices from sklearn's confusion_matrix have true labels on rows
        # and predicted labels on columns.
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        plt.show()
        plt.close(fig)

def main():
    """Run the whole pipeline: load, clean, featurize, split, train, visualize.

    The three character-count features are used as inputs and the ``Label``
    column as the target. 20% of the rows are held out for testing, with a
    fixed split seed of 42.
    """
    feature_columns = ['url_length', 'num_digits', 'num_special_chars']

    # Load -> clean -> add features.
    data = extract_features(preprocess_data(load_datasets()))

    features = data[feature_columns]
    labels = data['Label']
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    visualize_results(train_ml_models(X_train, X_test, y_train, y_test))

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

